; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL

; Overview: code generation tests for the llvm.bitreverse intrinsic on amdgcn.
; Each function is compiled four times (tahiti, tonga, fiji, and fiji with
; -global-isel); the tonga and fiji SelectionDAG runs share one check prefix.
; Functions named s_* take the value to reverse as a kernel argument, and the
; checks expect scalar s_brev_* instructions. Functions named v_* load the
; value from memory, and the checks expect vector v_bfrev_b32 instructions.
; The check lines are generated output: after changing any IR below, rerun
; utils/update_llc_test_checks.py instead of editing them by hand.

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
declare i64 @llvm.bitreverse.i64(i64) #1

; NOTE(review): the v4i32 and v4i64 declarations in this group are not called
; by any function visible in this file.
declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1

declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1

; Bit-reverses an i16 kernel argument and stores the result to %out.
; All runs expect a 32-bit s_brev_b32 followed by a logical right shift by 16;
; the global-isel run additionally expects the argument to be masked with
; 0xffff before the reverse.
define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
; SI-LABEL: s_brev_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_brev_b32 s4, s4
; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: s_brev_i16:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: s_brev_b32 s4, s4
; FLAT-NEXT: s_lshr_b32 s4, s4, 16
; FLAT-NEXT: v_mov_b32_e32 v0, s4
; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_and_b32 s2, s2, 0xffff
; GISEL-NEXT: s_brev_b32 s2, s2
; GISEL-NEXT: s_lshr_b32 s2, s2, 16
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v2, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_short v[0:1], v2
; GISEL-NEXT: s_endpgm
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

; Bit-reverses an i16 loaded directly from %valptr (no per-workitem indexing)
; and stores the result to %out. All runs expect a 16-bit load, v_bfrev_b32,
; and a right shift by 16.
define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfrev_b32_e32 v0, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: v_brev_i16:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_mov_b32 s6, s2
; FLAT-NEXT: s_mov_b32 s7, s3
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_waitcnt vmcnt(0)
; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: v_brev_i16:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_load_ushort v0, v[0:1]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_short v[0:1], v2
; GISEL-NEXT: s_endpgm
  %val = load i16, i16 addrspace(1)* %valptr
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

; Bit-reverses an i32 kernel argument and stores the result to %out.
; All runs expect a single s_brev_b32.
define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
; SI-LABEL: s_brev_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_brev_b32 s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: s_brev_i32:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: s_brev_b32 s4, s4
; FLAT-NEXT: v_mov_b32_e32 v0, s4
; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_brev_b32 s2, s2
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v2, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_dword v[0:1], v2
; GISEL-NEXT: s_endpgm
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

; Bit-reverses an i32 loaded from %valptr at an index given by the workitem
; id (x) and stores the result to %out. All runs expect a single v_bfrev_b32.
define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfrev_b32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: v_brev_i32:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s3
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dword v0, v[0:1]
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt vmcnt(0)
; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: v_brev_i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: flat_load_dword v0, v[0:1]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_dword v[0:1], v2
; GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

; Bit-reverses a <2 x i32> kernel argument element-wise and stores the result
; to %out. All runs expect two s_brev_b32 instructions, one per element.
define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
; SI-LABEL: s_brev_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_brev_b32 s5, s5
; SI-NEXT: s_brev_b32 s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: s_brev_v2i32:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: s_brev_b32 s5, s5
; FLAT-NEXT: s_brev_b32 s4, s4
; FLAT-NEXT: v_mov_b32_e32 v0, s4
; FLAT-NEXT: v_mov_b32_e32 v1, s5
; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_v2i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_brev_b32 s2, s2
; GISEL-NEXT: s_brev_b32 s3, s3
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v3, s1
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: v_mov_b32_e32 v2, s0
; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT: s_endpgm
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

; Bit-reverses a <2 x i32> loaded from %valptr at an index given by the
; workitem id (x) and stores the result to %out. All runs expect two
; v_bfrev_b32 instructions, one per element.
define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfrev_b32_e32 v1, v1
; SI-NEXT: v_bfrev_b32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: v_brev_v2i32:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s3
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt vmcnt(0)
; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: v_brev_v2i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v3, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

; Bit-reverses an i64 kernel argument and stores the result to %out.
; All runs expect a single 64-bit s_brev_b64.
define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
; SI-LABEL: s_brev_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_brev_b64 s[4:5], s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: s_brev_i64:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5]
; FLAT-NEXT: v_mov_b32_e32 v0, s4
; FLAT-NEXT: v_mov_b32_e32 v1, s5
; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i64:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v3, s1
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: v_mov_b32_e32 v2, s0
; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT: s_endpgm
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

; Bit-reverses an i64 loaded from %valptr at an index given by the workitem
; id (x) and stores the result to %out. All runs expect two v_bfrev_b32
; instructions whose results land in swapped halves of the stored register
; pair (the reversed low input dword becomes the high output dword).
define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfrev_b32_e32 v2, v0
; SI-NEXT: v_bfrev_b32_e32 v1, v1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: v_brev_i64:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s3
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt vmcnt(0)
; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: v_brev_i64:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v4, s1
; GISEL-NEXT: v_mov_b32_e32 v3, s0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
; GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
  %val = load i64, i64 addrspace(1)* %gep
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

; Bit-reverses a <2 x i64> kernel argument element-wise and stores the result
; to %out. All runs expect two s_brev_b64 instructions, one per element.
define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
; SI-LABEL: s_brev_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_brev_b64 s[6:7], s[6:7]
; SI-NEXT: s_brev_b64 s[4:5], s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: s_brev_v2i64:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: s_brev_b64 s[6:7], s[6:7]
; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5]
; FLAT-NEXT: v_mov_b32_e32 v0, s4
; FLAT-NEXT: v_mov_b32_e32 v1, s5
; FLAT-NEXT: v_mov_b32_e32 v2, s6
; FLAT-NEXT: v_mov_b32_e32 v3, s7
; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_v2i64:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5]
; GISEL-NEXT: s_brev_b64 s[2:3], s[6:7]
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_mov_b32_e32 v4, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s2
; GISEL-NEXT: v_mov_b32_e32 v3, s3
; GISEL-NEXT: v_mov_b32_e32 v5, s9
; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GISEL-NEXT: s_endpgm
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

; Bit-reverses a <2 x i64> loaded from %valptr at an index given by the
; workitem id (x) and stores the result to %out. All runs expect four
; v_bfrev_b32 instructions, with the two dwords of each i64 element swapped
; in the stored result.
define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfrev_b32_e32 v4, v2
; SI-NEXT: v_bfrev_b32_e32 v3, v3
; SI-NEXT: v_bfrev_b32_e32 v2, v0
; SI-NEXT: v_bfrev_b32_e32 v1, v1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: v_brev_v2i64:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s3
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; FLAT-NEXT: s_mov_b32 s3, 0xf000
; FLAT-NEXT: s_mov_b32 s2, -1
; FLAT-NEXT: s_waitcnt vmcnt(0)
; FLAT-NEXT: v_bfrev_b32_e32 v4, v2
; FLAT-NEXT: v_bfrev_b32_e32 v3, v3
; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: v_brev_v2i64:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b32_e32 v0, s2
; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v4, v1
; GISEL-NEXT: v_bfrev_b32_e32 v5, v0
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: v_bfrev_b32_e32 v6, v3
; GISEL-NEXT: v_bfrev_b32_e32 v7, v2
; GISEL-NEXT: v_mov_b32_e32 v1, s1
; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

; Non-kernel function: truncates %arg to i16, bit-reverses it, reinterprets
; the 16 bits as a half and extends that to float. All runs expect a 32-bit
; v_bfrev_b32 with the f16-to-f32 conversion reading the upper 16 bits of the
; result (an explicit shift by 16 on tahiti, an SDWA WORD_1 source select on
; the other runs).
; NOTE(review): judging by the name this is presumably a regression test for
; a missing truncate when the i16 bitreverse is promoted to 32 bits - confirm
; against the commit that added it.
define float @missing_truncate_promote_bitreverse(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bitreverse:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_bfrev_b32_e32 v0, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; FLAT-LABEL: missing_truncate_promote_bitreverse:
; FLAT: ; %bb.0: ; %bb
; FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
; FLAT-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; FLAT-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: missing_truncate_promote_bitreverse:
; GISEL: ; %bb.0: ; %bb
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
; GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GISEL-NEXT: s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

; #0 is applied to the test functions' definitions; #1 is applied to the
; intrinsic declarations and to most of the bitreverse call sites.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }