; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL

; Checks lowering of llvm.bitreverse.{i16,i32,i64} and 2-element vector forms:
; uniform (SGPR) operands select s_brev_b32 / s_brev_b64, divergent (VGPR)
; operands select v_bfrev_b32; i16 is promoted to i32 and the reversed value
; shifted right by 16.

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
declare i64 @llvm.bitreverse.i64(i64) #1

declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1

declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1

define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
; SI-LABEL: s_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    s_lshr_b32 s0, s0, 16
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dword s0, s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
; GISEL-NEXT:    s_brev_b32 s0, s0
; GISEL-NEXT:    s_lshr_b32 s0, s0, 16
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_short v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_mov_b32 s2, s6
; FLAT-NEXT:    s_mov_b32 s3, s7
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    flat_load_ushort v0, v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_short v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(1)* %valptr
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
; SI-LABEL: s_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dword s0, s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    s_brev_b32 s0, s0
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_dword v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dword v0, v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dword v0, v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_dword v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
; SI-LABEL: s_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s1, s1
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s1, s1
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_v2i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    s_brev_b32 s0, s0
; GISEL-NEXT:    s_brev_b32 s1, s1
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_v2i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
; SI-LABEL: s_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v2, v0
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v4, s3
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v3, s2
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
; GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
  %val = load i64, i64 addrspace(1)* %gep
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
; SI-LABEL: s_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b64 s[2:3], s[2:3]
; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b64 s[2:3], s[2:3]
; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_mov_b32_e32 v2, s2
; FLAT-NEXT:    v_mov_b32_e32 v3, s3
; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_v2i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v4, s4
; GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    v_mov_b32_e32 v5, s5
; GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GISEL-NEXT:    s_endpgm
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v4, v2
; SI-NEXT:    v_bfrev_b32_e32 v3, v3
; SI-NEXT:    v_bfrev_b32_e32 v2, v0
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_v2i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
; GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
; GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

define float @missing_truncate_promote_bitreverse(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bitreverse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; FLAT-LABEL: missing_truncate_promote_bitreverse:
; FLAT:       ; %bb.0: ; %bb
; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; FLAT-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: missing_truncate_promote_bitreverse:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }