1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI 4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG 5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10 6; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL 7; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX11 8 9declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone 10declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone 11declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone 12 13declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone 14declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone 15declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone 16 17declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone 18declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone 19declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone 20 21declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 22 23define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { 24; SI-LABEL: s_ctlz_i32: 25; SI: ; %bb.0: 26; SI-NEXT: s_load_dword s2, s[0:1], 0xb 27; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 28; SI-NEXT: s_mov_b32 s3, 0xf000 29; SI-NEXT: s_waitcnt lgkmcnt(0) 30; SI-NEXT: s_flbit_i32_b32 s2, s2 31; SI-NEXT: s_min_u32 s4, s2, 32 32; SI-NEXT: s_mov_b32 s2, -1 33; SI-NEXT: v_mov_b32_e32 v0, s4 34; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 35; SI-NEXT: s_endpgm 36; 37; VI-LABEL: s_ctlz_i32: 38; VI: ; %bb.0: 
39; VI-NEXT: s_load_dword s4, s[0:1], 0x2c 40; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 41; VI-NEXT: s_mov_b32 s3, 0xf000 42; VI-NEXT: s_mov_b32 s2, -1 43; VI-NEXT: s_waitcnt lgkmcnt(0) 44; VI-NEXT: s_flbit_i32_b32 s4, s4 45; VI-NEXT: s_min_u32 s4, s4, 32 46; VI-NEXT: v_mov_b32_e32 v0, s4 47; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 48; VI-NEXT: s_endpgm 49; 50; EG-LABEL: s_ctlz_i32: 51; EG: ; %bb.0: 52; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 53; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 54; EG-NEXT: CF_END 55; EG-NEXT: PAD 56; EG-NEXT: ALU clause starting at 4: 57; EG-NEXT: FFBH_UINT * T0.W, KC0[2].Z, 58; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W, 59; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 60; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 61; 62; GFX10-LABEL: s_ctlz_i32: 63; GFX10: ; %bb.0: 64; GFX10-NEXT: s_clause 0x1 65; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c 66; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 67; GFX10-NEXT: v_mov_b32_e32 v0, 0 68; GFX10-NEXT: s_waitcnt lgkmcnt(0) 69; GFX10-NEXT: s_flbit_i32_b32 s0, s4 70; GFX10-NEXT: s_min_u32 s0, s0, 32 71; GFX10-NEXT: v_mov_b32_e32 v1, s0 72; GFX10-NEXT: global_store_dword v0, v1, s[2:3] 73; GFX10-NEXT: s_endpgm 74; 75; GFX10-GISEL-LABEL: s_ctlz_i32: 76; GFX10-GISEL: ; %bb.0: 77; GFX10-GISEL-NEXT: s_clause 0x1 78; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c 79; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 80; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 81; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 82; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 83; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 84; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 85; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] 86; GFX10-GISEL-NEXT: s_endpgm 87; 88; GFX11-LABEL: s_ctlz_i32: 89; GFX11: ; %bb.0: 90; GFX11-NEXT: s_clause 0x1 91; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c 92; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 93; GFX11-NEXT: s_waitcnt lgkmcnt(0) 94; GFX11-NEXT: s_clz_i32_u32 s2, s2 95; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 96; GFX11-NEXT: s_min_u32 s2, s2, 32 97; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 98; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 99; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 100; GFX11-NEXT: s_endpgm 101 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 102 store i32 %ctlz, i32 addrspace(1)* %out, align 4 103 ret void 104} 105 106define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 107; SI-LABEL: v_ctlz_i32: 108; SI: ; %bb.0: 109; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 110; SI-NEXT: s_mov_b32 s3, 0xf000 111; SI-NEXT: s_mov_b32 s6, 0 112; SI-NEXT: s_mov_b32 s7, s3 113; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 114; SI-NEXT: v_mov_b32_e32 v1, 0 115; SI-NEXT: s_waitcnt lgkmcnt(0) 116; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 117; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 118; SI-NEXT: s_mov_b32 s2, -1 119; SI-NEXT: s_waitcnt vmcnt(0) 120; SI-NEXT: v_ffbh_u32_e32 v0, v0 121; SI-NEXT: v_min_u32_e32 v0, 32, v0 122; SI-NEXT: s_waitcnt lgkmcnt(0) 123; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 124; SI-NEXT: s_endpgm 125; 126; VI-LABEL: v_ctlz_i32: 127; VI: ; %bb.0: 128; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 129; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 130; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 131; VI-NEXT: s_waitcnt lgkmcnt(0) 132; VI-NEXT: v_mov_b32_e32 v1, s3 133; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 134; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 135; VI-NEXT: flat_load_dword v0, v[0:1] 136; VI-NEXT: s_mov_b32 s3, 0xf000 137; VI-NEXT: s_mov_b32 s2, -1 138; VI-NEXT: s_waitcnt vmcnt(0) 139; VI-NEXT: v_ffbh_u32_e32 v0, v0 140; VI-NEXT: v_min_u32_e32 v0, 32, v0 141; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 142; VI-NEXT: s_endpgm 143; 144; EG-LABEL: v_ctlz_i32: 145; EG: ; %bb.0: 146; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 147; EG-NEXT: TEX 0 @6 148; 
EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 149; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 150; EG-NEXT: CF_END 151; EG-NEXT: PAD 152; EG-NEXT: Fetch clause starting at 6: 153; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 154; EG-NEXT: ALU clause starting at 8: 155; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 156; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 157; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 158; EG-NEXT: ALU clause starting at 11: 159; EG-NEXT: FFBH_UINT * T0.W, T0.X, 160; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 161; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 162; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 163; 164; GFX10-LABEL: v_ctlz_i32: 165; GFX10: ; %bb.0: 166; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 167; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 168; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 169; GFX10-NEXT: v_mov_b32_e32 v1, 0 170; GFX10-NEXT: s_waitcnt lgkmcnt(0) 171; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 172; GFX10-NEXT: s_waitcnt vmcnt(0) 173; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 174; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 175; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 176; GFX10-NEXT: s_endpgm 177; 178; GFX10-GISEL-LABEL: v_ctlz_i32: 179; GFX10-GISEL: ; %bb.0: 180; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 181; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 182; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 183; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 184; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 185; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 186; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 187; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 188; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 189; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 190; GFX10-GISEL-NEXT: s_endpgm 191; 192; GFX11-LABEL: v_ctlz_i32: 193; GFX11: ; %bb.0: 194; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 195; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 196; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 197; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) 198; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 199; GFX11-NEXT: s_waitcnt vmcnt(0) 200; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 201; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 202; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 203; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 204; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 205; GFX11-NEXT: s_endpgm 206 %tid = call i32 @llvm.amdgcn.workitem.id.x() 207 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 208 %val = load i32, i32 addrspace(1)* %in.gep, align 4 209 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 210 store i32 %ctlz, i32 addrspace(1)* %out, align 4 211 ret void 212} 213 214define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { 215; SI-LABEL: v_ctlz_v2i32: 216; SI: ; %bb.0: 217; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 218; SI-NEXT: s_mov_b32 s3, 0xf000 219; SI-NEXT: s_mov_b32 s6, 0 220; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 221; SI-NEXT: v_mov_b32_e32 v1, 0 222; SI-NEXT: s_mov_b32 s7, s3 223; SI-NEXT: s_waitcnt lgkmcnt(0) 224; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 225; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 226; SI-NEXT: s_mov_b32 s2, -1 227; SI-NEXT: s_waitcnt vmcnt(0) 228; SI-NEXT: v_ffbh_u32_e32 v1, v1 229; SI-NEXT: v_ffbh_u32_e32 v0, v0 230; SI-NEXT: v_min_u32_e32 v1, 32, v1 231; SI-NEXT: v_min_u32_e32 v0, 32, v0 232; SI-NEXT: s_waitcnt lgkmcnt(0) 233; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 234; SI-NEXT: s_endpgm 235; 236; VI-LABEL: v_ctlz_v2i32: 237; VI: ; %bb.0: 238; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 239; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 240; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 241; VI-NEXT: s_waitcnt lgkmcnt(0) 242; VI-NEXT: v_mov_b32_e32 v1, s3 243; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 244; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 245; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 246; VI-NEXT: s_mov_b32 
s3, 0xf000 247; VI-NEXT: s_mov_b32 s2, -1 248; VI-NEXT: s_waitcnt vmcnt(0) 249; VI-NEXT: v_ffbh_u32_e32 v1, v1 250; VI-NEXT: v_ffbh_u32_e32 v0, v0 251; VI-NEXT: v_min_u32_e32 v1, 32, v1 252; VI-NEXT: v_min_u32_e32 v0, 32, v0 253; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 254; VI-NEXT: s_endpgm 255; 256; EG-LABEL: v_ctlz_v2i32: 257; EG: ; %bb.0: 258; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 259; EG-NEXT: TEX 0 @6 260; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] 261; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 262; EG-NEXT: CF_END 263; EG-NEXT: PAD 264; EG-NEXT: Fetch clause starting at 6: 265; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 266; EG-NEXT: ALU clause starting at 8: 267; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 268; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 269; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 270; EG-NEXT: ALU clause starting at 11: 271; EG-NEXT: FFBH_UINT * T0.W, T0.Y, 272; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 273; EG-NEXT: FFBH_UINT * T0.W, T0.X, 274; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 275; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 276; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 277; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 278; 279; GFX10-LABEL: v_ctlz_v2i32: 280; GFX10: ; %bb.0: 281; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 282; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 283; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 284; GFX10-NEXT: v_mov_b32_e32 v2, 0 285; GFX10-NEXT: s_waitcnt lgkmcnt(0) 286; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 287; GFX10-NEXT: s_waitcnt vmcnt(0) 288; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 289; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 290; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 291; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 292; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 293; GFX10-NEXT: s_endpgm 294; 295; GFX10-GISEL-LABEL: v_ctlz_v2i32: 296; GFX10-GISEL: ; %bb.0: 297; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 298; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 
299; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 300; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 301; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 302; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 303; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 304; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 305; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 306; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 307; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 308; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 309; GFX10-GISEL-NEXT: s_endpgm 310; 311; GFX11-LABEL: v_ctlz_v2i32: 312; GFX11: ; %bb.0: 313; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 314; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 315; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 316; GFX11-NEXT: v_mov_b32_e32 v2, 0 317; GFX11-NEXT: s_waitcnt lgkmcnt(0) 318; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] 319; GFX11-NEXT: s_waitcnt vmcnt(0) 320; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 321; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 322; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 323; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 324; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 325; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 326; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 327; GFX11-NEXT: s_endpgm 328 %tid = call i32 @llvm.amdgcn.workitem.id.x() 329 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid 330 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 331 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone 332 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 333 ret void 334} 335 336define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { 337; SI-LABEL: v_ctlz_v4i32: 338; SI: ; %bb.0: 339; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 340; SI-NEXT: s_mov_b32 s3, 0xf000 341; SI-NEXT: s_mov_b32 s6, 0 342; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 343; 
SI-NEXT: v_mov_b32_e32 v1, 0 344; SI-NEXT: s_mov_b32 s7, s3 345; SI-NEXT: s_waitcnt lgkmcnt(0) 346; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 347; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 348; SI-NEXT: s_mov_b32 s2, -1 349; SI-NEXT: s_waitcnt vmcnt(0) 350; SI-NEXT: v_ffbh_u32_e32 v3, v3 351; SI-NEXT: v_ffbh_u32_e32 v2, v2 352; SI-NEXT: v_ffbh_u32_e32 v1, v1 353; SI-NEXT: v_ffbh_u32_e32 v0, v0 354; SI-NEXT: v_min_u32_e32 v3, 32, v3 355; SI-NEXT: v_min_u32_e32 v2, 32, v2 356; SI-NEXT: v_min_u32_e32 v1, 32, v1 357; SI-NEXT: v_min_u32_e32 v0, 32, v0 358; SI-NEXT: s_waitcnt lgkmcnt(0) 359; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 360; SI-NEXT: s_endpgm 361; 362; VI-LABEL: v_ctlz_v4i32: 363; VI: ; %bb.0: 364; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 365; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 366; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 367; VI-NEXT: s_waitcnt lgkmcnt(0) 368; VI-NEXT: v_mov_b32_e32 v1, s3 369; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 370; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 371; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 372; VI-NEXT: s_mov_b32 s3, 0xf000 373; VI-NEXT: s_mov_b32 s2, -1 374; VI-NEXT: s_waitcnt vmcnt(0) 375; VI-NEXT: v_ffbh_u32_e32 v3, v3 376; VI-NEXT: v_ffbh_u32_e32 v2, v2 377; VI-NEXT: v_ffbh_u32_e32 v1, v1 378; VI-NEXT: v_ffbh_u32_e32 v0, v0 379; VI-NEXT: v_min_u32_e32 v3, 32, v3 380; VI-NEXT: v_min_u32_e32 v2, 32, v2 381; VI-NEXT: v_min_u32_e32 v1, 32, v1 382; VI-NEXT: v_min_u32_e32 v0, 32, v0 383; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 384; VI-NEXT: s_endpgm 385; 386; EG-LABEL: v_ctlz_v4i32: 387; EG: ; %bb.0: 388; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 389; EG-NEXT: TEX 0 @6 390; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 391; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 392; EG-NEXT: CF_END 393; EG-NEXT: PAD 394; EG-NEXT: Fetch clause starting at 6: 395; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 396; EG-NEXT: ALU clause starting at 8: 397; EG-NEXT: LSHL * T0.W, 
T0.X, literal.x, 398; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 399; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 400; EG-NEXT: ALU clause starting at 11: 401; EG-NEXT: FFBH_UINT * T1.W, T0.W, 402; EG-NEXT: FFBH_UINT T2.W, T0.Z, 403; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122 404; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 405; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W, 406; EG-NEXT: FFBH_UINT * T1.W, T0.Y, 407; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 408; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 409; EG-NEXT: FFBH_UINT * T1.W, T0.X, 410; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 411; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 412; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 413; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 414; 415; GFX10-LABEL: v_ctlz_v4i32: 416; GFX10: ; %bb.0: 417; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 418; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 419; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 420; GFX10-NEXT: v_mov_b32_e32 v4, 0 421; GFX10-NEXT: s_waitcnt lgkmcnt(0) 422; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 423; GFX10-NEXT: s_waitcnt vmcnt(0) 424; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 425; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 426; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 427; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 428; GFX10-NEXT: v_min_u32_e32 v3, 32, v3 429; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 430; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 431; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 432; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 433; GFX10-NEXT: s_endpgm 434; 435; GFX10-GISEL-LABEL: v_ctlz_v4i32: 436; GFX10-GISEL: ; %bb.0: 437; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 438; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 439; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 440; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 441; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 442; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 443; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 444; 
GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 445; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 446; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 447; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 448; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 449; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 450; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 451; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 452; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 453; GFX10-GISEL-NEXT: s_endpgm 454; 455; GFX11-LABEL: v_ctlz_v4i32: 456; GFX11: ; %bb.0: 457; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 458; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 459; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 460; GFX11-NEXT: v_mov_b32_e32 v4, 0 461; GFX11-NEXT: s_waitcnt lgkmcnt(0) 462; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] 463; GFX11-NEXT: s_waitcnt vmcnt(0) 464; GFX11-NEXT: v_clz_i32_u32_e32 v3, v3 465; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 466; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 467; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 468; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 469; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 470; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 471; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 472; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 473; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 474; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 475; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 476; GFX11-NEXT: s_endpgm 477 %tid = call i32 @llvm.amdgcn.workitem.id.x() 478 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid 479 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 480 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone 481 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 482 ret void 483} 484 485define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { 486; SI-LABEL: v_ctlz_i8: 487; SI: 
; %bb.0: 488; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 489; SI-NEXT: s_mov_b32 s3, 0xf000 490; SI-NEXT: s_mov_b32 s2, -1 491; SI-NEXT: s_mov_b32 s6, s2 492; SI-NEXT: s_mov_b32 s7, s3 493; SI-NEXT: s_waitcnt lgkmcnt(0) 494; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 495; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 496; SI-NEXT: s_waitcnt vmcnt(0) 497; SI-NEXT: v_ffbh_u32_e32 v0, v0 498; SI-NEXT: v_min_u32_e32 v0, 32, v0 499; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 500; SI-NEXT: s_waitcnt lgkmcnt(0) 501; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 502; SI-NEXT: s_endpgm 503; 504; VI-LABEL: v_ctlz_i8: 505; VI: ; %bb.0: 506; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 507; VI-NEXT: s_mov_b32 s3, 0xf000 508; VI-NEXT: s_mov_b32 s2, -1 509; VI-NEXT: s_mov_b32 s6, s2 510; VI-NEXT: s_mov_b32 s7, s3 511; VI-NEXT: s_waitcnt lgkmcnt(0) 512; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 513; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 514; VI-NEXT: s_waitcnt vmcnt(0) 515; VI-NEXT: v_ffbh_u32_e32 v0, v0 516; VI-NEXT: v_min_u32_e32 v0, 32, v0 517; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 518; VI-NEXT: v_add_u16_e32 v0, -8, v0 519; VI-NEXT: s_waitcnt lgkmcnt(0) 520; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 521; VI-NEXT: s_endpgm 522; 523; EG-LABEL: v_ctlz_i8: 524; EG: ; %bb.0: 525; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 526; EG-NEXT: TEX 0 @6 527; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] 528; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 529; EG-NEXT: CF_END 530; EG-NEXT: PAD 531; EG-NEXT: Fetch clause starting at 6: 532; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 533; EG-NEXT: ALU clause starting at 8: 534; EG-NEXT: MOV * T0.X, KC0[2].Z, 535; EG-NEXT: ALU clause starting at 9: 536; EG-NEXT: FFBH_UINT * T0.W, T0.X, 537; EG-NEXT: CNDE_INT T0.W, T0.X, literal.x, PV.W, 538; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 539; EG-NEXT: 32(4.484155e-44), 3(4.203895e-45) 540; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 541; EG-NEXT: -24(nan), 0(0.000000e+00) 542; EG-NEXT: AND_INT T0.W, 
PV.W, literal.x, 543; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 544; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 545; EG-NEXT: LSHL T0.X, PV.W, PS, 546; EG-NEXT: LSHL * T0.W, literal.x, PS, 547; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 548; EG-NEXT: MOV T0.Y, 0.0, 549; EG-NEXT: MOV * T0.Z, 0.0, 550; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 551; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 552; 553; GFX10-LABEL: v_ctlz_i8: 554; GFX10: ; %bb.0: 555; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 556; GFX10-NEXT: v_mov_b32_e32 v0, 0 557; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 558; GFX10-NEXT: s_waitcnt lgkmcnt(0) 559; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] 560; GFX10-NEXT: s_waitcnt vmcnt(0) 561; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 562; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 563; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1 564; GFX10-NEXT: v_add_nc_u16 v1, v1, -8 565; GFX10-NEXT: global_store_byte v0, v1, s[0:1] 566; GFX10-NEXT: s_endpgm 567; 568; GFX10-GISEL-LABEL: v_ctlz_i8: 569; GFX10-GISEL: ; %bb.0: 570; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 571; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 572; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 573; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 574; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 575; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 576; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 577; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 578; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 579; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] 580; GFX10-GISEL-NEXT: s_endpgm 581; 582; GFX11-LABEL: v_ctlz_i8: 583; GFX11: ; %bb.0: 584; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 585; GFX11-NEXT: v_mov_b32_e32 v0, 0 586; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 587; GFX11-NEXT: s_waitcnt lgkmcnt(0) 588; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] 589; GFX11-NEXT: s_waitcnt vmcnt(0) 590; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 591; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 592; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 593; GFX11-NEXT: v_add_nc_u32_e32 v1, -16, v1 594; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 595; GFX11-NEXT: v_add_nc_u16 v1, v1, -8 596; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] 597; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 598; GFX11-NEXT: s_endpgm 599 %val = load i8, i8 addrspace(1)* %valptr 600 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone 601 store i8 %ctlz, i8 addrspace(1)* %out 602 ret void 603} 604 605define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { 606; SI-LABEL: s_ctlz_i64: 607; SI: ; %bb.0: 608; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 609; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 610; SI-NEXT: s_mov_b32 s3, 0xf000 611; SI-NEXT: s_mov_b32 s2, -1 612; SI-NEXT: s_waitcnt lgkmcnt(0) 613; SI-NEXT: s_flbit_i32_b32 s4, s4 614; SI-NEXT: s_flbit_i32_b32 s5, s5 615; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf 616; SI-NEXT: v_mov_b32_e32 v0, s5 617; SI-NEXT: s_add_i32 s4, s4, 32 618; SI-NEXT: v_min3_u32 v0, s4, v0, 64 619; SI-NEXT: v_mov_b32_e32 v1, 0 620; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 621; SI-NEXT: s_endpgm 622; 623; VI-LABEL: s_ctlz_i64: 624; VI: ; %bb.0: 625; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c 626; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 627; VI-NEXT: s_mov_b32 s3, 0xf000 628; VI-NEXT: s_mov_b32 s2, -1 629; VI-NEXT: v_mov_b32_e32 v1, 0 630; VI-NEXT: s_waitcnt lgkmcnt(0) 631; VI-NEXT: s_flbit_i32_b32 s4, s4 632; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp 633; VI-NEXT: s_flbit_i32_b32 s4, s5 634; VI-NEXT: v_min3_u32 v0, v0, s4, 64 635; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 636; VI-NEXT: s_endpgm 637; 638; EG-LABEL: s_ctlz_i64: 639; EG: ; %bb.0: 640; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 641; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 642; EG-NEXT: CF_END 643; EG-NEXT: PAD 644; EG-NEXT: ALU clause starting at 4: 645; EG-NEXT: FFBH_UINT * T0.W, 
KC0[4].W, 646; EG-NEXT: CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W, 647; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 648; EG-NEXT: FFBH_UINT T1.W, KC0[5].X, 649; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 650; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 651; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W, 652; EG-NEXT: MOV T0.Y, 0.0, 653; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 654; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 655; 656; GFX10-LABEL: s_ctlz_i64: 657; GFX10: ; %bb.0: 658; GFX10-NEXT: s_clause 0x1 659; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c 660; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 661; GFX10-NEXT: v_mov_b32_e32 v1, 0 662; GFX10-NEXT: s_waitcnt lgkmcnt(0) 663; GFX10-NEXT: s_flbit_i32_b32 s0, s2 664; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp 665; GFX10-NEXT: s_flbit_i32_b32 s0, s3 666; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 667; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] 668; GFX10-NEXT: s_endpgm 669; 670; GFX10-GISEL-LABEL: s_ctlz_i64: 671; GFX10-GISEL: ; %bb.0: 672; GFX10-GISEL-NEXT: s_clause 0x1 673; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c 674; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 675; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 676; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 677; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] 678; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 679; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 680; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 681; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 682; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 683; GFX10-GISEL-NEXT: s_endpgm 684; 685; GFX11-LABEL: s_ctlz_i64: 686; GFX11: ; %bb.0: 687; GFX11-NEXT: s_clause 0x1 688; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c 689; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 690; GFX11-NEXT: v_mov_b32_e32 v1, 0 691; GFX11-NEXT: s_waitcnt lgkmcnt(0) 692; GFX11-NEXT: s_clz_i32_u32 s2, s2 693; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 694; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp 695; 
GFX11-NEXT: s_clz_i32_u32 s2, s3 696; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 697; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 698; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] 699; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 700; GFX11-NEXT: s_endpgm 701 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 702 store i64 %ctlz, i64 addrspace(1)* %out 703 ret void 704} 705 706define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { 707; SI-LABEL: s_ctlz_i64_trunc: 708; SI: ; %bb.0: 709; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 710; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 711; SI-NEXT: s_mov_b32 s3, 0xf000 712; SI-NEXT: s_mov_b32 s2, -1 713; SI-NEXT: s_waitcnt lgkmcnt(0) 714; SI-NEXT: s_flbit_i32_b32 s4, s4 715; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf 716; SI-NEXT: s_flbit_i32_b32 s5, s5 717; SI-NEXT: s_add_i32 s4, s4, 32 718; SI-NEXT: v_mov_b32_e32 v0, s5 719; SI-NEXT: v_min3_u32 v0, s4, v0, 64 720; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 721; SI-NEXT: s_endpgm 722; 723; VI-LABEL: s_ctlz_i64_trunc: 724; VI: ; %bb.0: 725; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 726; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 727; VI-NEXT: s_mov_b32 s3, 0xf000 728; VI-NEXT: s_mov_b32 s2, -1 729; VI-NEXT: s_waitcnt lgkmcnt(0) 730; VI-NEXT: s_flbit_i32_b32 s4, s4 731; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp 732; VI-NEXT: s_flbit_i32_b32 s4, s5 733; VI-NEXT: v_min3_u32 v0, v0, s4, 64 734; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 735; VI-NEXT: s_endpgm 736; 737; EG-LABEL: s_ctlz_i64_trunc: 738; EG: ; %bb.0: 739; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 740; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 741; EG-NEXT: CF_END 742; EG-NEXT: PAD 743; EG-NEXT: ALU clause starting at 4: 744; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W, 745; EG-NEXT: CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W, 746; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 747; EG-NEXT: FFBH_UINT T1.W, KC0[3].X, 748; EG-NEXT: 
ADD_INT * T0.W, PV.W, literal.x, 749; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 750; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W, 751; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 752; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 753; 754; GFX10-LABEL: s_ctlz_i64_trunc: 755; GFX10: ; %bb.0: 756; GFX10-NEXT: s_clause 0x1 757; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 758; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 759; GFX10-NEXT: v_mov_b32_e32 v1, 0 760; GFX10-NEXT: s_waitcnt lgkmcnt(0) 761; GFX10-NEXT: s_flbit_i32_b32 s0, s2 762; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp 763; GFX10-NEXT: s_flbit_i32_b32 s0, s3 764; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 765; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 766; GFX10-NEXT: s_endpgm 767; 768; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: 769; GFX10-GISEL: ; %bb.0: 770; GFX10-GISEL-NEXT: s_clause 0x1 771; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 772; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 773; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 774; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 775; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] 776; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 777; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 778; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] 779; GFX10-GISEL-NEXT: s_endpgm 780; 781; GFX11-LABEL: s_ctlz_i64_trunc: 782; GFX11: ; %bb.0: 783; GFX11-NEXT: s_clause 0x1 784; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 785; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 786; GFX11-NEXT: v_mov_b32_e32 v1, 0 787; GFX11-NEXT: s_waitcnt lgkmcnt(0) 788; GFX11-NEXT: s_clz_i32_u32 s2, s2 789; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 790; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp 791; GFX11-NEXT: s_clz_i32_u32 s2, s3 792; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 793; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 794; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 795; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 796; GFX11-NEXT: s_endpgm 797 %ctlz = call i64 
@llvm.ctlz.i64(i64 %val, i1 false) 798 %trunc = trunc i64 %ctlz to i32 799 store i32 %trunc, i32 addrspace(1)* %out 800 ret void 801} 802
; v_ctlz_i64: each work-item (index from llvm.amdgcn.workitem.id.x) loads one
; i64 from %in, applies llvm.ctlz.i64 with the i1 flag false, and stores the
; full i64 count to the same index of %out.
803define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { 804; SI-LABEL: v_ctlz_i64: 805; SI: ; %bb.0: 806; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 807; SI-NEXT: s_mov_b32 s7, 0xf000 808; SI-NEXT: s_mov_b32 s6, 0 809; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 810; SI-NEXT: v_mov_b32_e32 v1, 0 811; SI-NEXT: s_waitcnt lgkmcnt(0) 812; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 813; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 814; SI-NEXT: s_waitcnt vmcnt(0) 815; SI-NEXT: v_ffbh_u32_e32 v2, v2 816; SI-NEXT: v_min_u32_e32 v2, 0xffffffdf, v2 817; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 818; SI-NEXT: v_ffbh_u32_e32 v3, v3 819; SI-NEXT: v_min3_u32 v2, v2, v3, 64 820; SI-NEXT: v_mov_b32_e32 v3, v1 821; SI-NEXT: s_waitcnt lgkmcnt(0) 822; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 823; SI-NEXT: s_endpgm 824; 825; VI-LABEL: v_ctlz_i64: 826; VI: ; %bb.0: 827; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 828; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 829; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 830; VI-NEXT: v_mov_b32_e32 v2, 0 831; VI-NEXT: s_waitcnt lgkmcnt(0) 832; VI-NEXT: v_mov_b32_e32 v1, s3 833; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 834; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 835; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 836; VI-NEXT: v_mov_b32_e32 v4, s1 837; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 838; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 839; VI-NEXT: s_waitcnt vmcnt(0) 840; VI-NEXT: v_ffbh_u32_e32 v0, v0 841; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp 842; VI-NEXT: v_ffbh_u32_e32 v1, v1 843; VI-NEXT: v_min3_u32 v1, v0, v1, 64 844; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] 845; VI-NEXT: s_endpgm 846; 847; EG-LABEL: v_ctlz_i64: 848; EG: ; %bb.0: 849; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 850; EG-NEXT: TEX 0 @6 851;
EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 852; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 853; EG-NEXT: CF_END 854; EG-NEXT: PAD 855; EG-NEXT: Fetch clause starting at 6: 856; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 857; EG-NEXT: ALU clause starting at 8: 858; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 859; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 860; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 861; EG-NEXT: ALU clause starting at 11: 862; EG-NEXT: FFBH_UINT * T1.W, T0.X, 863; EG-NEXT: CNDE_INT * T1.W, T0.X, literal.x, PV.W, 864; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 865; EG-NEXT: FFBH_UINT T2.W, T0.Y, 866; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x, 867; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 868; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W, 869; EG-NEXT: MOV T0.Y, 0.0, 870; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 871; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 872; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 873; 874; GFX10-LABEL: v_ctlz_i64: 875; GFX10: ; %bb.0: 876; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 877; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 878; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 879; GFX10-NEXT: s_waitcnt lgkmcnt(0) 880; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 881; GFX10-NEXT: s_waitcnt vmcnt(0) 882; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 883; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 884; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 885; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 886; GFX10-NEXT: v_mov_b32_e32 v1, 0 887; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 888; GFX10-NEXT: s_endpgm 889; 890; GFX10-GISEL-LABEL: v_ctlz_i64: 891; GFX10-GISEL: ; %bb.0: 892; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 893; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 894; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 895; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 896; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 897; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 898; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 899;
GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 900; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 901; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 902; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 903; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 904; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 905; GFX10-GISEL-NEXT: s_endpgm 906; 907; GFX11-LABEL: v_ctlz_i64: 908; GFX11: ; %bb.0: 909; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 910; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 911; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 912; GFX11-NEXT: s_waitcnt lgkmcnt(0) 913; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 914; GFX11-NEXT: s_waitcnt vmcnt(0) 915; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 916; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 917; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 918; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 919; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 920; GFX11-NEXT: v_mov_b32_e32 v1, 0 921; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 922; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 923; GFX11-NEXT: s_endpgm 924 %tid = call i32 @llvm.amdgcn.workitem.id.x() 925 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid 926 %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid 927 %val = load i64, i64 addrspace(1)* %in.gep 928 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 929 store i64 %ctlz, i64 addrspace(1)* %out.gep 930 ret void 931} 932
; v_ctlz_i64_trunc: same load and llvm.ctlz.i64 call as v_ctlz_i64, but the
; count is truncated to i32 before the store, so %out is indexed as i32.
933define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { 934; SI-LABEL: v_ctlz_i64_trunc: 935; SI: ; %bb.0: 936; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 937; SI-NEXT: s_mov_b32 s7, 0xf000 938; SI-NEXT: s_mov_b32 s6, 0 939; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 940; SI-NEXT: v_mov_b32_e32 v2, 0 941; SI-NEXT: s_waitcnt lgkmcnt(0) 942; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 943; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 944; SI-NEXT: v_lshlrev_b32_e32 v1, 2,
v0 945; SI-NEXT: s_waitcnt vmcnt(0) 946; SI-NEXT: v_ffbh_u32_e32 v0, v3 947; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0 948; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 949; SI-NEXT: v_ffbh_u32_e32 v3, v4 950; SI-NEXT: v_min3_u32 v0, v0, v3, 64 951; SI-NEXT: s_waitcnt lgkmcnt(0) 952; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 953; SI-NEXT: s_endpgm 954; 955; VI-LABEL: v_ctlz_i64_trunc: 956; VI: ; %bb.0: 957; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 958; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 959; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 960; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 961; VI-NEXT: s_waitcnt lgkmcnt(0) 962; VI-NEXT: v_mov_b32_e32 v2, s3 963; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 964; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 965; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] 966; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 967; VI-NEXT: v_mov_b32_e32 v4, s1 968; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 969; VI-NEXT: s_waitcnt vmcnt(0) 970; VI-NEXT: v_ffbh_u32_e32 v0, v1 971; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp 972; VI-NEXT: v_ffbh_u32_e32 v1, v2 973; VI-NEXT: v_min3_u32 v0, v0, v1, 64 974; VI-NEXT: flat_store_dword v[3:4], v0 975; VI-NEXT: s_endpgm 976; 977; EG-LABEL: v_ctlz_i64_trunc: 978; EG: ; %bb.0: 979; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 980; EG-NEXT: TEX 0 @6 981; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 982; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 983; EG-NEXT: CF_END 984; EG-NEXT: PAD 985; EG-NEXT: Fetch clause starting at 6: 986; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 987; EG-NEXT: ALU clause starting at 8: 988; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 989; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 990; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W, 991; EG-NEXT: ALU clause starting at 11: 992; EG-NEXT: FFBH_UINT * T0.W, T1.X, 993; EG-NEXT: CNDE_INT * T0.W, T1.X, literal.x, PV.W, 994; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 995; EG-NEXT: LSHL T0.Z, T0.X, literal.x, 996; EG-NEXT: FFBH_UINT T1.W, T1.Y, 997; 
EG-NEXT: ADD_INT * T0.W, PV.W, literal.y, 998; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 999; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W, 1000; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z, 1001; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 1002; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1003; 1004; GFX10-LABEL: v_ctlz_i64_trunc: 1005; GFX10: ; %bb.0: 1006; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1007; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 1008; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1009; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1010; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 1012; GFX10-NEXT: s_waitcnt vmcnt(0) 1013; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 1014; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 1015; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 1016; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 1017; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1018; GFX10-NEXT: s_endpgm 1019; 1020; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: 1021; GFX10-GISEL: ; %bb.0: 1022; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1023; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 1024; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1025; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1026; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1027; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 1028; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1029; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 1030; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 1031; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 1032; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 1033; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 1034; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 1035; GFX10-GISEL-NEXT: s_endpgm 1036; 1037; GFX11-LABEL: v_ctlz_i64_trunc: 1038; GFX11: ; %bb.0: 1039; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1040; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 1041; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1042; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1043; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) 1044; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] 1045; GFX11-NEXT: s_waitcnt vmcnt(0) 1046; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 1047; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 1048; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1049; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 1050; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 1051; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1052; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1053; GFX11-NEXT: s_endpgm 1054 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1055 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid 1056 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid 1057 %val = load i64, i64 addrspace(1)* %in.gep 1058 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 1059 %trunc = trunc i64 %ctlz to i32 1060 store i32 %trunc, i32 addrspace(1)* %out.gep 1061 ret void 1062} 1063
; v_ctlz_i32_sel_eq_neg1: llvm.ctlz.i32 (i1 flag false) feeding
; `select (%val == 0), -1, %ctlz`; the result is stored to %out itself (no
; per-work-item offset). In the expectations for this kernel, the SI, VI,
; GFX10 and GFX11 runs contain no min, compare or select after the ffbh/clz
; instruction, while the GFX10-GISEL run still has a min, a compare and a
; cndmask, and the EG run has two CNDE_INT selects.
1064define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1065; SI-LABEL: v_ctlz_i32_sel_eq_neg1: 1066; SI: ; %bb.0: 1067; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1068; SI-NEXT: s_mov_b32 s3, 0xf000 1069; SI-NEXT: s_mov_b32 s6, 0 1070; SI-NEXT: s_mov_b32 s7, s3 1071; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1072; SI-NEXT: v_mov_b32_e32 v1, 0 1073; SI-NEXT: s_waitcnt lgkmcnt(0) 1074; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1075; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1076; SI-NEXT: s_mov_b32 s2, -1 1077; SI-NEXT: s_waitcnt vmcnt(0) 1078; SI-NEXT: v_ffbh_u32_e32 v0, v0 1079; SI-NEXT: s_waitcnt lgkmcnt(0) 1080; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1081; SI-NEXT: s_endpgm 1082; 1083; VI-LABEL: v_ctlz_i32_sel_eq_neg1: 1084; VI: ; %bb.0: 1085; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1086; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1087; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1088; VI-NEXT: s_waitcnt lgkmcnt(0) 1089; VI-NEXT: v_mov_b32_e32 v1, s3 1090; VI-NEXT:
v_add_u32_e32 v0, vcc, s2, v0 1091; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1092; VI-NEXT: flat_load_dword v0, v[0:1] 1093; VI-NEXT: s_mov_b32 s3, 0xf000 1094; VI-NEXT: s_mov_b32 s2, -1 1095; VI-NEXT: s_waitcnt vmcnt(0) 1096; VI-NEXT: v_ffbh_u32_e32 v0, v0 1097; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1098; VI-NEXT: s_endpgm 1099; 1100; EG-LABEL: v_ctlz_i32_sel_eq_neg1: 1101; EG: ; %bb.0: 1102; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1103; EG-NEXT: TEX 0 @6 1104; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 1105; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1106; EG-NEXT: CF_END 1107; EG-NEXT: PAD 1108; EG-NEXT: Fetch clause starting at 6: 1109; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1110; EG-NEXT: ALU clause starting at 8: 1111; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1112; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1113; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1114; EG-NEXT: ALU clause starting at 11: 1115; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1116; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1117; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1118; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1119; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1120; EG-NEXT: -1(nan), 2(2.802597e-45) 1121; 1122; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: 1123; GFX10: ; %bb.0: 1124; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1125; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1126; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1127; GFX10-NEXT: v_mov_b32_e32 v1, 0 1128; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1129; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1130; GFX10-NEXT: s_waitcnt vmcnt(0) 1131; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1132; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1133; GFX10-NEXT: s_endpgm 1134; 1135; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: 1136; GFX10-GISEL: ; %bb.0: 1137; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1138; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1139; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1140;
GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1141; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1142; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1143; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1144; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1145; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1146; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo 1147; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1148; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1149; GFX10-GISEL-NEXT: s_endpgm 1150; 1151; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: 1152; GFX11: ; %bb.0: 1153; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1154; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 1155; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1156; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1157; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1158; GFX11-NEXT: s_waitcnt vmcnt(0) 1159; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1160; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1161; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1162; GFX11-NEXT: s_endpgm 1163 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1164 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1165 %val = load i32, i32 addrspace(1)* %in.gep 1166 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1167 %cmp = icmp eq i32 %val, 0 1168 %sel = select i1 %cmp, i32 -1, i32 %ctlz 1169 store i32 %sel, i32 addrspace(1)* %out 1170 ret void 1171} 1172
; v_ctlz_i32_sel_ne_neg1: inverted form of v_ctlz_i32_sel_eq_neg1, written as
; `select (%val != 0), %ctlz, -1`.
1173define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1174; SI-LABEL: v_ctlz_i32_sel_ne_neg1: 1175; SI: ; %bb.0: 1176; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1177; SI-NEXT: s_mov_b32 s3, 0xf000 1178; SI-NEXT: s_mov_b32 s6, 0 1179; SI-NEXT: s_mov_b32 s7, s3 1180; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1181; SI-NEXT: v_mov_b32_e32 v1, 0 1182; SI-NEXT: s_waitcnt lgkmcnt(0) 1183; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1184; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1185; SI-NEXT:
s_mov_b32 s2, -1 1186; SI-NEXT: s_waitcnt vmcnt(0) 1187; SI-NEXT: v_ffbh_u32_e32 v0, v0 1188; SI-NEXT: s_waitcnt lgkmcnt(0) 1189; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1190; SI-NEXT: s_endpgm 1191; 1192; VI-LABEL: v_ctlz_i32_sel_ne_neg1: 1193; VI: ; %bb.0: 1194; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1195; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1196; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1197; VI-NEXT: s_waitcnt lgkmcnt(0) 1198; VI-NEXT: v_mov_b32_e32 v1, s3 1199; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1200; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1201; VI-NEXT: flat_load_dword v0, v[0:1] 1202; VI-NEXT: s_mov_b32 s3, 0xf000 1203; VI-NEXT: s_mov_b32 s2, -1 1204; VI-NEXT: s_waitcnt vmcnt(0) 1205; VI-NEXT: v_ffbh_u32_e32 v0, v0 1206; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1207; VI-NEXT: s_endpgm 1208; 1209; EG-LABEL: v_ctlz_i32_sel_ne_neg1: 1210; EG: ; %bb.0: 1211; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1212; EG-NEXT: TEX 0 @6 1213; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 1214; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1215; EG-NEXT: CF_END 1216; EG-NEXT: PAD 1217; EG-NEXT: Fetch clause starting at 6: 1218; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1219; EG-NEXT: ALU clause starting at 8: 1220; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1221; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1222; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1223; EG-NEXT: ALU clause starting at 11: 1224; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1225; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1226; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1227; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1228; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1229; EG-NEXT: -1(nan), 2(2.802597e-45) 1230; 1231; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: 1232; GFX10: ; %bb.0: 1233; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1234; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1235; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1236; GFX10-NEXT: v_mov_b32_e32 v1, 0 1237; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) 1238; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1239; GFX10-NEXT: s_waitcnt vmcnt(0) 1240; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1241; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1242; GFX10-NEXT: s_endpgm 1243; 1244; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: 1245; GFX10-GISEL: ; %bb.0: 1246; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1247; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1248; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1249; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1250; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1251; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1252; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1253; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 1254; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1255; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo 1256; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1257; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1258; GFX10-GISEL-NEXT: s_endpgm 1259; 1260; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: 1261; GFX11: ; %bb.0: 1262; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1263; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 1264; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1265; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1266; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1267; GFX11-NEXT: s_waitcnt vmcnt(0) 1268; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1269; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1270; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1271; GFX11-NEXT: s_endpgm 1272 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1273 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1274 %val = load i32, i32 addrspace(1)* %in.gep 1275 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1276 %cmp = icmp ne i32 %val, 0 1277 %sel = select i1 %cmp, i32 %ctlz, i32 -1 1278 store i32 %sel, i32 addrspace(1)* %out 1279 ret void 1280} 1281 1282; TODO: Should be able to eliminate select here as well. 
; v_ctlz_i32_sel_eq_bitwidth: llvm.ctlz.i32 (i1 flag false) feeding
; `select (%ctlz == 32), -1, %ctlz`, i.e. the compare is on the count rather
; than on the loaded value. Every run's expectations for this kernel still
; contain a compare and a conditional select after the count is computed;
; presumably that is the select the TODO wants eliminated.
1283define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1284; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1285; SI: ; %bb.0: 1286; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1287; SI-NEXT: s_mov_b32 s3, 0xf000 1288; SI-NEXT: s_mov_b32 s6, 0 1289; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1290; SI-NEXT: v_mov_b32_e32 v1, 0 1291; SI-NEXT: s_mov_b32 s7, s3 1292; SI-NEXT: s_waitcnt lgkmcnt(0) 1293; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1294; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1295; SI-NEXT: s_mov_b32 s2, -1 1296; SI-NEXT: s_waitcnt vmcnt(0) 1297; SI-NEXT: v_ffbh_u32_e32 v0, v0 1298; SI-NEXT: v_min_u32_e32 v0, 32, v0 1299; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1300; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1301; SI-NEXT: s_waitcnt lgkmcnt(0) 1302; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1303; SI-NEXT: s_endpgm 1304; 1305; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1306; VI: ; %bb.0: 1307; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1308; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1309; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1310; VI-NEXT: s_waitcnt lgkmcnt(0) 1311; VI-NEXT: v_mov_b32_e32 v1, s3 1312; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1313; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1314; VI-NEXT: flat_load_dword v0, v[0:1] 1315; VI-NEXT: s_mov_b32 s3, 0xf000 1316; VI-NEXT: s_mov_b32 s2, -1 1317; VI-NEXT: s_waitcnt vmcnt(0) 1318; VI-NEXT: v_ffbh_u32_e32 v0, v0 1319; VI-NEXT: v_min_u32_e32 v0, 32, v0 1320; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1321; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1322; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1323; VI-NEXT: s_endpgm 1324; 1325; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1326; EG: ; %bb.0: 1327; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1328; EG-NEXT: TEX 0 @6 1329; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] 1330; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1331; EG-NEXT: CF_END 1332; EG-NEXT: PAD 1333; EG-NEXT:
Fetch clause starting at 6: 1334; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1335; EG-NEXT: ALU clause starting at 8: 1336; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1337; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1338; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1339; EG-NEXT: ALU clause starting at 11: 1340; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1341; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1342; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1343; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x, 1344; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1345; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x, 1346; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1347; EG-NEXT: -1(nan), 2(2.802597e-45) 1348; 1349; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1350; GFX10: ; %bb.0: 1351; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1352; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1353; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1354; GFX10-NEXT: v_mov_b32_e32 v1, 0 1355; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1356; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1357; GFX10-NEXT: s_waitcnt vmcnt(0) 1358; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1359; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 1360; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1361; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1362; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1363; GFX10-NEXT: s_endpgm 1364; 1365; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1366; GFX10-GISEL: ; %bb.0: 1367; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1368; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1369; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1370; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1371; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1373; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1374; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 1375; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 1376; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 1377; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1378; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1379; GFX10-GISEL-NEXT: s_endpgm 1380; 1381; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1382; GFX11: ; %bb.0: 1383; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1384; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 1385; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1386; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1387; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1388; GFX11-NEXT: s_waitcnt vmcnt(0) 1389; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1390; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1391; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 1392; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1393; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1394; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1395; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1396; GFX11-NEXT: s_endpgm 1397 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1398 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1399 %val = load i32, i32 addrspace(1)* %in.gep 1400 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1401 %cmp = icmp eq i32 %ctlz, 32 1402 %sel = select i1 %cmp, i32 -1, i32 %ctlz 1403 store i32 %sel, i32 addrspace(1)* %out 1404 ret void 1405} 1406
; v_ctlz_i32_sel_ne_bitwidth: inverted form of v_ctlz_i32_sel_eq_bitwidth,
; written as `select (%ctlz != 32), %ctlz, -1`.
1407define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1408; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1409; SI: ; %bb.0: 1410; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1411; SI-NEXT: s_mov_b32 s3, 0xf000 1412; SI-NEXT: s_mov_b32 s6, 0 1413; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1414; SI-NEXT: v_mov_b32_e32 v1, 0 1415; SI-NEXT: s_mov_b32 s7, s3 1416; SI-NEXT: s_waitcnt lgkmcnt(0) 1417; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1418; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1419; SI-NEXT: s_mov_b32 s2, -1 1420; SI-NEXT: s_waitcnt vmcnt(0) 1421; SI-NEXT: v_ffbh_u32_e32 v0, v0 1422; SI-NEXT: v_min_u32_e32 v0, 32, v0 1423; SI-NEXT:
v_cmp_ne_u32_e32 vcc, 32, v0 1424; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1425; SI-NEXT: s_waitcnt lgkmcnt(0) 1426; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1427; SI-NEXT: s_endpgm 1428; 1429; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1430; VI: ; %bb.0: 1431; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1432; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1433; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1434; VI-NEXT: s_waitcnt lgkmcnt(0) 1435; VI-NEXT: v_mov_b32_e32 v1, s3 1436; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1437; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1438; VI-NEXT: flat_load_dword v0, v[0:1] 1439; VI-NEXT: s_mov_b32 s3, 0xf000 1440; VI-NEXT: s_mov_b32 s2, -1 1441; VI-NEXT: s_waitcnt vmcnt(0) 1442; VI-NEXT: v_ffbh_u32_e32 v0, v0 1443; VI-NEXT: v_min_u32_e32 v0, 32, v0 1444; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1445; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1446; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1447; VI-NEXT: s_endpgm 1448; 1449; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1450; EG: ; %bb.0: 1451; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1452; EG-NEXT: TEX 0 @6 1453; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] 1454; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1455; EG-NEXT: CF_END 1456; EG-NEXT: PAD 1457; EG-NEXT: Fetch clause starting at 6: 1458; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1459; EG-NEXT: ALU clause starting at 8: 1460; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1461; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1462; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1463; EG-NEXT: ALU clause starting at 11: 1464; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1465; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1466; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1467; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x, 1468; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1469; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W, 1470; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1471; EG-NEXT: -1(nan), 2(2.802597e-45) 1472; 1473; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: 
1474; GFX10: ; %bb.0: 1475; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1476; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1477; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1478; GFX10-NEXT: v_mov_b32_e32 v1, 0 1479; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1481; GFX10-NEXT: s_waitcnt vmcnt(0) 1482; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1483; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 1484; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1485; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1486; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1487; GFX10-NEXT: s_endpgm 1488; 1489; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1490; GFX10-GISEL: ; %bb.0: 1491; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1492; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1493; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1494; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1495; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1497; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1498; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 1499; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 1500; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1501; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1502; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1503; GFX10-GISEL-NEXT: s_endpgm 1504; 1505; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1506; GFX11: ; %bb.0: 1507; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1508; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 1509; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1510; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1511; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1512; GFX11-NEXT: s_waitcnt vmcnt(0) 1513; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1514; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1515; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 1516; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1517; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, 
v0, vcc_lo 1518; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1519; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1520; GFX11-NEXT: s_endpgm 1521 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1522 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1523 %val = load i32, i32 addrspace(1)* %in.gep 1524 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1525 %cmp = icmp ne i32 %ctlz, 32 1526 %sel = select i1 %cmp, i32 %ctlz, i32 -1 1527 store i32 %sel, i32 addrspace(1)* %out 1528 ret void 1529} 1530
; v_ctlz_i8_sel_eq_neg1: i8 version of v_ctlz_i32_sel_eq_neg1. Each work-item
; loads one byte from %valptr, applies llvm.ctlz.i8 (i1 flag false) and selects
; -1 when the byte is zero; the i8 result is stored to %out itself.
; NOTE(review): the GFX10-GISEL expectations subtract 24 from the 32-bit count
; before the select, whereas the SI, VI, GFX10 and GFX11 expectations
; byte-store the 32-bit ffbh/clz result with no adjustment - confirm that this
; difference is intended.
1531 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { 1532; SI-LABEL: v_ctlz_i8_sel_eq_neg1: 1533; SI: ; %bb.0: 1534; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1535; SI-NEXT: s_mov_b32 s3, 0xf000 1536; SI-NEXT: v_mov_b32_e32 v1, 0 1537; SI-NEXT: s_mov_b32 s6, 0 1538; SI-NEXT: s_mov_b32 s7, s3 1539; SI-NEXT: s_waitcnt lgkmcnt(0) 1540; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1541; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1542; SI-NEXT: s_mov_b32 s2, -1 1543; SI-NEXT: s_waitcnt vmcnt(0) 1544; SI-NEXT: v_ffbh_u32_e32 v0, v0 1545; SI-NEXT: s_waitcnt lgkmcnt(0) 1546; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1547; SI-NEXT: s_endpgm 1548; 1549; VI-LABEL: v_ctlz_i8_sel_eq_neg1: 1550; VI: ; %bb.0: 1551; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1552; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1553; VI-NEXT: s_waitcnt lgkmcnt(0) 1554; VI-NEXT: v_mov_b32_e32 v1, s3 1555; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1556; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1557; VI-NEXT: flat_load_ubyte v0, v[0:1] 1558; VI-NEXT: s_mov_b32 s3, 0xf000 1559; VI-NEXT: s_mov_b32 s2, -1 1560; VI-NEXT: s_waitcnt vmcnt(0) 1561; VI-NEXT: v_ffbh_u32_e32 v0, v0 1562; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1563; VI-NEXT: s_endpgm 1564; 1565; EG-LABEL: v_ctlz_i8_sel_eq_neg1: 1566; EG: ; %bb.0: 1567; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1568; EG-NEXT: TEX 0 @6
1569; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1570; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1571; EG-NEXT: CF_END 1572; EG-NEXT: PAD 1573; EG-NEXT: Fetch clause starting at 6: 1574; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1575; EG-NEXT: ALU clause starting at 8: 1576; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1577; EG-NEXT: ALU clause starting at 9: 1578; EG-NEXT: FFBH_UINT T0.W, T0.X, 1579; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1580; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1581; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1582; EG-NEXT: LSHL * T1.W, PS, literal.y, 1583; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1584; EG-NEXT: LSHL T0.X, PV.W, PS, 1585; EG-NEXT: LSHL * T0.W, literal.x, PS, 1586; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1587; EG-NEXT: MOV T0.Y, 0.0, 1588; EG-NEXT: MOV * T0.Z, 0.0, 1589; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1590; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1591; 1592; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: 1593; GFX10: ; %bb.0: 1594; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1595; GFX10-NEXT: v_mov_b32_e32 v1, 0 1596; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1597; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1598; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 1599; GFX10-NEXT: s_waitcnt vmcnt(0) 1600; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1601; GFX10-NEXT: global_store_byte v1, v0, s[0:1] 1602; GFX10-NEXT: s_endpgm 1603; 1604; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: 1605; GFX10-GISEL: ; %bb.0: 1606; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1607; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 1608; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1609; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1610; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 1611; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 1612; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 1613; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo 1614; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off 1615; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1616;
GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1617; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1618; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1619; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 1620; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo 1621; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1622; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] 1623; GFX10-GISEL-NEXT: s_endpgm 1624; 1625; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: 1626; GFX11: ; %bb.0: 1627; GFX11-NEXT: s_clause 0x1 1628; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1629; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1630; GFX11-NEXT: v_mov_b32_e32 v1, 0 1631; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1632; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 1633; GFX11-NEXT: s_waitcnt vmcnt(0) 1634; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1635; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] 1636; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1637; GFX11-NEXT: s_endpgm 1638 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1639 %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid 1640 %val = load i8, i8 addrspace(1)* %valptr.gep 1641 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone 1642 %cmp = icmp eq i8 %val, 0 1643 %sel = select i1 %cmp, i8 -1, i8 %ctlz 1644 store i8 %sel, i8 addrspace(1)* %out 1645 ret void 1646} 1647 1648 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { 1649; SI-LABEL: v_ctlz_i16_sel_eq_neg1: 1650; SI: ; %bb.0: 1651; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1652; SI-NEXT: s_mov_b32 s3, 0xf000 1653; SI-NEXT: s_mov_b32 s2, -1 1654; SI-NEXT: s_mov_b32 s6, s2 1655; SI-NEXT: s_mov_b32 s7, s3 1656; SI-NEXT: s_waitcnt lgkmcnt(0) 1657; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 1658; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1659; SI-NEXT: s_waitcnt vmcnt(0) 1660; SI-NEXT: v_ffbh_u32_e32 v0, v0 1661; SI-NEXT: s_waitcnt lgkmcnt(0) 1662; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1663;
SI-NEXT: s_endpgm 1664; 1665; VI-LABEL: v_ctlz_i16_sel_eq_neg1: 1666; VI: ; %bb.0: 1667; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 1668; VI-NEXT: s_mov_b32 s3, 0xf000 1669; VI-NEXT: s_mov_b32 s2, -1 1670; VI-NEXT: s_mov_b32 s6, s2 1671; VI-NEXT: s_mov_b32 s7, s3 1672; VI-NEXT: s_waitcnt lgkmcnt(0) 1673; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 1674; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1675; VI-NEXT: v_mov_b32_e32 v1, 0xffff 1676; VI-NEXT: s_waitcnt vmcnt(0) 1677; VI-NEXT: v_ffbh_u32_e32 v2, v0 1678; VI-NEXT: v_min_u32_e32 v2, 32, v2 1679; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 1680; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1681; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 1682; VI-NEXT: s_waitcnt lgkmcnt(0) 1683; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1684; VI-NEXT: s_endpgm 1685; 1686; EG-LABEL: v_ctlz_i16_sel_eq_neg1: 1687; EG: ; %bb.0: 1688; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1689; EG-NEXT: TEX 0 @6 1690; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1691; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1692; EG-NEXT: CF_END 1693; EG-NEXT: PAD 1694; EG-NEXT: Fetch clause starting at 6: 1695; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1696; EG-NEXT: ALU clause starting at 8: 1697; EG-NEXT: MOV * T0.X, KC0[2].Z, 1698; EG-NEXT: ALU clause starting at 9: 1699; EG-NEXT: FFBH_UINT T0.W, T0.X, 1700; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1701; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1702; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1703; EG-NEXT: LSHL * T1.W, PS, literal.y, 1704; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1705; EG-NEXT: LSHL T0.X, PV.W, PS, 1706; EG-NEXT: LSHL * T0.W, literal.x, PS, 1707; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1708; EG-NEXT: MOV T0.Y, 0.0, 1709; EG-NEXT: MOV * T0.Z, 0.0, 1710; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1711; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1712; 1713; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: 1714; GFX10: ; %bb.0: 1715; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1716; GFX10-NEXT: 
v_mov_b32_e32 v0, 0 1717; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1718; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1719; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] 1720; GFX10-NEXT: s_waitcnt vmcnt(0) 1721; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 1722; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 1723; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 1724; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 1725; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo 1726; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1727; GFX10-NEXT: s_endpgm 1728; 1729; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: 1730; GFX10-GISEL: ; %bb.0: 1731; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1732; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 1733; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1734; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1735; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] 1736; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1737; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 1738; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1739; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 1740; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 1741; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 1742; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo 1743; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] 1744; GFX10-GISEL-NEXT: s_endpgm 1745; 1746; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: 1747; GFX11: ; %bb.0: 1748; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1749; GFX11-NEXT: v_mov_b32_e32 v0, 0 1750; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1751; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1752; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 1753; GFX11-NEXT: s_waitcnt vmcnt(0) 1754; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 1755; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 1756; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1757; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 1758; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2 1759; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1760; GFX11-NEXT: 
v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo 1761; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1762; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1763; GFX11-NEXT: s_endpgm 1764 %val = load i16, i16 addrspace(1)* %valptr 1765 %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone 1766 %cmp = icmp eq i16 %val, 0 1767 %sel = select i1 %cmp, i16 -1, i16 %ctlz 1768 store i16 %sel, i16 addrspace(1)* %out 1769 ret void 1770} 1771 1772; FIXME: Need to handle non-uniform case for function below (load without gep). 1773define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { 1774; SI-LABEL: v_ctlz_i7_sel_eq_neg1: 1775; SI: ; %bb.0: 1776; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1777; SI-NEXT: s_mov_b32 s3, 0xf000 1778; SI-NEXT: v_mov_b32_e32 v1, 0 1779; SI-NEXT: s_mov_b32 s6, 0 1780; SI-NEXT: s_mov_b32 s7, s3 1781; SI-NEXT: s_waitcnt lgkmcnt(0) 1782; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1783; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1784; SI-NEXT: s_mov_b32 s2, -1 1785; SI-NEXT: s_waitcnt vmcnt(0) 1786; SI-NEXT: v_ffbh_u32_e32 v0, v0 1787; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 1788; SI-NEXT: s_waitcnt lgkmcnt(0) 1789; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1790; SI-NEXT: s_endpgm 1791; 1792; VI-LABEL: v_ctlz_i7_sel_eq_neg1: 1793; VI: ; %bb.0: 1794; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1795; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1796; VI-NEXT: s_waitcnt lgkmcnt(0) 1797; VI-NEXT: v_mov_b32_e32 v1, s3 1798; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1799; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1800; VI-NEXT: flat_load_ubyte v0, v[0:1] 1801; VI-NEXT: s_mov_b32 s3, 0xf000 1802; VI-NEXT: s_mov_b32 s2, -1 1803; VI-NEXT: s_waitcnt vmcnt(0) 1804; VI-NEXT: v_ffbh_u32_e32 v0, v0 1805; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 1806; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1807; VI-NEXT: s_endpgm 1808; 1809; EG-LABEL: v_ctlz_i7_sel_eq_neg1: 1810; EG: ; %bb.0: 
1811; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1812; EG-NEXT: TEX 0 @6 1813; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1814; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1815; EG-NEXT: CF_END 1816; EG-NEXT: PAD 1817; EG-NEXT: Fetch clause starting at 6: 1818; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1819; EG-NEXT: ALU clause starting at 8: 1820; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1821; EG-NEXT: ALU clause starting at 9: 1822; EG-NEXT: FFBH_UINT T0.W, T0.X, 1823; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1824; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1825; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1826; EG-NEXT: LSHL * T1.W, PS, literal.y, 1827; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45) 1828; EG-NEXT: LSHL T0.X, PV.W, PS, 1829; EG-NEXT: LSHL * T0.W, literal.x, PS, 1830; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1831; EG-NEXT: MOV T0.Y, 0.0, 1832; EG-NEXT: MOV * T0.Z, 0.0, 1833; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1834; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1835; 1836; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: 1837; GFX10: ; %bb.0: 1838; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1839; GFX10-NEXT: v_mov_b32_e32 v1, 0 1840; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1841; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1842; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 1843; GFX10-NEXT: s_waitcnt vmcnt(0) 1844; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1845; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 1846; GFX10-NEXT: global_store_byte v1, v0, s[0:1] 1847; GFX10-NEXT: s_endpgm 1848; 1849; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: 1850; GFX10-GISEL: ; %bb.0: 1851; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1852; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 1853; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1854; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1855; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 1856; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 1857; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 1858; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, 
vcc_lo 1859; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off 1860; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1861; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 1862; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1863; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1864; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1865; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 1866; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo 1867; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1868; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 1869; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] 1870; GFX10-GISEL-NEXT: s_endpgm 1871; 1872; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: 1873; GFX11: ; %bb.0: 1874; GFX11-NEXT: s_clause 0x1 1875; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 1876; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1877; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1878; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 1879; GFX11-NEXT: s_waitcnt vmcnt(0) 1880; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1881; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1882; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 1883; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] 1884; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1885; GFX11-NEXT: s_endpgm 1886 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1887 %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid 1888 %val = load i7, i7 addrspace(1)* %valptr.gep 1889 %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone 1890 %cmp = icmp eq i7 %val, 0 1891 %sel = select i1 %cmp, i7 -1, i7 %ctlz 1892 store i7 %sel, i7 addrspace(1)* %out 1893 ret void 1894} 1895