; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s

; Codegen tests for @llvm.ctlz.* called with the second operand set to i1 true.
; Four configurations are checked: default amdgcn (SI), tonga (VI),
; r600/cypress (EG) and gfx900 with -global-isel (GFX9-GISEL).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone

declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; i32 kernel argument: the amdgcn checks show a single s_flbit_i32_b32
; (FFBH_UINT on EG) feeding the store.
define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b32 s2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z,
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}

; i32 loaded from %valptr indexed by the workitem id: the amdgcn checks show a
; single v_ffbh_u32 (FFBH_UINT on EG) on the loaded value.
define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT T0.X, T0.X,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> variant: the checks show one find-first-bit instruction per element.
define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
; EG-NEXT: FFBH_UINT T0.X, T0.X,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> variant: the checks show one find-first-bit instruction per element.
define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v3, v3
; SI-NEXT: v_ffbh_u32_e32 v2, v2
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v3, v3
; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.W,
; EG-NEXT: FFBH_UINT * T0.Z, T0.Z,
; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
; EG-NEXT: FFBH_UINT T0.X, T0.X,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; i8 variant: the byte is loaded with an unsigned byte load, counted with a
; 32-bit find-first-bit, and the result is then reduced by 24 (SI/GFX9 subtract
; 24, EG adds -24, VI adds -16 and then -8).
define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT: v_add_u16_e32 v2, -8, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
  %val = load i8, i8 addrspace(1)* %in.gep
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
  store i8 %ctlz, i8 addrspace(1)* %out
  ret void
}

; i64 kernel argument. SI/VI count each 32-bit half with s_flbit_i32_b32, add
; 32 to the count of the first loaded half and take the unsigned minimum with
; the other; EG selects between the two counts with CNDE_INT; GFX9 GlobalISel
; uses s_flbit_i32_b64.
define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_flbit_i32_b32 s4, s4
; SI-NEXT: s_flbit_i32_b32 s5, s5
; SI-NEXT: s_add_i32 s4, s4, 32
; SI-NEXT: s_min_u32 s4, s4, s5
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b32 s2, s2
; VI-NEXT: s_flbit_i32_b32 s3, s3
; VI-NEXT: s_add_i32 s2, s2, 32
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W,
; EG-NEXT: FFBH_UINT T1.W, KC0[5].X,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W,
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
  store i64 %ctlz, i64 addrspace(1)* %out
  ret void
}

; i64 kernel argument with the count truncated to i32 before the store.
define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i64_trunc:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_flbit_i32_b32 s2, s4
; SI-NEXT: s_flbit_i32_b32 s4, s5
; SI-NEXT: s_add_i32 s2, s2, 32
; SI-NEXT: s_min_u32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b32 s2, s2
; VI-NEXT: s_flbit_i32_b32 s3, s3
; VI-NEXT: s_add_i32 s2, s2, 32
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i64_trunc:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; i64 loaded from %in and stored to %out, both indexed by the workitem id. The
; amdgcn checks count each 32-bit half with v_ffbh_u32, add 32 to one count and
; combine with v_min_u32.
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v2, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_ffbh_u32_e32 v3, v3
; SI-NEXT: v_min_u32_e32 v2, v2, v3
; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_min_u32_e32 v1, v0, v1
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T1.W, T0.X,
; EG-NEXT: FFBH_UINT T2.W, T0.Y,
; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W,
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
  store i64 %ctlz, i64 addrspace(1)* %out.gep
  ret void
}

; Same as v_ctlz_zero_undef_i64, but the count is truncated to i32 and stored
; through an i32 pointer indexed by the workitem id.
define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i64_trunc:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v3
; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT: v_ffbh_u32_e32 v3, v4
; SI-NEXT: v_min_u32_e32 v0, v0, v3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_ffbh_u32_e32 v1, v2
; VI-NEXT: v_min_u32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i64_trunc:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T1.X,
; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
; EG-NEXT: FFBH_UINT T1.W, T1.Y,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1
; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}

; select (%val == 0) ? -1 : ctlz(%val). The SI and VI checks contain only
; v_ffbh_u32 with no compare or select; EG (CNDE_INT) and GFX9 GlobalISel
; (v_cmp_eq_u32 + v_cndmask_b32) still emit the conditional.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; select (%val != 0) ? ctlz(%val) : -1, i.e. the previous test with the
; predicate inverted and the select operands swapped. The SI and VI checks again
; contain only v_ffbh_u32.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; NOTE(review): the SI and VI checks below store the raw v_ffbh_u32 result of
; the loaded byte with no width correction, whereas v_ctlz_zero_undef_i8
; subtracts 24. Confirm against the IR body of this function that this is the
; intended codegen.
define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:
v_ffbh_u32_e32 v1, v0 949; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1 950; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 951; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc 952; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 953; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] 954; GFX9-GISEL-NEXT: s_endpgm 955 %tid = call i32 @llvm.amdgcn.workitem.id.x() 956 %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid 957 %val = load i8, i8 addrspace(1)* %valptr.gep 958 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone 959 %cmp = icmp eq i8 %val, 0 960 %sel = select i1 %cmp, i8 -1, i8 %ctlz 961 store i8 %sel, i8 addrspace(1)* %out 962 ret void 963} 964 965define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 966; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 967; SI: ; %bb.0: 968; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 969; SI-NEXT: s_mov_b32 s3, 0xf000 970; SI-NEXT: s_mov_b32 s6, 0 971; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 972; SI-NEXT: v_mov_b32_e32 v1, 0 973; SI-NEXT: s_mov_b32 s7, s3 974; SI-NEXT: s_waitcnt lgkmcnt(0) 975; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 976; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 977; SI-NEXT: s_mov_b32 s2, -1 978; SI-NEXT: s_waitcnt vmcnt(0) 979; SI-NEXT: v_ffbh_u32_e32 v1, v0 980; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 981; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 982; SI-NEXT: s_waitcnt lgkmcnt(0) 983; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 984; SI-NEXT: s_waitcnt vmcnt(0) 985; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 986; SI-NEXT: s_waitcnt vmcnt(0) 987; SI-NEXT: s_endpgm 988; 989; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 990; VI: ; %bb.0: 991; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 992; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 993; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 994; VI-NEXT: s_waitcnt lgkmcnt(0) 995; VI-NEXT: v_mov_b32_e32 v1, s3 996; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v0 997; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 998; VI-NEXT: flat_load_dword v2, v[0:1] 999; VI-NEXT: v_mov_b32_e32 v0, s0 1000; VI-NEXT: v_mov_b32_e32 v1, s1 1001; VI-NEXT: s_waitcnt vmcnt(0) 1002; VI-NEXT: v_ffbh_u32_e32 v3, v2 1003; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1004; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 1005; VI-NEXT: flat_store_dword v[0:1], v3 1006; VI-NEXT: s_waitcnt vmcnt(0) 1007; VI-NEXT: flat_store_byte v[0:1], v2 1008; VI-NEXT: s_waitcnt vmcnt(0) 1009; VI-NEXT: s_endpgm 1010; 1011; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 1012; EG: ; %bb.0: 1013; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1014; EG-NEXT: TEX 0 @6 1015; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[] 1016; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 1017; EG-NEXT: MEM_RAT MSKOR T1.XW, T2.X 1018; EG-NEXT: CF_END 1019; EG-NEXT: Fetch clause starting at 6: 1020; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1021; EG-NEXT: ALU clause starting at 8: 1022; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1023; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1024; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1025; EG-NEXT: ALU clause starting at 11: 1026; EG-NEXT: SETE_INT * T0.W, T0.X, 0.0, 1027; EG-NEXT: AND_INT T1.X, PV.W, 1, 1028; EG-NEXT: MOV * T1.W, literal.x, 1029; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1030; EG-NEXT: MOV T1.Y, 0.0, 1031; EG-NEXT: MOV * T1.Z, 0.0, 1032; EG-NEXT: MOV T2.X, literal.x, 1033; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1034; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1035; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1036; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.y, 1037; EG-NEXT: -1(nan), 2(2.802597e-45) 1038; 1039; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 1040; GFX9-GISEL: ; %bb.0: 1041; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1042; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1043; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1044; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1045; GFX9-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) 1046; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1047; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1048; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 1049; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1050; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 1051; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 1052; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1053; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1054; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off 1055; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1056; GFX9-GISEL-NEXT: s_endpgm 1057 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1058 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1059 %val = load i32, i32 addrspace(1)* %in.gep 1060 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1061 %cmp = icmp eq i32 %val, 0 1062 %sel = select i1 %cmp, i32 -1, i32 %ctlz 1063 store volatile i32 %sel, i32 addrspace(1)* %out 1064 store volatile i1 %cmp, i1 addrspace(1)* undef 1065 ret void 1066} 1067 1068; Selected on wrong constant 1069define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1070; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1071; SI: ; %bb.0: 1072; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1073; SI-NEXT: s_mov_b32 s3, 0xf000 1074; SI-NEXT: s_mov_b32 s6, 0 1075; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1076; SI-NEXT: v_mov_b32_e32 v1, 0 1077; SI-NEXT: s_mov_b32 s7, s3 1078; SI-NEXT: s_waitcnt lgkmcnt(0) 1079; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1080; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1081; SI-NEXT: s_mov_b32 s2, -1 1082; SI-NEXT: s_waitcnt vmcnt(0) 1083; SI-NEXT: v_ffbh_u32_e32 v1, v0 1084; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1085; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1086; SI-NEXT: s_waitcnt lgkmcnt(0) 1087; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1088; SI-NEXT: s_endpgm 1089; 1090; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1091; VI: ; 
%bb.0: 1092; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1093; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1094; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1095; VI-NEXT: s_waitcnt lgkmcnt(0) 1096; VI-NEXT: v_mov_b32_e32 v1, s3 1097; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1098; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1099; VI-NEXT: flat_load_dword v0, v[0:1] 1100; VI-NEXT: s_waitcnt vmcnt(0) 1101; VI-NEXT: v_ffbh_u32_e32 v1, v0 1102; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1103; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc 1104; VI-NEXT: v_mov_b32_e32 v0, s0 1105; VI-NEXT: v_mov_b32_e32 v1, s1 1106; VI-NEXT: flat_store_dword v[0:1], v2 1107; VI-NEXT: s_endpgm 1108; 1109; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1110; EG: ; %bb.0: 1111; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1112; EG-NEXT: TEX 0 @6 1113; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 1114; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1115; EG-NEXT: CF_END 1116; EG-NEXT: PAD 1117; EG-NEXT: Fetch clause starting at 6: 1118; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1119; EG-NEXT: ALU clause starting at 8: 1120; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1121; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1122; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1123; EG-NEXT: ALU clause starting at 11: 1124; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1125; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W, 1126; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1127; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1128; 1129; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1130; GFX9-GISEL: ; %bb.0: 1131; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1132; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1133; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1134; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1135; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1136; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1137; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1138; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1139; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc 
1140; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1141; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1142; GFX9-GISEL-NEXT: s_endpgm 1143 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1144 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1145 %val = load i32, i32 addrspace(1)* %in.gep 1146 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1147 %cmp = icmp eq i32 %val, 0 1148 %sel = select i1 %cmp, i32 0, i32 %ctlz 1149 store i32 %sel, i32 addrspace(1)* %out 1150 ret void 1151} 1152 1153; Selected on wrong constant 1154define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1155; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1156; SI: ; %bb.0: 1157; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1158; SI-NEXT: s_mov_b32 s3, 0xf000 1159; SI-NEXT: s_mov_b32 s6, 0 1160; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1161; SI-NEXT: v_mov_b32_e32 v1, 0 1162; SI-NEXT: s_mov_b32 s7, s3 1163; SI-NEXT: s_waitcnt lgkmcnt(0) 1164; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1165; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1166; SI-NEXT: s_mov_b32 s2, -1 1167; SI-NEXT: s_waitcnt vmcnt(0) 1168; SI-NEXT: v_ffbh_u32_e32 v1, v0 1169; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1170; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1171; SI-NEXT: s_waitcnt lgkmcnt(0) 1172; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1173; SI-NEXT: s_endpgm 1174; 1175; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1176; VI: ; %bb.0: 1177; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1178; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1179; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1180; VI-NEXT: s_waitcnt lgkmcnt(0) 1181; VI-NEXT: v_mov_b32_e32 v1, s3 1182; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1183; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1184; VI-NEXT: flat_load_dword v0, v[0:1] 1185; VI-NEXT: s_waitcnt vmcnt(0) 1186; VI-NEXT: v_ffbh_u32_e32 v1, v0 1187; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1188; VI-NEXT: 
v_cndmask_b32_e32 v2, 0, v1, vcc 1189; VI-NEXT: v_mov_b32_e32 v0, s0 1190; VI-NEXT: v_mov_b32_e32 v1, s1 1191; VI-NEXT: flat_store_dword v[0:1], v2 1192; VI-NEXT: s_endpgm 1193; 1194; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1195; EG: ; %bb.0: 1196; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1197; EG-NEXT: TEX 0 @6 1198; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 1199; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1200; EG-NEXT: CF_END 1201; EG-NEXT: PAD 1202; EG-NEXT: Fetch clause starting at 6: 1203; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1204; EG-NEXT: ALU clause starting at 8: 1205; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1206; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1207; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1208; EG-NEXT: ALU clause starting at 11: 1209; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1210; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W, 1211; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1212; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1213; 1214; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1215; GFX9-GISEL: ; %bb.0: 1216; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1217; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1218; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1219; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1220; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1221; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1222; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1223; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1224; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1225; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1226; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1227; GFX9-GISEL-NEXT: s_endpgm 1228 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1229 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1230 %val = load i32, i32 addrspace(1)* %in.gep 1231 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1232 %cmp = icmp ne i32 %val, 0 1233 %sel = select i1 %cmp, i32 %ctlz, i32 0 1234 store i32 %sel, i32 addrspace(1)* 
%out 1235 ret void 1236} 1237 1238; Compare on wrong constant 1239define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1240; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 1241; SI: ; %bb.0: 1242; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1243; SI-NEXT: s_mov_b32 s3, 0xf000 1244; SI-NEXT: s_mov_b32 s6, 0 1245; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1246; SI-NEXT: v_mov_b32_e32 v1, 0 1247; SI-NEXT: s_mov_b32 s7, s3 1248; SI-NEXT: s_waitcnt lgkmcnt(0) 1249; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1250; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1251; SI-NEXT: s_mov_b32 s2, -1 1252; SI-NEXT: s_waitcnt vmcnt(0) 1253; SI-NEXT: v_ffbh_u32_e32 v1, v0 1254; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 1255; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1256; SI-NEXT: s_waitcnt lgkmcnt(0) 1257; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1258; SI-NEXT: s_endpgm 1259; 1260; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 1261; VI: ; %bb.0: 1262; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1263; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1264; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1265; VI-NEXT: s_waitcnt lgkmcnt(0) 1266; VI-NEXT: v_mov_b32_e32 v1, s3 1267; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1268; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1269; VI-NEXT: flat_load_dword v0, v[0:1] 1270; VI-NEXT: s_waitcnt vmcnt(0) 1271; VI-NEXT: v_ffbh_u32_e32 v1, v0 1272; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 1273; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc 1274; VI-NEXT: v_mov_b32_e32 v0, s0 1275; VI-NEXT: v_mov_b32_e32 v1, s1 1276; VI-NEXT: flat_store_dword v[0:1], v2 1277; VI-NEXT: s_endpgm 1278; 1279; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 1280; EG: ; %bb.0: 1281; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1282; EG-NEXT: TEX 0 @6 1283; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] 1284; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1285; EG-NEXT: CF_END 1286; EG-NEXT: PAD 
1287; EG-NEXT: Fetch clause starting at 6: 1288; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1289; EG-NEXT: ALU clause starting at 8: 1290; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1291; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1292; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1293; EG-NEXT: ALU clause starting at 11: 1294; EG-NEXT: FFBH_UINT T0.W, T0.X, 1295; EG-NEXT: SETE_INT * T1.W, T0.X, 1, 1296; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0, 1297; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1298; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1299; 1300; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 1301; GFX9-GISEL: ; %bb.0: 1302; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1303; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1304; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1305; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1306; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1307; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1308; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1309; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 1310; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc 1311; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1312; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1313; GFX9-GISEL-NEXT: s_endpgm 1314 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1315 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1316 %val = load i32, i32 addrspace(1)* %in.gep 1317 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1318 %cmp = icmp eq i32 %val, 1 1319 %sel = select i1 %cmp, i32 0, i32 %ctlz 1320 store i32 %sel, i32 addrspace(1)* %out 1321 ret void 1322} 1323 1324; Selected on wrong constant 1325define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1326; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 1327; SI: ; %bb.0: 1328; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1329; SI-NEXT: s_mov_b32 s3, 0xf000 1330; SI-NEXT: s_mov_b32 s6, 0 1331; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 1332; SI-NEXT: v_mov_b32_e32 v1, 0 1333; SI-NEXT: s_mov_b32 s7, s3 1334; SI-NEXT: s_waitcnt lgkmcnt(0) 1335; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1336; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1337; SI-NEXT: s_mov_b32 s2, -1 1338; SI-NEXT: s_waitcnt vmcnt(0) 1339; SI-NEXT: v_ffbh_u32_e32 v1, v0 1340; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 1341; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1342; SI-NEXT: s_waitcnt lgkmcnt(0) 1343; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1344; SI-NEXT: s_endpgm 1345; 1346; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 1347; VI: ; %bb.0: 1348; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1349; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1350; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1351; VI-NEXT: s_waitcnt lgkmcnt(0) 1352; VI-NEXT: v_mov_b32_e32 v1, s3 1353; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1354; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1355; VI-NEXT: flat_load_dword v0, v[0:1] 1356; VI-NEXT: s_waitcnt vmcnt(0) 1357; VI-NEXT: v_ffbh_u32_e32 v1, v0 1358; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 1359; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc 1360; VI-NEXT: v_mov_b32_e32 v0, s0 1361; VI-NEXT: v_mov_b32_e32 v1, s1 1362; VI-NEXT: flat_store_dword v[0:1], v2 1363; VI-NEXT: s_endpgm 1364; 1365; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 1366; EG: ; %bb.0: 1367; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1368; EG-NEXT: TEX 0 @6 1369; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] 1370; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1371; EG-NEXT: CF_END 1372; EG-NEXT: PAD 1373; EG-NEXT: Fetch clause starting at 6: 1374; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1375; EG-NEXT: ALU clause starting at 8: 1376; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1377; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1378; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1379; EG-NEXT: ALU clause starting at 11: 1380; EG-NEXT: FFBH_UINT T0.W, T0.X, 1381; EG-NEXT: SETNE_INT * T1.W, T0.X, 1, 1382; EG-NEXT: CNDE_INT 
T0.X, PS, 0.0, PV.W, 1383; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1384; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1385; 1386; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 1387; GFX9-GISEL: ; %bb.0: 1388; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1389; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1390; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1391; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1392; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1393; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1394; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1395; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 1396; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1397; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1398; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1399; GFX9-GISEL-NEXT: s_endpgm 1400 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1401 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1402 %val = load i32, i32 addrspace(1)* %in.gep 1403 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1404 %cmp = icmp ne i32 %val, 1 1405 %sel = select i1 %cmp, i32 %ctlz, i32 0 1406 store i32 %sel, i32 addrspace(1)* %out 1407 ret void 1408} 1409