; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE(review): Do not hand-edit the CHECK lines below; regenerate them with
; utils/update_llc_test_checks.py so they byte-match llc output.
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,VI
; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,EG

declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone

declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; ctlz of a uniform (SGPR) i32 with defined-at-zero semantics (i1 false).
define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s0, s2
; SI-NEXT:    s_cmp_lg_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s0, s0, 32
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s1, s0
; VI-NEXT:    s_cmp_lg_u32 s0, 0
; VI-NEXT:    s_cselect_b32 s0, s1, 32
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}

; ctlz of a divergent (per-lane) i32 loaded from memory.
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %ctlz, i32 addrspace(1)* %out, align 4
  ret void
}

; Vector ctlz: <2 x i32> is scalarized into per-element ffbh + cndmask.
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v2, v1
; SI-NEXT:    v_ffbh_u32_e32 v3, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v1
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
; VI-NEXT:    v_ffbh_u32_e32 v3, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v3, vcc
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
  store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; Vector ctlz: <4 x i32> is scalarized into per-element ffbh + cndmask.
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v4, v3
; SI-NEXT:    v_ffbh_u32_e32 v5, v2
; SI-NEXT:    v_ffbh_u32_e32 v6, v1
; SI-NEXT:    v_ffbh_u32_e32 v7, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v4, v3
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT:    v_cndmask_b32_e32 v3, 32, v4, vcc
; VI-NEXT:    v_ffbh_u32_e32 v5, v2
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v5, vcc
; VI-NEXT:    v_ffbh_u32_e32 v6, v1
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, 32, v6, vcc
; VI-NEXT:    v_ffbh_u32_e32 v7, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v7, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
  store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; Sub-dword ctlz: i8 is promoted to a 32-bit ffbh and the result adjusted
; (subtract 24 on SI; -16 then -8 on VI via 16-bit ops).
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s2, s6
; VI-NEXT:    s_mov_b32 s3, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT:    v_add_u16_e32 v0, -8, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    -24(nan), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %val = load i8, i8 addrspace(1)* %valptr
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
  store i8 %ctlz, i8 addrspace(1)* %out
  ret void
}

; Uniform i64 ctlz: combines two 32-bit flbit results with a select on the
; high word, then selects 64 for an all-zero input.
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_ctlz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s0, s2
; SI-NEXT:    s_flbit_i32_b32 s1, s3
; SI-NEXT:    s_add_i32 s0, s0, 32
; SI-NEXT:    s_cmp_eq_u32 s3, 0
; SI-NEXT:    s_cselect_b32 s0, s0, s1
; SI-NEXT:    s_or_b32 s1, s2, s3
; SI-NEXT:    s_cmp_lg_u32 s1, 0
; SI-NEXT:    s_cselect_b32 s0, s0, 64
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s2, s0
; VI-NEXT:    s_add_i32 s2, s2, 32
; VI-NEXT:    s_flbit_i32_b32 s3, s1
; VI-NEXT:    s_cmp_eq_u32 s1, 0
; VI-NEXT:    s_cselect_b32 s2, s2, s3
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    s_cmp_lg_u32 s0, 0
; VI-NEXT:    s_cselect_b32 s0, s2, 64
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  store i64 %ctlz, i64 addrspace(1)* %out
  ret void
}

; Same as s_ctlz_i64 but only the truncated low 32 bits are stored.
define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; SI-LABEL: s_ctlz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s0, s2
; SI-NEXT:    s_flbit_i32_b32 s1, s3
; SI-NEXT:    s_add_i32 s0, s0, 32
; SI-NEXT:    s_cmp_eq_u32 s3, 0
; SI-NEXT:    s_cselect_b32 s0, s0, s1
; SI-NEXT:    s_or_b32 s1, s2, s3
; SI-NEXT:    s_cmp_lg_u32 s1, 0
; SI-NEXT:    s_cselect_b32 s0, s0, 64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s2, s0
; VI-NEXT:    s_add_i32 s2, s2, 32
; VI-NEXT:    s_flbit_i32_b32 s3, s1
; VI-NEXT:    s_cmp_eq_u32 s1, 0
; VI-NEXT:    s_cselect_b32 s2, s2, s3
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    s_cmp_lg_u32 s0, 0
; VI-NEXT:    s_cselect_b32 s0, s2, 64
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; Divergent i64 ctlz loaded/stored per lane.
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v4, v2
; SI-NEXT:    v_ffbh_u32_e32 v5, v3
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 64, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT:    v_mov_b32_e32 v4, 0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v3
; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v5, v0
; VI-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
; VI-NEXT:    v_ffbh_u32_e32 v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 64, v1, vcc
; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  store i64 %ctlz, i64 addrspace(1)* %out.gep
  ret void
}

; Divergent i64 ctlz where only the truncated low 32 bits are stored.
define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v3
; SI-NEXT:    v_ffbh_u32_e32 v5, v4
; SI-NEXT:    v_or_b32_e32 v3, v3, v4
; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT:    v_mov_b32_e32 v4, 0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v1
; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT:    v_ffbh_u32_e32 v5, v2
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; VI-NEXT:    v_or_b32_e32 v1, v1, v2
; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
; VI-NEXT:    flat_store_dword v[3:4], v0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}

; select(val == 0, -1, ctlz(val)) folds to a bare ffbh on GCN: ffbh already
; returns -1 (~0) for a zero input, so the compare/select is eliminated.
define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Inverted form of the previous test: select(val != 0, ctlz(val), -1) also
; folds to a bare ffbh on GCN.
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; TODO: Should be able to eliminate select here as well.
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %ctlz, 32
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:
buffer_store_dword v0, off, s[0:3], 0 908; SI-NEXT: s_endpgm 909; 910; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: 911; VI: ; %bb.0: 912; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 913; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 914; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 915; VI-NEXT: s_mov_b32 s7, 0xf000 916; VI-NEXT: s_mov_b32 s6, -1 917; VI-NEXT: s_waitcnt lgkmcnt(0) 918; VI-NEXT: v_mov_b32_e32 v1, s1 919; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 920; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 921; VI-NEXT: flat_load_dword v0, v[0:1] 922; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 923; VI-NEXT: v_ffbh_u32_e32 v1, v0 924; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 925; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc 926; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 927; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 928; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 929; VI-NEXT: s_endpgm 930; 931; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth: 932; EG: ; %bb.0: 933; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 934; EG-NEXT: TEX 0 @6 935; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] 936; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 937; EG-NEXT: CF_END 938; EG-NEXT: PAD 939; EG-NEXT: Fetch clause starting at 6: 940; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 941; EG-NEXT: ALU clause starting at 8: 942; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 943; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 944; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 945; EG-NEXT: ALU clause starting at 11: 946; EG-NEXT: FFBH_UINT * T0.W, T0.X, 947; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 948; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 949; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x, 950; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 951; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W, 952; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 953; EG-NEXT: -1(nan), 2(2.802597e-45) 954 %tid = call i32 @llvm.amdgcn.workitem.id.x() 955 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 956 %val = load i32, i32 addrspace(1)* %in.gep 
957 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 958 %cmp = icmp ne i32 %ctlz, 32 959 %sel = select i1 %cmp, i32 %ctlz, i32 -1 960 store i32 %sel, i32 addrspace(1)* %out 961 ret void 962} 963 964 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { 965; SI-LABEL: v_ctlz_i8_sel_eq_neg1: 966; SI: ; %bb.0: 967; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 968; SI-NEXT: s_mov_b32 s3, 0xf000 969; SI-NEXT: v_mov_b32_e32 v1, 0 970; SI-NEXT: s_mov_b32 s6, 0 971; SI-NEXT: s_mov_b32 s7, s3 972; SI-NEXT: s_waitcnt lgkmcnt(0) 973; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 974; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 975; SI-NEXT: s_mov_b32 s2, -1 976; SI-NEXT: s_waitcnt vmcnt(0) 977; SI-NEXT: v_ffbh_u32_e32 v0, v0 978; SI-NEXT: s_waitcnt lgkmcnt(0) 979; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 980; SI-NEXT: s_endpgm 981; 982; VI-LABEL: v_ctlz_i8_sel_eq_neg1: 983; VI: ; %bb.0: 984; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 985; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 986; VI-NEXT: s_mov_b32 s7, 0xf000 987; VI-NEXT: s_mov_b32 s6, -1 988; VI-NEXT: s_waitcnt lgkmcnt(0) 989; VI-NEXT: v_mov_b32_e32 v1, s1 990; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 991; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 992; VI-NEXT: flat_load_ubyte v0, v[0:1] 993; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 994; VI-NEXT: v_ffbh_u32_e32 v0, v0 995; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 996; VI-NEXT: s_endpgm 997; 998; EG-LABEL: v_ctlz_i8_sel_eq_neg1: 999; EG: ; %bb.0: 1000; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1001; EG-NEXT: TEX 0 @6 1002; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1003; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1004; EG-NEXT: CF_END 1005; EG-NEXT: PAD 1006; EG-NEXT: Fetch clause starting at 6: 1007; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1008; EG-NEXT: ALU clause starting at 8: 1009; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1010; EG-NEXT: ALU clause starting at 9: 
1011; EG-NEXT: FFBH_UINT T0.W, T0.X, 1012; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1013; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1014; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1015; EG-NEXT: LSHL * T1.W, PS, literal.y, 1016; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1017; EG-NEXT: LSHL T0.X, PV.W, PS, 1018; EG-NEXT: LSHL * T0.W, literal.x, PS, 1019; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1020; EG-NEXT: MOV T0.Y, 0.0, 1021; EG-NEXT: MOV * T0.Z, 0.0, 1022; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1023; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1024 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1025 %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid 1026 %val = load i8, i8 addrspace(1)* %valptr.gep 1027 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone 1028 %cmp = icmp eq i8 %val, 0 1029 %sel = select i1 %cmp, i8 -1, i8 %ctlz 1030 store i8 %sel, i8 addrspace(1)* %out 1031 ret void 1032} 1033 1034 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { 1035; SI-LABEL: v_ctlz_i16_sel_eq_neg1: 1036; SI: ; %bb.0: 1037; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1038; SI-NEXT: s_mov_b32 s3, 0xf000 1039; SI-NEXT: s_mov_b32 s2, -1 1040; SI-NEXT: s_mov_b32 s6, s2 1041; SI-NEXT: s_mov_b32 s7, s3 1042; SI-NEXT: s_waitcnt lgkmcnt(0) 1043; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 1044; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1045; SI-NEXT: s_waitcnt vmcnt(0) 1046; SI-NEXT: v_ffbh_u32_e32 v0, v0 1047; SI-NEXT: s_waitcnt lgkmcnt(0) 1048; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1049; SI-NEXT: s_endpgm 1050; 1051; VI-LABEL: v_ctlz_i16_sel_eq_neg1: 1052; VI: ; %bb.0: 1053; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1054; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1055; VI-NEXT: s_mov_b32 s7, 0xf000 1056; VI-NEXT: s_mov_b32 s6, -1 1057; VI-NEXT: s_mov_b32 s2, s6 1058; VI-NEXT: s_mov_b32 s3, s7 1059; VI-NEXT: s_waitcnt lgkmcnt(0) 1060; VI-NEXT: 
buffer_load_ushort v0, off, s[0:3], 0 1061; VI-NEXT: s_waitcnt vmcnt(0) 1062; VI-NEXT: v_ffbh_u32_e32 v1, v0 1063; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 1064; VI-NEXT: v_cndmask_b32_e64 v0, 32, v1, s[0:1] 1065; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 1066; VI-NEXT: v_mov_b32_e32 v1, 0xffff 1067; VI-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1] 1068; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 1069; VI-NEXT: s_endpgm 1070; 1071; EG-LABEL: v_ctlz_i16_sel_eq_neg1: 1072; EG: ; %bb.0: 1073; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1074; EG-NEXT: TEX 0 @6 1075; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1076; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1077; EG-NEXT: CF_END 1078; EG-NEXT: PAD 1079; EG-NEXT: Fetch clause starting at 6: 1080; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1081; EG-NEXT: ALU clause starting at 8: 1082; EG-NEXT: MOV * T0.X, KC0[2].Z, 1083; EG-NEXT: ALU clause starting at 9: 1084; EG-NEXT: FFBH_UINT T0.W, T0.X, 1085; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1086; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1087; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1088; EG-NEXT: LSHL * T1.W, PS, literal.y, 1089; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1090; EG-NEXT: LSHL T0.X, PV.W, PS, 1091; EG-NEXT: LSHL * T0.W, literal.x, PS, 1092; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1093; EG-NEXT: MOV T0.Y, 0.0, 1094; EG-NEXT: MOV * T0.Z, 0.0, 1095; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1096; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1097 %val = load i16, i16 addrspace(1)* %valptr 1098 %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone 1099 %cmp = icmp eq i16 %val, 0 1100 %sel = select i1 %cmp, i16 -1, i16 %ctlz 1101 store i16 %sel, i16 addrspace(1)* %out 1102 ret void 1103} 1104 1105; FIXME: Need to handle non-uniform case for function below (load without gep). 
1106define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { 1107; SI-LABEL: v_ctlz_i7_sel_eq_neg1: 1108; SI: ; %bb.0: 1109; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1110; SI-NEXT: s_mov_b32 s3, 0xf000 1111; SI-NEXT: v_mov_b32_e32 v1, 0 1112; SI-NEXT: s_mov_b32 s6, 0 1113; SI-NEXT: s_mov_b32 s7, s3 1114; SI-NEXT: s_waitcnt lgkmcnt(0) 1115; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1116; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1117; SI-NEXT: s_mov_b32 s2, -1 1118; SI-NEXT: s_waitcnt vmcnt(0) 1119; SI-NEXT: v_ffbh_u32_e32 v0, v0 1120; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 1121; SI-NEXT: s_waitcnt lgkmcnt(0) 1122; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1123; SI-NEXT: s_endpgm 1124; 1125; VI-LABEL: v_ctlz_i7_sel_eq_neg1: 1126; VI: ; %bb.0: 1127; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1128; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1129; VI-NEXT: s_mov_b32 s7, 0xf000 1130; VI-NEXT: s_mov_b32 s6, -1 1131; VI-NEXT: s_waitcnt lgkmcnt(0) 1132; VI-NEXT: v_mov_b32_e32 v1, s1 1133; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1134; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1135; VI-NEXT: flat_load_ubyte v0, v[0:1] 1136; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1137; VI-NEXT: v_ffbh_u32_e32 v0, v0 1138; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 1139; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1140; VI-NEXT: s_endpgm 1141; 1142; EG-LABEL: v_ctlz_i7_sel_eq_neg1: 1143; EG: ; %bb.0: 1144; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1145; EG-NEXT: TEX 0 @6 1146; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1147; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1148; EG-NEXT: CF_END 1149; EG-NEXT: PAD 1150; EG-NEXT: Fetch clause starting at 6: 1151; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1152; EG-NEXT: ALU clause starting at 8: 1153; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1154; EG-NEXT: ALU clause starting at 9: 1155; EG-NEXT: FFBH_UINT T0.W, T0.X, 1156; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1157; 
EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1158; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1159; EG-NEXT: LSHL * T1.W, PS, literal.y, 1160; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45) 1161; EG-NEXT: LSHL T0.X, PV.W, PS, 1162; EG-NEXT: LSHL * T0.W, literal.x, PS, 1163; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1164; EG-NEXT: MOV T0.Y, 0.0, 1165; EG-NEXT: MOV * T0.Z, 0.0, 1166; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1167; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1168 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1169 %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid 1170 %val = load i7, i7 addrspace(1)* %valptr.gep 1171 %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone 1172 %cmp = icmp eq i7 %val, 0 1173 %sel = select i1 %cmp, i7 -1, i7 %ctlz 1174 store i7 %sel, i7 addrspace(1)* %out 1175 ret void 1176} 1177