1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI 4; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG 5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10 6; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL 7 8declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone 9declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone 10declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone 11 12declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone 13declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone 14declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone 15 16declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone 17declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone 18declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone 19 20declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 21 22define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { 23; SI-LABEL: s_ctlz_i32: 24; SI: ; %bb.0: 25; SI-NEXT: s_load_dword s2, s[0:1], 0xb 26; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 27; SI-NEXT: s_mov_b32 s3, 0xf000 28; SI-NEXT: s_waitcnt lgkmcnt(0) 29; SI-NEXT: s_flbit_i32_b32 s2, s2 30; SI-NEXT: s_min_u32 s4, s2, 32 31; SI-NEXT: s_mov_b32 s2, -1 32; SI-NEXT: v_mov_b32_e32 v0, s4 33; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 34; SI-NEXT: s_endpgm 35; 36; VI-LABEL: s_ctlz_i32: 37; VI: ; %bb.0: 38; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 39; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 40; VI-NEXT: s_mov_b32 s7, 
0xf000 41; VI-NEXT: s_mov_b32 s6, -1 42; VI-NEXT: s_waitcnt lgkmcnt(0) 43; VI-NEXT: s_flbit_i32_b32 s0, s0 44; VI-NEXT: s_min_u32 s0, s0, 32 45; VI-NEXT: v_mov_b32_e32 v0, s0 46; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 47; VI-NEXT: s_endpgm 48; 49; EG-LABEL: s_ctlz_i32: 50; EG: ; %bb.0: 51; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 52; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 53; EG-NEXT: CF_END 54; EG-NEXT: PAD 55; EG-NEXT: ALU clause starting at 4: 56; EG-NEXT: FFBH_UINT * T0.W, KC0[2].Z, 57; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W, 58; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 59; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 60; 61; GFX10-LABEL: s_ctlz_i32: 62; GFX10: ; %bb.0: 63; GFX10-NEXT: s_clause 0x1 64; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c 65; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 66; GFX10-NEXT: v_mov_b32_e32 v0, 0 67; GFX10-NEXT: s_waitcnt lgkmcnt(0) 68; GFX10-NEXT: s_flbit_i32_b32 s0, s4 69; GFX10-NEXT: s_min_u32 s0, s0, 32 70; GFX10-NEXT: v_mov_b32_e32 v1, s0 71; GFX10-NEXT: global_store_dword v0, v1, s[2:3] 72; GFX10-NEXT: s_endpgm 73; 74; GFX10-GISEL-LABEL: s_ctlz_i32: 75; GFX10-GISEL: ; %bb.0: 76; GFX10-GISEL-NEXT: s_clause 0x1 77; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c 78; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 79; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 80; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 81; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 82; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 83; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 84; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] 85; GFX10-GISEL-NEXT: s_endpgm 86 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 87 store i32 %ctlz, i32 addrspace(1)* %out, align 4 88 ret void 89} 90 91define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 92; SI-LABEL: v_ctlz_i32: 93; SI: ; %bb.0: 94; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 95; SI-NEXT: s_mov_b32 s3, 
0xf000 96; SI-NEXT: s_mov_b32 s6, 0 97; SI-NEXT: s_mov_b32 s7, s3 98; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 99; SI-NEXT: v_mov_b32_e32 v1, 0 100; SI-NEXT: s_waitcnt lgkmcnt(0) 101; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 102; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 103; SI-NEXT: s_mov_b32 s2, -1 104; SI-NEXT: s_waitcnt vmcnt(0) 105; SI-NEXT: v_ffbh_u32_e32 v0, v0 106; SI-NEXT: v_min_u32_e32 v0, 32, v0 107; SI-NEXT: s_waitcnt lgkmcnt(0) 108; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 109; SI-NEXT: s_endpgm 110; 111; VI-LABEL: v_ctlz_i32: 112; VI: ; %bb.0: 113; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 114; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 115; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 116; VI-NEXT: s_mov_b32 s7, 0xf000 117; VI-NEXT: s_mov_b32 s6, -1 118; VI-NEXT: s_waitcnt lgkmcnt(0) 119; VI-NEXT: v_mov_b32_e32 v1, s1 120; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 121; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 122; VI-NEXT: flat_load_dword v0, v[0:1] 123; VI-NEXT: s_waitcnt vmcnt(0) 124; VI-NEXT: v_ffbh_u32_e32 v0, v0 125; VI-NEXT: v_min_u32_e32 v0, 32, v0 126; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 127; VI-NEXT: s_endpgm 128; 129; EG-LABEL: v_ctlz_i32: 130; EG: ; %bb.0: 131; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 132; EG-NEXT: TEX 0 @6 133; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 134; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 135; EG-NEXT: CF_END 136; EG-NEXT: PAD 137; EG-NEXT: Fetch clause starting at 6: 138; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 139; EG-NEXT: ALU clause starting at 8: 140; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 141; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 142; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 143; EG-NEXT: ALU clause starting at 11: 144; EG-NEXT: FFBH_UINT * T0.W, T0.X, 145; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 146; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 147; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 148; 149; GFX10-LABEL: v_ctlz_i32: 150; GFX10: ; %bb.0: 151; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 152; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 153; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 154; GFX10-NEXT: v_mov_b32_e32 v1, 0 155; GFX10-NEXT: s_waitcnt lgkmcnt(0) 156; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 157; GFX10-NEXT: s_waitcnt vmcnt(0) 158; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 159; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 160; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 161; GFX10-NEXT: s_endpgm 162; 163; GFX10-GISEL-LABEL: v_ctlz_i32: 164; GFX10-GISEL: ; %bb.0: 165; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 166; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 167; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 168; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 169; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 170; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 171; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 172; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 173; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 174; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 175; GFX10-GISEL-NEXT: s_endpgm 176 %tid = call i32 @llvm.amdgcn.workitem.id.x() 177 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 178 %val = load i32, i32 addrspace(1)* %in.gep, align 4 179 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 180 store i32 %ctlz, i32 addrspace(1)* %out, align 4 181 ret void 182} 183 184define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { 185; SI-LABEL: v_ctlz_v2i32: 186; SI: ; %bb.0: 187; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 188; SI-NEXT: s_mov_b32 s3, 0xf000 189; SI-NEXT: s_mov_b32 s6, 0 190; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 191; SI-NEXT: v_mov_b32_e32 v1, 0 192; SI-NEXT: s_mov_b32 s7, s3 193; SI-NEXT: s_waitcnt lgkmcnt(0) 194; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 195; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 196; SI-NEXT: s_mov_b32 s2, -1 197; SI-NEXT: s_waitcnt 
vmcnt(0) 198; SI-NEXT: v_ffbh_u32_e32 v1, v1 199; SI-NEXT: v_ffbh_u32_e32 v0, v0 200; SI-NEXT: v_min_u32_e32 v1, 32, v1 201; SI-NEXT: v_min_u32_e32 v0, 32, v0 202; SI-NEXT: s_waitcnt lgkmcnt(0) 203; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 204; SI-NEXT: s_endpgm 205; 206; VI-LABEL: v_ctlz_v2i32: 207; VI: ; %bb.0: 208; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 209; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 210; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 211; VI-NEXT: s_mov_b32 s7, 0xf000 212; VI-NEXT: s_mov_b32 s6, -1 213; VI-NEXT: s_waitcnt lgkmcnt(0) 214; VI-NEXT: v_mov_b32_e32 v1, s1 215; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 216; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 217; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 218; VI-NEXT: s_waitcnt vmcnt(0) 219; VI-NEXT: v_ffbh_u32_e32 v1, v1 220; VI-NEXT: v_ffbh_u32_e32 v0, v0 221; VI-NEXT: v_min_u32_e32 v1, 32, v1 222; VI-NEXT: v_min_u32_e32 v0, 32, v0 223; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 224; VI-NEXT: s_endpgm 225; 226; EG-LABEL: v_ctlz_v2i32: 227; EG: ; %bb.0: 228; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 229; EG-NEXT: TEX 0 @6 230; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] 231; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 232; EG-NEXT: CF_END 233; EG-NEXT: PAD 234; EG-NEXT: Fetch clause starting at 6: 235; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 236; EG-NEXT: ALU clause starting at 8: 237; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 238; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 239; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 240; EG-NEXT: ALU clause starting at 11: 241; EG-NEXT: FFBH_UINT * T0.W, T0.Y, 242; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 243; EG-NEXT: FFBH_UINT * T0.W, T0.X, 244; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 245; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 246; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 247; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 248; 249; GFX10-LABEL: v_ctlz_v2i32: 250; GFX10: ; %bb.0: 251; GFX10-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x2c 252; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 253; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 254; GFX10-NEXT: v_mov_b32_e32 v2, 0 255; GFX10-NEXT: s_waitcnt lgkmcnt(0) 256; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 257; GFX10-NEXT: s_waitcnt vmcnt(0) 258; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 259; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 260; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 261; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 262; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 263; GFX10-NEXT: s_endpgm 264; 265; GFX10-GISEL-LABEL: v_ctlz_v2i32: 266; GFX10-GISEL: ; %bb.0: 267; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 268; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 269; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 270; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 271; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 272; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 273; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 274; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 275; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 276; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 277; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 278; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 279; GFX10-GISEL-NEXT: s_endpgm 280 %tid = call i32 @llvm.amdgcn.workitem.id.x() 281 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid 282 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 283 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone 284 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 285 ret void 286} 287 288define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { 289; SI-LABEL: v_ctlz_v4i32: 290; SI: ; %bb.0: 291; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 292; SI-NEXT: s_mov_b32 s3, 0xf000 293; SI-NEXT: s_mov_b32 s6, 0 294; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 295; SI-NEXT: v_mov_b32_e32 v1, 0 296; SI-NEXT: 
s_mov_b32 s7, s3 297; SI-NEXT: s_waitcnt lgkmcnt(0) 298; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 299; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 300; SI-NEXT: s_mov_b32 s2, -1 301; SI-NEXT: s_waitcnt vmcnt(0) 302; SI-NEXT: v_ffbh_u32_e32 v3, v3 303; SI-NEXT: v_ffbh_u32_e32 v2, v2 304; SI-NEXT: v_ffbh_u32_e32 v1, v1 305; SI-NEXT: v_ffbh_u32_e32 v0, v0 306; SI-NEXT: v_min_u32_e32 v3, 32, v3 307; SI-NEXT: v_min_u32_e32 v2, 32, v2 308; SI-NEXT: v_min_u32_e32 v1, 32, v1 309; SI-NEXT: v_min_u32_e32 v0, 32, v0 310; SI-NEXT: s_waitcnt lgkmcnt(0) 311; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 312; SI-NEXT: s_endpgm 313; 314; VI-LABEL: v_ctlz_v4i32: 315; VI: ; %bb.0: 316; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 317; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 318; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 319; VI-NEXT: s_mov_b32 s7, 0xf000 320; VI-NEXT: s_mov_b32 s6, -1 321; VI-NEXT: s_waitcnt lgkmcnt(0) 322; VI-NEXT: v_mov_b32_e32 v1, s1 323; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 324; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 325; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 326; VI-NEXT: s_waitcnt vmcnt(0) 327; VI-NEXT: v_ffbh_u32_e32 v3, v3 328; VI-NEXT: v_ffbh_u32_e32 v2, v2 329; VI-NEXT: v_ffbh_u32_e32 v1, v1 330; VI-NEXT: v_ffbh_u32_e32 v0, v0 331; VI-NEXT: v_min_u32_e32 v3, 32, v3 332; VI-NEXT: v_min_u32_e32 v2, 32, v2 333; VI-NEXT: v_min_u32_e32 v1, 32, v1 334; VI-NEXT: v_min_u32_e32 v0, 32, v0 335; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 336; VI-NEXT: s_endpgm 337; 338; EG-LABEL: v_ctlz_v4i32: 339; EG: ; %bb.0: 340; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 341; EG-NEXT: TEX 0 @6 342; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 343; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 344; EG-NEXT: CF_END 345; EG-NEXT: PAD 346; EG-NEXT: Fetch clause starting at 6: 347; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 348; EG-NEXT: ALU clause starting at 8: 349; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 350; EG-NEXT: 
4(5.605194e-45), 0(0.000000e+00) 351; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 352; EG-NEXT: ALU clause starting at 11: 353; EG-NEXT: FFBH_UINT * T1.W, T0.W, 354; EG-NEXT: FFBH_UINT T2.W, T0.Z, 355; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122 356; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 357; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W, 358; EG-NEXT: FFBH_UINT * T1.W, T0.Y, 359; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 360; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 361; EG-NEXT: FFBH_UINT * T1.W, T0.X, 362; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 363; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 364; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 365; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 366; 367; GFX10-LABEL: v_ctlz_v4i32: 368; GFX10: ; %bb.0: 369; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 370; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 371; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 372; GFX10-NEXT: v_mov_b32_e32 v4, 0 373; GFX10-NEXT: s_waitcnt lgkmcnt(0) 374; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 375; GFX10-NEXT: s_waitcnt vmcnt(0) 376; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 377; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 378; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 379; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 380; GFX10-NEXT: v_min_u32_e32 v3, 32, v3 381; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 382; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 383; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 384; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 385; GFX10-NEXT: s_endpgm 386; 387; GFX10-GISEL-LABEL: v_ctlz_v4i32: 388; GFX10-GISEL: ; %bb.0: 389; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 390; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 391; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 392; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 393; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 394; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 395; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 396; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 
397; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 398; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 399; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 400; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 401; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 402; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 403; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 404; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 405; GFX10-GISEL-NEXT: s_endpgm 406 %tid = call i32 @llvm.amdgcn.workitem.id.x() 407 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid 408 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 409 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone 410 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 411 ret void 412} 413 414define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { 415; SI-LABEL: v_ctlz_i8: 416; SI: ; %bb.0: 417; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 418; SI-NEXT: s_mov_b32 s3, 0xf000 419; SI-NEXT: s_mov_b32 s2, -1 420; SI-NEXT: s_mov_b32 s6, s2 421; SI-NEXT: s_mov_b32 s7, s3 422; SI-NEXT: s_waitcnt lgkmcnt(0) 423; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 424; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 425; SI-NEXT: s_waitcnt vmcnt(0) 426; SI-NEXT: v_ffbh_u32_e32 v0, v0 427; SI-NEXT: v_min_u32_e32 v0, 32, v0 428; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 429; SI-NEXT: s_waitcnt lgkmcnt(0) 430; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 431; SI-NEXT: s_endpgm 432; 433; VI-LABEL: v_ctlz_i8: 434; VI: ; %bb.0: 435; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 436; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 437; VI-NEXT: s_mov_b32 s7, 0xf000 438; VI-NEXT: s_mov_b32 s6, -1 439; VI-NEXT: s_mov_b32 s2, s6 440; VI-NEXT: s_mov_b32 s3, s7 441; VI-NEXT: s_waitcnt lgkmcnt(0) 442; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 443; VI-NEXT: s_waitcnt vmcnt(0) 444; VI-NEXT: v_ffbh_u32_e32 v0, v0 445; VI-NEXT: v_min_u32_e32 v0, 
32, v0 446; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 447; VI-NEXT: v_add_u16_e32 v0, -8, v0 448; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 449; VI-NEXT: s_endpgm 450; 451; EG-LABEL: v_ctlz_i8: 452; EG: ; %bb.0: 453; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 454; EG-NEXT: TEX 0 @6 455; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] 456; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 457; EG-NEXT: CF_END 458; EG-NEXT: PAD 459; EG-NEXT: Fetch clause starting at 6: 460; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 461; EG-NEXT: ALU clause starting at 8: 462; EG-NEXT: MOV * T0.X, KC0[2].Z, 463; EG-NEXT: ALU clause starting at 9: 464; EG-NEXT: FFBH_UINT * T0.W, T0.X, 465; EG-NEXT: CNDE_INT T0.W, T0.X, literal.x, PV.W, 466; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 467; EG-NEXT: 32(4.484155e-44), 3(4.203895e-45) 468; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 469; EG-NEXT: -24(nan), 0(0.000000e+00) 470; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 471; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 472; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 473; EG-NEXT: LSHL T0.X, PV.W, PS, 474; EG-NEXT: LSHL * T0.W, literal.x, PS, 475; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 476; EG-NEXT: MOV T0.Y, 0.0, 477; EG-NEXT: MOV * T0.Z, 0.0, 478; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 479; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 480; 481; GFX10-LABEL: v_ctlz_i8: 482; GFX10: ; %bb.0: 483; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 484; GFX10-NEXT: v_mov_b32_e32 v0, 0 485; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 486; GFX10-NEXT: s_waitcnt lgkmcnt(0) 487; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] 488; GFX10-NEXT: s_waitcnt vmcnt(0) 489; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 490; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 491; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1 492; GFX10-NEXT: v_add_nc_u16 v1, v1, -8 493; GFX10-NEXT: global_store_byte v0, v1, s[0:1] 494; GFX10-NEXT: s_endpgm 495; 496; GFX10-GISEL-LABEL: v_ctlz_i8: 497; GFX10-GISEL: ; %bb.0: 498; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 
499; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 500; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 501; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 502; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 503; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 504; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 505; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 506; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 507; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] 508; GFX10-GISEL-NEXT: s_endpgm 509 %val = load i8, i8 addrspace(1)* %valptr 510 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone 511 store i8 %ctlz, i8 addrspace(1)* %out 512 ret void 513} 514 515define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind { 516; SI-LABEL: s_ctlz_i64: 517; SI: ; %bb.0: 518; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 519; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 520; SI-NEXT: s_mov_b32 s3, 0xf000 521; SI-NEXT: s_mov_b32 s2, -1 522; SI-NEXT: s_waitcnt lgkmcnt(0) 523; SI-NEXT: s_flbit_i32_b32 s4, s4 524; SI-NEXT: s_flbit_i32_b32 s5, s5 525; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf 526; SI-NEXT: v_mov_b32_e32 v0, s5 527; SI-NEXT: s_add_i32 s4, s4, 32 528; SI-NEXT: v_min3_u32 v0, s4, v0, 64 529; SI-NEXT: v_mov_b32_e32 v1, 0 530; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 531; SI-NEXT: s_endpgm 532; 533; VI-LABEL: s_ctlz_i64: 534; VI: ; %bb.0: 535; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 536; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c 537; VI-NEXT: s_mov_b32 s7, 0xf000 538; VI-NEXT: s_mov_b32 s6, -1 539; VI-NEXT: v_mov_b32_e32 v1, 0 540; VI-NEXT: s_waitcnt lgkmcnt(0) 541; VI-NEXT: s_flbit_i32_b32 s0, s0 542; VI-NEXT: v_add_u32_e64 v0, s[2:3], s0, 32 clamp 543; VI-NEXT: s_flbit_i32_b32 s0, s1 544; VI-NEXT: v_min3_u32 v0, v0, s0, 64 545; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 546; VI-NEXT: s_endpgm 547; 548; EG-LABEL: s_ctlz_i64: 549; EG: ; %bb.0: 550; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 551; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 552; EG-NEXT: CF_END 553; EG-NEXT: PAD 554; EG-NEXT: ALU clause starting at 4: 555; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W, 556; EG-NEXT: CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W, 557; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 558; EG-NEXT: FFBH_UINT T1.W, KC0[5].X, 559; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 560; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 561; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W, 562; EG-NEXT: MOV T0.Y, 0.0, 563; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 564; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 565; 566; GFX10-LABEL: s_ctlz_i64: 567; GFX10: ; %bb.0: 568; GFX10-NEXT: s_clause 0x1 569; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c 570; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 571; GFX10-NEXT: v_mov_b32_e32 v1, 0 572; GFX10-NEXT: s_waitcnt lgkmcnt(0) 573; GFX10-NEXT: s_flbit_i32_b32 s0, s2 574; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp 575; GFX10-NEXT: s_flbit_i32_b32 s0, s3 576; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 577; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] 578; GFX10-NEXT: s_endpgm 579; 580; GFX10-GISEL-LABEL: s_ctlz_i64: 581; GFX10-GISEL: ; %bb.0: 582; GFX10-GISEL-NEXT: s_clause 0x1 583; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c 584; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 585; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 586; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 587; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] 588; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 589; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 590; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 591; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 592; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 593; GFX10-GISEL-NEXT: s_endpgm 594 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 595 store i64 %ctlz, i64 addrspace(1)* %out 596 ret void 597} 598 599define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { 600; SI-LABEL: 
s_ctlz_i64_trunc: 601; SI: ; %bb.0: 602; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 603; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 604; SI-NEXT: s_mov_b32 s3, 0xf000 605; SI-NEXT: s_mov_b32 s2, -1 606; SI-NEXT: s_waitcnt lgkmcnt(0) 607; SI-NEXT: s_flbit_i32_b32 s4, s4 608; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf 609; SI-NEXT: s_flbit_i32_b32 s5, s5 610; SI-NEXT: s_add_i32 s4, s4, 32 611; SI-NEXT: v_mov_b32_e32 v0, s5 612; SI-NEXT: v_min3_u32 v0, s4, v0, 64 613; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 614; SI-NEXT: s_endpgm 615; 616; VI-LABEL: s_ctlz_i64_trunc: 617; VI: ; %bb.0: 618; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 619; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 620; VI-NEXT: s_mov_b32 s7, 0xf000 621; VI-NEXT: s_mov_b32 s6, -1 622; VI-NEXT: s_waitcnt lgkmcnt(0) 623; VI-NEXT: s_flbit_i32_b32 s0, s0 624; VI-NEXT: v_add_u32_e64 v0, s[2:3], s0, 32 clamp 625; VI-NEXT: s_flbit_i32_b32 s0, s1 626; VI-NEXT: v_min3_u32 v0, v0, s0, 64 627; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 628; VI-NEXT: s_endpgm 629; 630; EG-LABEL: s_ctlz_i64_trunc: 631; EG: ; %bb.0: 632; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 633; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 634; EG-NEXT: CF_END 635; EG-NEXT: PAD 636; EG-NEXT: ALU clause starting at 4: 637; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W, 638; EG-NEXT: CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W, 639; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 640; EG-NEXT: FFBH_UINT T1.W, KC0[3].X, 641; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 642; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 643; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W, 644; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 645; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 646; 647; GFX10-LABEL: s_ctlz_i64_trunc: 648; GFX10: ; %bb.0: 649; GFX10-NEXT: s_clause 0x1 650; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 651; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 652; GFX10-NEXT: v_mov_b32_e32 v1, 0 653; GFX10-NEXT: s_waitcnt lgkmcnt(0) 654; GFX10-NEXT: 
s_flbit_i32_b32 s0, s2 655; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp 656; GFX10-NEXT: s_flbit_i32_b32 s0, s3 657; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 658; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 659; GFX10-NEXT: s_endpgm 660; 661; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: 662; GFX10-GISEL: ; %bb.0: 663; GFX10-GISEL-NEXT: s_clause 0x1 664; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 665; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 666; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 667; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 668; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] 669; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 670; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 671; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] 672; GFX10-GISEL-NEXT: s_endpgm 673 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 674 %trunc = trunc i64 %ctlz to i32 675 store i32 %trunc, i32 addrspace(1)* %out 676 ret void 677} 678 679define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { 680; SI-LABEL: v_ctlz_i64: 681; SI: ; %bb.0: 682; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 683; SI-NEXT: s_mov_b32 s7, 0xf000 684; SI-NEXT: s_mov_b32 s6, 0 685; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 686; SI-NEXT: v_mov_b32_e32 v1, 0 687; SI-NEXT: s_waitcnt lgkmcnt(0) 688; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 689; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 690; SI-NEXT: s_waitcnt vmcnt(0) 691; SI-NEXT: v_ffbh_u32_e32 v2, v2 692; SI-NEXT: v_min_u32_e32 v2, 0xffffffdf, v2 693; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 694; SI-NEXT: v_ffbh_u32_e32 v3, v3 695; SI-NEXT: v_min3_u32 v2, v2, v3, 64 696; SI-NEXT: v_mov_b32_e32 v3, v1 697; SI-NEXT: s_waitcnt lgkmcnt(0) 698; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 699; SI-NEXT: s_endpgm 700; 701; VI-LABEL: v_ctlz_i64: 702; VI: ; %bb.0: 703; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 704; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 705; VI-NEXT: 
v_lshlrev_b32_e32 v3, 3, v0 706; VI-NEXT: v_mov_b32_e32 v2, 0 707; VI-NEXT: s_waitcnt lgkmcnt(0) 708; VI-NEXT: v_mov_b32_e32 v4, s3 709; VI-NEXT: v_mov_b32_e32 v1, s1 710; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3 711; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 712; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 713; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3 714; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 715; VI-NEXT: s_waitcnt vmcnt(0) 716; VI-NEXT: v_ffbh_u32_e32 v0, v0 717; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp 718; VI-NEXT: v_ffbh_u32_e32 v1, v1 719; VI-NEXT: v_min3_u32 v1, v0, v1, 64 720; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] 721; VI-NEXT: s_endpgm 722; 723; EG-LABEL: v_ctlz_i64: 724; EG: ; %bb.0: 725; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 726; EG-NEXT: TEX 0 @6 727; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 728; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 729; EG-NEXT: CF_END 730; EG-NEXT: PAD 731; EG-NEXT: Fetch clause starting at 6: 732; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 733; EG-NEXT: ALU clause starting at 8: 734; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 735; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 736; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 737; EG-NEXT: ALU clause starting at 11: 738; EG-NEXT: FFBH_UINT * T1.W, T0.X, 739; EG-NEXT: CNDE_INT * T1.W, T0.X, literal.x, PV.W, 740; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 741; EG-NEXT: FFBH_UINT T2.W, T0.Y, 742; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x, 743; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 744; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W, 745; EG-NEXT: MOV T0.Y, 0.0, 746; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 747; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 748; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 749; 750; GFX10-LABEL: v_ctlz_i64: 751; GFX10: ; %bb.0: 752; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 753; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 754; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 755; GFX10-NEXT: s_waitcnt lgkmcnt(0) 756; GFX10-NEXT: 
global_load_dwordx2 v[0:1], v2, s[2:3] 757; GFX10-NEXT: s_waitcnt vmcnt(0) 758; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 759; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 760; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 761; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 762; GFX10-NEXT: v_mov_b32_e32 v1, 0 763; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 764; GFX10-NEXT: s_endpgm 765; 766; GFX10-GISEL-LABEL: v_ctlz_i64: 767; GFX10-GISEL: ; %bb.0: 768; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 769; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 770; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 771; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 772; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 773; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 774; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 775; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 776; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 777; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 778; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 779; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 780; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 781; GFX10-GISEL-NEXT: s_endpgm 782 %tid = call i32 @llvm.amdgcn.workitem.id.x() 783 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid 784 %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid 785 %val = load i64, i64 addrspace(1)* %in.gep 786 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 787 store i64 %ctlz, i64 addrspace(1)* %out.gep 788 ret void 789} 790 791define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { 792; SI-LABEL: v_ctlz_i64_trunc: 793; SI: ; %bb.0: 794; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 795; SI-NEXT: s_mov_b32 s7, 0xf000 796; SI-NEXT: s_mov_b32 s6, 0 797; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 798; SI-NEXT: v_mov_b32_e32 v2, 0 799; SI-NEXT: s_waitcnt lgkmcnt(0) 800; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 801; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 
802; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 803; SI-NEXT: s_waitcnt vmcnt(0) 804; SI-NEXT: v_ffbh_u32_e32 v0, v3 805; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0 806; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 807; SI-NEXT: v_ffbh_u32_e32 v3, v4 808; SI-NEXT: v_min3_u32 v0, v0, v3, 64 809; SI-NEXT: s_waitcnt lgkmcnt(0) 810; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 811; SI-NEXT: s_endpgm 812; 813; VI-LABEL: v_ctlz_i64_trunc: 814; VI: ; %bb.0: 815; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 816; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 817; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 818; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 819; VI-NEXT: s_waitcnt lgkmcnt(0) 820; VI-NEXT: v_mov_b32_e32 v4, s3 821; VI-NEXT: v_mov_b32_e32 v2, s1 822; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 823; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 824; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] 825; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 826; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 827; VI-NEXT: s_waitcnt vmcnt(0) 828; VI-NEXT: v_ffbh_u32_e32 v0, v1 829; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp 830; VI-NEXT: v_ffbh_u32_e32 v1, v2 831; VI-NEXT: v_min3_u32 v0, v0, v1, 64 832; VI-NEXT: flat_store_dword v[3:4], v0 833; VI-NEXT: s_endpgm 834; 835; EG-LABEL: v_ctlz_i64_trunc: 836; EG: ; %bb.0: 837; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 838; EG-NEXT: TEX 0 @6 839; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 840; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 841; EG-NEXT: CF_END 842; EG-NEXT: PAD 843; EG-NEXT: Fetch clause starting at 6: 844; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 845; EG-NEXT: ALU clause starting at 8: 846; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 847; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 848; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W, 849; EG-NEXT: ALU clause starting at 11: 850; EG-NEXT: FFBH_UINT * T0.W, T1.X, 851; EG-NEXT: CNDE_INT * T0.W, T1.X, literal.x, PV.W, 852; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 853; EG-NEXT: LSHL T0.Z, T0.X, literal.x, 854; 
EG-NEXT: FFBH_UINT T1.W, T1.Y, 855; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y, 856; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 857; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W, 858; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z, 859; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 860; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 861; 862; GFX10-LABEL: v_ctlz_i64_trunc: 863; GFX10: ; %bb.0: 864; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 865; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 866; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 867; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 868; GFX10-NEXT: s_waitcnt lgkmcnt(0) 869; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 870; GFX10-NEXT: s_waitcnt vmcnt(0) 871; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 872; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 873; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 874; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 875; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 876; GFX10-NEXT: s_endpgm 877; 878; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: 879; GFX10-GISEL: ; %bb.0: 880; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 881; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 882; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 883; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 884; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 885; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 886; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 887; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 888; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 889; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 890; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 891; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 892; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 893; GFX10-GISEL-NEXT: s_endpgm 894 %tid = call i32 @llvm.amdgcn.workitem.id.x() 895 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid 896 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid 897 %val = load i64, i64 addrspace(1)* %in.gep 898 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 
false)
  %trunc = trunc i64 %ctlz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}

; Per-workitem i32 load followed by select(%val == 0, -1, ctlz(%val)) with the
; zero-is-defined form of llvm.ctlz. In the SI, VI and GFX10 checks the only
; ALU op between the load and the store is v_ffbh_u32; the GFX10-GISEL checks
; still contain v_cmp/v_min/v_cndmask.
define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Same pattern with the predicate and select operands swapped:
; select(%val != 0, ctlz(%val), -1). The SI, VI, EG and GFX10 checks are the
; same instruction sequences as in v_ctlz_i32_sel_eq_neg1.
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; TODO: Should be able to eliminate select here as well.
; The compare is on the ctlz result rather than on the loaded value:
; select(ctlz(%val) == 32, -1, ctlz(%val)). Every GCN check block below keeps
; the v_min_u32 / v_cmp / v_cndmask sequence after v_ffbh_u32.
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    SETE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PV.W, T0.W, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %ctlz, 32
  %sel = select i1 %cmp, i32 -1, i32 %ctlz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Inverted form of the bitwidth compare:
; select(ctlz(%val) != 32, ctlz(%val), -1). As in the kernel above, the
; compare/select remains in every GCN check block.
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PV.W, literal.x, T0.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %ctlz, 32
  %sel = select i1 %cmp, i32 %ctlz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; i8 version of select(%val == 0, -1, ctlz(%val)); the byte is loaded through
; a gep indexed by the workitem id. SI, VI and GFX10 checks show a ubyte load,
; v_ffbh_u32 and a byte store with nothing in between.
define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    FFBH_UINT T0.W, T0.X,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
  %val = load i8, i8 addrspace(1)* %valptr.gep
  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %ctlz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}

; i16 version of select(%val == 0, -1, ctlz(%val)). Unlike the i8 kernel
; above, the load reads %valptr directly with no workitem-id gep. The SI
; checks show only v_ffbh_u32 between load and store; the VI and GFX10 checks
; keep the min / add -16 / cmp / cndmask-with-0xffff sequence.
define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s2, s6
; VI-NEXT:    s_mov_b32 s3, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    FFBH_UINT T0.W, T0.X,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT:    s_mov_b32 s2, 0xffff
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, s2, vcc_lo
; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(1)* %valptr
  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %ctlz
  store i16 %sel, i16 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the i7 kernel that follows indexes %valptr through a
; workitem-id gep; the gep-less load is in v_ctlz_i16_sel_eq_neg1 above.
; Confirm which kernel this FIXME is meant to cover.
; i7 (non-power-of-two width) version of select(%val == 0, -1, ctlz(%val)),
; loaded through a gep indexed by the workitem id. In the SI, VI and GFX10
; checks v_ffbh_u32 is followed by a v_and_b32 with 0x7f before the byte store.
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    FFBH_UINT T0.W, T0.X,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT:    s_movk_i32 s2, 0x7f
; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 25, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
  %val = load i7, i7 addrspace(1)* %valptr.gep
  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
  %cmp = icmp eq i7 %val, 0
  %sel = select i1 %cmp, i7 -1, i7 %ctlz
  store i7 %sel, i7 addrspace(1)* %out
  ret void
}
