1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s 3; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s 4; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s 5; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s 6 7declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone 8declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone 9declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone 10declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone 11declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone 12declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone 13declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone 14declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 15 16define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { 17; SI-LABEL: s_cttz_zero_undef_i32: 18; SI: ; %bb.0: 19; SI-NEXT: s_load_dword s2, s[0:1], 0xb 20; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 21; SI-NEXT: s_mov_b32 s3, 0xf000 22; SI-NEXT: s_waitcnt lgkmcnt(0) 23; SI-NEXT: s_ff1_i32_b32 s4, s2 24; SI-NEXT: s_mov_b32 s2, -1 25; SI-NEXT: v_mov_b32_e32 v0, s4 26; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 27; SI-NEXT: s_endpgm 28; 29; VI-LABEL: s_cttz_zero_undef_i32: 30; VI: ; %bb.0: 31; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 32; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 33; VI-NEXT: s_waitcnt lgkmcnt(0) 34; VI-NEXT: s_ff1_i32_b32 s2, s2 35; VI-NEXT: v_mov_b32_e32 v0, s0 36; VI-NEXT: v_mov_b32_e32 v1, s1 37; VI-NEXT: v_mov_b32_e32 v2, s2 38; VI-NEXT: flat_store_dword v[0:1], v2 39; VI-NEXT: s_endpgm 40; 41; EG-LABEL: s_cttz_zero_undef_i32: 42; EG: ; %bb.0: 43; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 44; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 45; 
EG-NEXT: CF_END 46; EG-NEXT: PAD 47; EG-NEXT: ALU clause starting at 4: 48; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 49; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 50; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z, 51; 52; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: 53; GFX9-GISEL: ; %bb.0: 54; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c 55; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 56; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 57; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 58; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 59; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 60; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] 61; GFX9-GISEL-NEXT: s_endpgm 62 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone 63 store i32 %cttz, i32 addrspace(1)* %out, align 4 64 ret void 65} 66 67define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 68; SI-LABEL: v_cttz_zero_undef_i32: 69; SI: ; %bb.0: 70; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 71; SI-NEXT: s_mov_b32 s3, 0xf000 72; SI-NEXT: s_mov_b32 s6, 0 73; SI-NEXT: s_mov_b32 s7, s3 74; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 75; SI-NEXT: v_mov_b32_e32 v1, 0 76; SI-NEXT: s_waitcnt lgkmcnt(0) 77; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 78; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 79; SI-NEXT: s_mov_b32 s2, -1 80; SI-NEXT: s_waitcnt vmcnt(0) 81; SI-NEXT: v_ffbl_b32_e32 v0, v0 82; SI-NEXT: s_waitcnt lgkmcnt(0) 83; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 84; SI-NEXT: s_endpgm 85; 86; VI-LABEL: v_cttz_zero_undef_i32: 87; VI: ; %bb.0: 88; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 89; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 90; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 91; VI-NEXT: s_waitcnt lgkmcnt(0) 92; VI-NEXT: v_mov_b32_e32 v1, s3 93; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 94; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 95; VI-NEXT: flat_load_dword v0, v[0:1] 96; VI-NEXT: s_waitcnt vmcnt(0) 97; VI-NEXT: v_ffbl_b32_e32 v2, v0 98; 
VI-NEXT: v_mov_b32_e32 v0, s0 99; VI-NEXT: v_mov_b32_e32 v1, s1 100; VI-NEXT: flat_store_dword v[0:1], v2 101; VI-NEXT: s_endpgm 102; 103; EG-LABEL: v_cttz_zero_undef_i32: 104; EG: ; %bb.0: 105; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 106; EG-NEXT: TEX 0 @6 107; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] 108; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 109; EG-NEXT: CF_END 110; EG-NEXT: PAD 111; EG-NEXT: Fetch clause starting at 6: 112; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 113; EG-NEXT: ALU clause starting at 8: 114; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 115; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 116; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 117; EG-NEXT: ALU clause starting at 11: 118; EG-NEXT: FFBL_INT T0.X, T0.X, 119; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 120; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 121; 122; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: 123; GFX9-GISEL: ; %bb.0: 124; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 125; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 126; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 127; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 128; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 129; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 130; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 131; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 132; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 133; GFX9-GISEL-NEXT: s_endpgm 134 %tid = call i32 @llvm.amdgcn.workitem.id.x() 135 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 136 %val = load i32, i32 addrspace(1)* %in.gep, align 4 137 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone 138 store i32 %cttz, i32 addrspace(1)* %out, align 4 139 ret void 140} 141 142define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { 143; SI-LABEL: v_cttz_zero_undef_v2i32: 144; SI: ; %bb.0: 145; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 146; SI-NEXT: s_mov_b32 s3, 0xf000 
147; SI-NEXT: s_mov_b32 s6, 0 148; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 149; SI-NEXT: v_mov_b32_e32 v1, 0 150; SI-NEXT: s_mov_b32 s7, s3 151; SI-NEXT: s_waitcnt lgkmcnt(0) 152; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 153; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 154; SI-NEXT: s_mov_b32 s2, -1 155; SI-NEXT: s_waitcnt vmcnt(0) 156; SI-NEXT: v_ffbl_b32_e32 v1, v1 157; SI-NEXT: v_ffbl_b32_e32 v0, v0 158; SI-NEXT: s_waitcnt lgkmcnt(0) 159; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 160; SI-NEXT: s_endpgm 161; 162; VI-LABEL: v_cttz_zero_undef_v2i32: 163; VI: ; %bb.0: 164; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 165; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 166; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 167; VI-NEXT: s_waitcnt lgkmcnt(0) 168; VI-NEXT: v_mov_b32_e32 v1, s3 169; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 170; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 171; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 172; VI-NEXT: v_mov_b32_e32 v3, s1 173; VI-NEXT: v_mov_b32_e32 v2, s0 174; VI-NEXT: s_waitcnt vmcnt(0) 175; VI-NEXT: v_ffbl_b32_e32 v1, v1 176; VI-NEXT: v_ffbl_b32_e32 v0, v0 177; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 178; VI-NEXT: s_endpgm 179; 180; EG-LABEL: v_cttz_zero_undef_v2i32: 181; EG: ; %bb.0: 182; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 183; EG-NEXT: TEX 0 @6 184; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 185; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 186; EG-NEXT: CF_END 187; EG-NEXT: PAD 188; EG-NEXT: Fetch clause starting at 6: 189; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 190; EG-NEXT: ALU clause starting at 8: 191; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 192; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 193; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 194; EG-NEXT: ALU clause starting at 11: 195; EG-NEXT: FFBL_INT * T0.Y, T0.Y, 196; EG-NEXT: FFBL_INT T0.X, T0.X, 197; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 198; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 199; 200; GFX9-GISEL-LABEL: 
v_cttz_zero_undef_v2i32: 201; GFX9-GISEL: ; %bb.0: 202; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 203; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 204; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 205; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 206; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 207; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 208; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 209; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 210; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 211; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 212; GFX9-GISEL-NEXT: s_endpgm 213 %tid = call i32 @llvm.amdgcn.workitem.id.x() 214 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid 215 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 216 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone 217 store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 218 ret void 219} 220 221define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { 222; SI-LABEL: v_cttz_zero_undef_v4i32: 223; SI: ; %bb.0: 224; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 225; SI-NEXT: s_mov_b32 s3, 0xf000 226; SI-NEXT: s_mov_b32 s6, 0 227; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 228; SI-NEXT: v_mov_b32_e32 v1, 0 229; SI-NEXT: s_mov_b32 s7, s3 230; SI-NEXT: s_waitcnt lgkmcnt(0) 231; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 232; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 233; SI-NEXT: s_mov_b32 s2, -1 234; SI-NEXT: s_waitcnt vmcnt(0) 235; SI-NEXT: v_ffbl_b32_e32 v3, v3 236; SI-NEXT: v_ffbl_b32_e32 v2, v2 237; SI-NEXT: v_ffbl_b32_e32 v1, v1 238; SI-NEXT: v_ffbl_b32_e32 v0, v0 239; SI-NEXT: s_waitcnt lgkmcnt(0) 240; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 241; SI-NEXT: s_endpgm 242; 243; VI-LABEL: v_cttz_zero_undef_v4i32: 244; VI: ; %bb.0: 245; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 246; VI-NEXT: v_lshlrev_b32_e32 
v0, 4, v0 247; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 248; VI-NEXT: s_waitcnt lgkmcnt(0) 249; VI-NEXT: v_mov_b32_e32 v1, s3 250; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 251; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 252; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 253; VI-NEXT: v_mov_b32_e32 v5, s1 254; VI-NEXT: v_mov_b32_e32 v4, s0 255; VI-NEXT: s_waitcnt vmcnt(0) 256; VI-NEXT: v_ffbl_b32_e32 v3, v3 257; VI-NEXT: v_ffbl_b32_e32 v2, v2 258; VI-NEXT: v_ffbl_b32_e32 v1, v1 259; VI-NEXT: v_ffbl_b32_e32 v0, v0 260; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 261; VI-NEXT: s_endpgm 262; 263; EG-LABEL: v_cttz_zero_undef_v4i32: 264; EG: ; %bb.0: 265; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 266; EG-NEXT: TEX 0 @6 267; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 268; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 269; EG-NEXT: CF_END 270; EG-NEXT: PAD 271; EG-NEXT: Fetch clause starting at 6: 272; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 273; EG-NEXT: ALU clause starting at 8: 274; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 275; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 276; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 277; EG-NEXT: ALU clause starting at 11: 278; EG-NEXT: FFBL_INT * T0.W, T0.W, 279; EG-NEXT: FFBL_INT * T0.Z, T0.Z, 280; EG-NEXT: FFBL_INT * T0.Y, T0.Y, 281; EG-NEXT: FFBL_INT T0.X, T0.X, 282; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 283; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 284; 285; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: 286; GFX9-GISEL: ; %bb.0: 287; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 288; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 289; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 290; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 291; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 292; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 293; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 294; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 295; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 296; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 297; GFX9-GISEL-NEXT: 
v_ffbl_b32_e32 v3, v3 298; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 299; GFX9-GISEL-NEXT: s_endpgm 300 %tid = call i32 @llvm.amdgcn.workitem.id.x() 301 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid 302 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 303 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone 304 store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 305 ret void 306} 307 308define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind { 309; SI-LABEL: s_cttz_zero_undef_i8_with_select: 310; SI: ; %bb.0: 311; SI-NEXT: s_load_dword s2, s[0:1], 0xb 312; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 313; SI-NEXT: s_mov_b32 s3, 0xf000 314; SI-NEXT: s_waitcnt lgkmcnt(0) 315; SI-NEXT: s_ff1_i32_b32 s4, s2 316; SI-NEXT: s_mov_b32 s2, -1 317; SI-NEXT: v_mov_b32_e32 v0, s4 318; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 319; SI-NEXT: s_endpgm 320; 321; VI-LABEL: s_cttz_zero_undef_i8_with_select: 322; VI: ; %bb.0: 323; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 324; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 325; VI-NEXT: s_waitcnt lgkmcnt(0) 326; VI-NEXT: s_ff1_i32_b32 s2, s2 327; VI-NEXT: v_mov_b32_e32 v0, s0 328; VI-NEXT: v_mov_b32_e32 v1, s1 329; VI-NEXT: v_mov_b32_e32 v2, s2 330; VI-NEXT: flat_store_byte v[0:1], v2 331; VI-NEXT: s_endpgm 332; 333; EG-LABEL: s_cttz_zero_undef_i8_with_select: 334; EG: ; %bb.0: 335; EG-NEXT: ALU 0, @8, KC0[], KC1[] 336; EG-NEXT: TEX 0 @6 337; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 338; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 339; EG-NEXT: CF_END 340; EG-NEXT: PAD 341; EG-NEXT: Fetch clause starting at 6: 342; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 343; EG-NEXT: ALU clause starting at 8: 344; EG-NEXT: MOV * T0.X, 0.0, 345; EG-NEXT: ALU clause starting at 9: 346; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, 347; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 348; EG-NEXT: FFBL_INT T0.W, 
PV.W, 349; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 350; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 351; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 352; EG-NEXT: LSHL * T1.W, PS, literal.y, 353; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 354; EG-NEXT: LSHL T0.X, PV.W, PS, 355; EG-NEXT: LSHL * T0.W, literal.x, PS, 356; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 357; EG-NEXT: MOV T0.Y, 0.0, 358; EG-NEXT: MOV * T0.Z, 0.0, 359; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 360; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 361; 362; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: 363; GFX9-GISEL: ; %bb.0: 364; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c 365; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 366; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 367; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 368; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 369; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 370; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] 371; GFX9-GISEL-NEXT: s_endpgm 372 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone 373 %cttz_ret = icmp ne i8 %val, 0 374 %ret = select i1 %cttz_ret, i8 %cttz, i8 32 375 store i8 %cttz, i8 addrspace(1)* %out, align 4 376 ret void 377} 378 379define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind { 380; SI-LABEL: s_cttz_zero_undef_i16_with_select: 381; SI: ; %bb.0: 382; SI-NEXT: s_load_dword s2, s[0:1], 0xb 383; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 384; SI-NEXT: s_mov_b32 s3, 0xf000 385; SI-NEXT: s_waitcnt lgkmcnt(0) 386; SI-NEXT: s_ff1_i32_b32 s4, s2 387; SI-NEXT: s_mov_b32 s2, -1 388; SI-NEXT: v_mov_b32_e32 v0, s4 389; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 390; SI-NEXT: s_endpgm 391; 392; VI-LABEL: s_cttz_zero_undef_i16_with_select: 393; VI: ; %bb.0: 394; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 395; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 396; VI-NEXT: s_waitcnt lgkmcnt(0) 397; VI-NEXT: s_ff1_i32_b32 s2, s2 398; VI-NEXT: v_mov_b32_e32 v0, 
s0 399; VI-NEXT: v_mov_b32_e32 v1, s1 400; VI-NEXT: v_mov_b32_e32 v2, s2 401; VI-NEXT: flat_store_short v[0:1], v2 402; VI-NEXT: s_endpgm 403; 404; EG-LABEL: s_cttz_zero_undef_i16_with_select: 405; EG: ; %bb.0: 406; EG-NEXT: ALU 0, @8, KC0[], KC1[] 407; EG-NEXT: TEX 0 @6 408; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 409; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 410; EG-NEXT: CF_END 411; EG-NEXT: PAD 412; EG-NEXT: Fetch clause starting at 6: 413; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 414; EG-NEXT: ALU clause starting at 8: 415; EG-NEXT: MOV * T0.X, 0.0, 416; EG-NEXT: ALU clause starting at 9: 417; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, 418; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 419; EG-NEXT: FFBL_INT T0.W, PV.W, 420; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 421; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 422; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 423; EG-NEXT: LSHL * T1.W, PS, literal.y, 424; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 425; EG-NEXT: LSHL T0.X, PV.W, PS, 426; EG-NEXT: LSHL * T0.W, literal.x, PS, 427; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 428; EG-NEXT: MOV T0.Y, 0.0, 429; EG-NEXT: MOV * T0.Z, 0.0, 430; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 431; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 432; 433; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: 434; GFX9-GISEL: ; %bb.0: 435; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c 436; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 437; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 438; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 439; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 440; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 441; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] 442; GFX9-GISEL-NEXT: s_endpgm 443 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone 444 %cttz_ret = icmp ne i16 %val, 0 445 %ret = select i1 %cttz_ret, i16 %cttz, i16 32 446 store i16 %cttz, i16 addrspace(1)* %out, align 4 447 ret void 448} 449 450define amdgpu_kernel void 
@s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind { 451; SI-LABEL: s_cttz_zero_undef_i32_with_select: 452; SI: ; %bb.0: 453; SI-NEXT: s_load_dword s2, s[0:1], 0xb 454; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 455; SI-NEXT: s_mov_b32 s3, 0xf000 456; SI-NEXT: s_waitcnt lgkmcnt(0) 457; SI-NEXT: s_ff1_i32_b32 s4, s2 458; SI-NEXT: s_mov_b32 s2, -1 459; SI-NEXT: v_mov_b32_e32 v0, s4 460; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 461; SI-NEXT: s_endpgm 462; 463; VI-LABEL: s_cttz_zero_undef_i32_with_select: 464; VI: ; %bb.0: 465; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 466; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 467; VI-NEXT: s_waitcnt lgkmcnt(0) 468; VI-NEXT: s_ff1_i32_b32 s2, s2 469; VI-NEXT: v_mov_b32_e32 v0, s0 470; VI-NEXT: v_mov_b32_e32 v1, s1 471; VI-NEXT: v_mov_b32_e32 v2, s2 472; VI-NEXT: flat_store_dword v[0:1], v2 473; VI-NEXT: s_endpgm 474; 475; EG-LABEL: s_cttz_zero_undef_i32_with_select: 476; EG: ; %bb.0: 477; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 478; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 479; EG-NEXT: CF_END 480; EG-NEXT: PAD 481; EG-NEXT: ALU clause starting at 4: 482; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 483; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 484; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z, 485; 486; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: 487; GFX9-GISEL: ; %bb.0: 488; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c 489; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 490; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 491; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 492; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 493; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 494; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] 495; GFX9-GISEL-NEXT: s_endpgm 496 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone 497 %cttz_ret = icmp ne i32 %val, 0 498 %ret = select i1 %cttz_ret, i32 %cttz, i32 32 499 store i32 %cttz, i32 addrspace(1)* %out, align 4 500 ret void 501} 502 503define 
amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind { 504; SI-LABEL: s_cttz_zero_undef_i64_with_select: 505; SI: ; %bb.0: 506; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 507; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 508; SI-NEXT: s_mov_b32 s3, 0xf000 509; SI-NEXT: s_mov_b32 s2, -1 510; SI-NEXT: s_waitcnt lgkmcnt(0) 511; SI-NEXT: s_ff1_i32_b32 s5, s5 512; SI-NEXT: s_ff1_i32_b32 s4, s4 513; SI-NEXT: s_add_i32 s5, s5, 32 514; SI-NEXT: s_min_u32 s4, s4, s5 515; SI-NEXT: v_mov_b32_e32 v1, 0 516; SI-NEXT: v_mov_b32_e32 v0, s4 517; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 518; SI-NEXT: s_endpgm 519; 520; VI-LABEL: s_cttz_zero_undef_i64_with_select: 521; VI: ; %bb.0: 522; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 523; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 524; VI-NEXT: v_mov_b32_e32 v1, 0 525; VI-NEXT: s_waitcnt lgkmcnt(0) 526; VI-NEXT: s_ff1_i32_b32 s3, s3 527; VI-NEXT: s_ff1_i32_b32 s2, s2 528; VI-NEXT: s_add_i32 s3, s3, 32 529; VI-NEXT: s_min_u32 s2, s2, s3 530; VI-NEXT: v_mov_b32_e32 v3, s1 531; VI-NEXT: v_mov_b32_e32 v0, s2 532; VI-NEXT: v_mov_b32_e32 v2, s0 533; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 534; VI-NEXT: s_endpgm 535; 536; EG-LABEL: s_cttz_zero_undef_i64_with_select: 537; EG: ; %bb.0: 538; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 539; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 540; EG-NEXT: CF_END 541; EG-NEXT: PAD 542; EG-NEXT: ALU clause starting at 4: 543; EG-NEXT: FFBL_INT * T0.W, KC0[3].X, 544; EG-NEXT: FFBL_INT T1.W, KC0[2].W, 545; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 546; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 547; EG-NEXT: CNDE_INT T0.X, KC0[2].W, PS, PV.W, 548; EG-NEXT: MOV T0.Y, 0.0, 549; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 550; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 551; 552; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: 553; GFX9-GISEL: ; %bb.0: 554; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 555; GFX9-GISEL-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 556; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 557; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 558; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] 559; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 560; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 561; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 562; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 563; GFX9-GISEL-NEXT: s_endpgm 564 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone 565 %cttz_ret = icmp ne i64 %val, 0 566 %ret = select i1 %cttz_ret, i64 %cttz, i64 32 567 store i64 %cttz, i64 addrspace(1)* %out, align 4 568 ret void 569} 570 571define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { 572; SI-LABEL: v_cttz_zero_undef_i8_with_select: 573; SI: ; %bb.0: 574; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 575; SI-NEXT: s_mov_b32 s3, 0xf000 576; SI-NEXT: s_mov_b32 s2, -1 577; SI-NEXT: s_mov_b32 s6, s2 578; SI-NEXT: s_mov_b32 s7, s3 579; SI-NEXT: s_waitcnt lgkmcnt(0) 580; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 581; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 582; SI-NEXT: s_waitcnt vmcnt(0) 583; SI-NEXT: v_ffbl_b32_e32 v1, v0 584; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 585; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc 586; SI-NEXT: s_waitcnt lgkmcnt(0) 587; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 588; SI-NEXT: s_endpgm 589; 590; VI-LABEL: v_cttz_zero_undef_i8_with_select: 591; VI: ; %bb.0: 592; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 593; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 594; VI-NEXT: s_waitcnt lgkmcnt(0) 595; VI-NEXT: v_mov_b32_e32 v0, s2 596; VI-NEXT: v_mov_b32_e32 v1, s3 597; VI-NEXT: flat_load_ubyte v0, v[0:1] 598; VI-NEXT: s_waitcnt vmcnt(0) 599; VI-NEXT: v_ffbl_b32_e32 v1, v0 600; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 601; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc 602; VI-NEXT: v_mov_b32_e32 v0, s0 603; VI-NEXT: v_mov_b32_e32 v1, s1 604; 
VI-NEXT: flat_store_byte v[0:1], v2 605; VI-NEXT: s_endpgm 606; 607; EG-LABEL: v_cttz_zero_undef_i8_with_select: 608; EG: ; %bb.0: 609; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 610; EG-NEXT: TEX 0 @6 611; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 612; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 613; EG-NEXT: CF_END 614; EG-NEXT: PAD 615; EG-NEXT: Fetch clause starting at 6: 616; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 617; EG-NEXT: ALU clause starting at 8: 618; EG-NEXT: MOV * T0.X, KC0[2].Z, 619; EG-NEXT: ALU clause starting at 9: 620; EG-NEXT: FFBL_INT T0.W, T0.X, 621; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 622; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 623; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 624; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 625; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 626; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 627; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 628; EG-NEXT: LSHL T0.X, PV.W, PS, 629; EG-NEXT: LSHL * T0.W, literal.x, PS, 630; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 631; EG-NEXT: MOV T0.Y, 0.0, 632; EG-NEXT: MOV * T0.Z, 0.0, 633; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 634; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 635; 636; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: 637; GFX9-GISEL: ; %bb.0: 638; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 639; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 640; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 641; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 642; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 643; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 644; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 645; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 646; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 647; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc 648; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] 649; GFX9-GISEL-NEXT: s_endpgm 650 %val = load i8, i8 addrspace(1)* %arrayidx, align 1 651 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone 652 %cttz_ret = 
icmp ne i8 %val, 0 653 %ret = select i1 %cttz_ret, i8 %cttz, i8 32 654 store i8 %ret, i8 addrspace(1)* %out, align 4 655 ret void 656} 657 658define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { 659; SI-LABEL: v_cttz_zero_undef_i16_with_select: 660; SI: ; %bb.0: 661; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 662; SI-NEXT: s_mov_b32 s3, 0xf000 663; SI-NEXT: s_mov_b32 s2, -1 664; SI-NEXT: s_mov_b32 s6, s2 665; SI-NEXT: s_mov_b32 s7, s3 666; SI-NEXT: s_waitcnt lgkmcnt(0) 667; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 668; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 669; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 670; SI-NEXT: s_waitcnt vmcnt(1) 671; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 672; SI-NEXT: s_waitcnt vmcnt(0) 673; SI-NEXT: v_or_b32_e32 v0, v0, v1 674; SI-NEXT: v_ffbl_b32_e32 v1, v0 675; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 676; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc 677; SI-NEXT: s_waitcnt lgkmcnt(0) 678; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 679; SI-NEXT: s_endpgm 680; 681; VI-LABEL: v_cttz_zero_undef_i16_with_select: 682; VI: ; %bb.0: 683; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 684; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 685; VI-NEXT: s_waitcnt lgkmcnt(0) 686; VI-NEXT: s_add_u32 s4, s2, 1 687; VI-NEXT: s_addc_u32 s5, s3, 0 688; VI-NEXT: v_mov_b32_e32 v2, s4 689; VI-NEXT: v_mov_b32_e32 v0, s2 690; VI-NEXT: v_mov_b32_e32 v3, s5 691; VI-NEXT: v_mov_b32_e32 v1, s3 692; VI-NEXT: flat_load_ubyte v2, v[2:3] 693; VI-NEXT: flat_load_ubyte v0, v[0:1] 694; VI-NEXT: s_waitcnt vmcnt(1) 695; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2 696; VI-NEXT: s_waitcnt vmcnt(0) 697; VI-NEXT: v_or_b32_e32 v0, v1, v0 698; VI-NEXT: v_ffbl_b32_e32 v1, v0 699; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 700; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc 701; VI-NEXT: v_mov_b32_e32 v0, s0 702; VI-NEXT: v_mov_b32_e32 v1, s1 703; VI-NEXT: flat_store_short v[0:1], v2 
704; VI-NEXT: s_endpgm 705; 706; EG-LABEL: v_cttz_zero_undef_i16_with_select: 707; EG: ; %bb.0: 708; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 709; EG-NEXT: TEX 0 @6 710; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 711; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 712; EG-NEXT: CF_END 713; EG-NEXT: PAD 714; EG-NEXT: Fetch clause starting at 6: 715; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 716; EG-NEXT: ALU clause starting at 8: 717; EG-NEXT: MOV * T0.X, KC0[2].Z, 718; EG-NEXT: ALU clause starting at 9: 719; EG-NEXT: FFBL_INT T0.W, T0.X, 720; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 721; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 722; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 723; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 724; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 725; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 726; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 727; EG-NEXT: LSHL T0.X, PV.W, PS, 728; EG-NEXT: LSHL * T0.W, literal.x, PS, 729; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 730; EG-NEXT: MOV T0.Y, 0.0, 731; EG-NEXT: MOV * T0.Z, 0.0, 732; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 733; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 734; 735; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: 736; GFX9-GISEL: ; %bb.0: 737; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 738; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 739; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 740; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 741; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 742; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 743; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 744; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 745; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 746; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 747; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 748; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc 749; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] 750; GFX9-GISEL-NEXT: s_endpgm 751 %val = load i16, i16 addrspace(1)* %arrayidx, align 1 
752 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone 753 %cttz_ret = icmp ne i16 %val, 0 754 %ret = select i1 %cttz_ret, i16 %cttz, i16 32 755 store i16 %ret, i16 addrspace(1)* %out, align 4 756 ret void 757} 758 759define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { 760; SI-LABEL: v_cttz_zero_undef_i32_with_select: 761; SI: ; %bb.0: 762; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 763; SI-NEXT: s_mov_b32 s3, 0xf000 764; SI-NEXT: s_mov_b32 s2, -1 765; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 766; SI-NEXT: s_mov_b32 s6, s2 767; SI-NEXT: s_mov_b32 s7, s3 768; SI-NEXT: s_waitcnt lgkmcnt(0) 769; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 770; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 771; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 772; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 773; SI-NEXT: s_waitcnt vmcnt(3) 774; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 775; SI-NEXT: s_waitcnt vmcnt(2) 776; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 777; SI-NEXT: s_waitcnt vmcnt(1) 778; SI-NEXT: v_or_b32_e32 v0, v0, v2 779; SI-NEXT: s_waitcnt vmcnt(0) 780; SI-NEXT: v_or_b32_e32 v1, v1, v3 781; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 782; SI-NEXT: v_or_b32_e32 v0, v1, v0 783; SI-NEXT: v_ffbl_b32_e32 v0, v0 784; SI-NEXT: v_min_u32_e32 v0, 32, v0 785; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 786; SI-NEXT: s_endpgm 787; 788; VI-LABEL: v_cttz_zero_undef_i32_with_select: 789; VI: ; %bb.0: 790; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 791; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 792; VI-NEXT: s_waitcnt lgkmcnt(0) 793; VI-NEXT: s_add_u32 s4, s2, 3 794; VI-NEXT: s_addc_u32 s5, s3, 0 795; VI-NEXT: v_mov_b32_e32 v2, s4 796; VI-NEXT: v_mov_b32_e32 v3, s5 797; VI-NEXT: s_add_u32 s4, s2, 2 798; VI-NEXT: v_mov_b32_e32 v0, s2 799; VI-NEXT: s_addc_u32 s5, s3, 0 800; VI-NEXT: v_mov_b32_e32 v1, s3 801; VI-NEXT: s_add_u32 s2, s2, 1 
802; VI-NEXT: s_addc_u32 s3, s3, 0 803; VI-NEXT: v_mov_b32_e32 v4, s4 804; VI-NEXT: v_mov_b32_e32 v7, s3 805; VI-NEXT: v_mov_b32_e32 v5, s5 806; VI-NEXT: v_mov_b32_e32 v6, s2 807; VI-NEXT: flat_load_ubyte v2, v[2:3] 808; VI-NEXT: flat_load_ubyte v3, v[4:5] 809; VI-NEXT: flat_load_ubyte v4, v[6:7] 810; VI-NEXT: flat_load_ubyte v0, v[0:1] 811; VI-NEXT: s_waitcnt vmcnt(3) 812; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 813; VI-NEXT: s_waitcnt vmcnt(2) 814; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 815; VI-NEXT: s_waitcnt vmcnt(1) 816; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 817; VI-NEXT: s_waitcnt vmcnt(0) 818; VI-NEXT: v_or_b32_e32 v0, v2, v0 819; VI-NEXT: v_or_b32_e32 v0, v1, v0 820; VI-NEXT: v_ffbl_b32_e32 v0, v0 821; VI-NEXT: v_min_u32_e32 v2, 32, v0 822; VI-NEXT: v_mov_b32_e32 v0, s0 823; VI-NEXT: v_mov_b32_e32 v1, s1 824; VI-NEXT: flat_store_dword v[0:1], v2 825; VI-NEXT: s_endpgm 826; 827; EG-LABEL: v_cttz_zero_undef_i32_with_select: 828; EG: ; %bb.0: 829; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 830; EG-NEXT: TEX 1 @6 831; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] 832; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 833; EG-NEXT: CF_END 834; EG-NEXT: PAD 835; EG-NEXT: Fetch clause starting at 6: 836; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 837; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 838; EG-NEXT: ALU clause starting at 10: 839; EG-NEXT: MOV * T0.X, KC0[2].Z, 840; EG-NEXT: ALU clause starting at 11: 841; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 842; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 843; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, 844; EG-NEXT: FFBL_INT * T1.W, PV.W, 845; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, 846; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 847; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 848; 849; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: 850; GFX9-GISEL: ; %bb.0: 851; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 852; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 853; 
GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 854; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 855; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 856; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 857; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 858; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 859; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) 860; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 861; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) 862; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 863; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 864; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 865; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 866; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 867; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 868; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc 869; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 870; GFX9-GISEL-NEXT: s_endpgm 871 %val = load i32, i32 addrspace(1)* %arrayidx, align 1 872 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone 873 %cttz_ret = icmp ne i32 %val, 0 874 %ret = select i1 %cttz_ret, i32 %cttz, i32 32 875 store i32 %ret, i32 addrspace(1)* %out, align 4 876 ret void 877} 878; Align-1 i64 load fed to cttz with the zero-is-undef flag set (i1 true), then a select on %val != 0 that supplies 64 for a zero input. 879define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind { 880; SI-LABEL: v_cttz_zero_undef_i64_with_select: 881; SI: ; %bb.0: 882; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 883; SI-NEXT: s_mov_b32 s3, 0xf000 884; SI-NEXT: s_mov_b32 s2, -1 885; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 886; SI-NEXT: s_mov_b32 s6, s2 887; SI-NEXT: s_mov_b32 s7, s3 888; SI-NEXT: s_waitcnt lgkmcnt(0) 889; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 890; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 891; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 892; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 893; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:4 894; SI-NEXT:
buffer_load_ubyte v5, off, s[4:7], 0 offset:5 895; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:6 896; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:7 897; SI-NEXT: s_waitcnt vmcnt(6) 898; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 899; SI-NEXT: s_waitcnt vmcnt(4) 900; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 901; SI-NEXT: s_waitcnt vmcnt(2) 902; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 903; SI-NEXT: s_waitcnt vmcnt(0) 904; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 905; SI-NEXT: v_or_b32_e32 v0, v1, v0 906; SI-NEXT: v_or_b32_e32 v1, v3, v2 907; SI-NEXT: v_or_b32_e32 v2, v5, v4 908; SI-NEXT: v_or_b32_e32 v3, v7, v6 909; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 910; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 911; SI-NEXT: v_or_b32_e32 v0, v1, v0 912; SI-NEXT: v_or_b32_e32 v1, v3, v2 913; SI-NEXT: v_ffbl_b32_e32 v1, v1 914; SI-NEXT: v_ffbl_b32_e32 v0, v0 915; SI-NEXT: v_min_u32_e32 v1, 0xffffffdf, v1 916; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 917; SI-NEXT: v_min3_u32 v0, v0, v1, 64 918; SI-NEXT: v_mov_b32_e32 v1, 0 919; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 920; SI-NEXT: s_endpgm 921; 922; VI-LABEL: v_cttz_zero_undef_i64_with_select: 923; VI: ; %bb.0: 924; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 925; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 926; VI-NEXT: s_waitcnt lgkmcnt(0) 927; VI-NEXT: s_add_u32 s4, s2, 5 928; VI-NEXT: s_addc_u32 s5, s3, 0 929; VI-NEXT: v_mov_b32_e32 v0, s4 930; VI-NEXT: v_mov_b32_e32 v1, s5 931; VI-NEXT: s_add_u32 s4, s2, 4 932; VI-NEXT: s_addc_u32 s5, s3, 0 933; VI-NEXT: v_mov_b32_e32 v2, s4 934; VI-NEXT: v_mov_b32_e32 v3, s5 935; VI-NEXT: s_add_u32 s4, s2, 7 936; VI-NEXT: s_addc_u32 s5, s3, 0 937; VI-NEXT: v_mov_b32_e32 v4, s4 938; VI-NEXT: v_mov_b32_e32 v5, s5 939; VI-NEXT: s_add_u32 s4, s2, 6 940; VI-NEXT: s_addc_u32 s5, s3, 0 941; VI-NEXT: v_mov_b32_e32 v7, s5 942; VI-NEXT: v_mov_b32_e32 v6, s4 943; VI-NEXT: s_add_u32 s4, s2, 3 944; VI-NEXT: s_addc_u32 s5, s3, 0 945; VI-NEXT: v_mov_b32_e32 v9, s5 946; VI-NEXT: v_mov_b32_e32 v8, 
s4 947; VI-NEXT: s_add_u32 s4, s2, 2 948; VI-NEXT: s_addc_u32 s5, s3, 0 949; VI-NEXT: v_mov_b32_e32 v11, s5 950; VI-NEXT: v_mov_b32_e32 v10, s4 951; VI-NEXT: flat_load_ubyte v12, v[0:1] 952; VI-NEXT: flat_load_ubyte v13, v[2:3] 953; VI-NEXT: flat_load_ubyte v4, v[4:5] 954; VI-NEXT: flat_load_ubyte v5, v[6:7] 955; VI-NEXT: s_add_u32 s4, s2, 1 956; VI-NEXT: flat_load_ubyte v6, v[8:9] 957; VI-NEXT: s_addc_u32 s5, s3, 0 958; VI-NEXT: v_mov_b32_e32 v0, s4 959; VI-NEXT: v_mov_b32_e32 v2, s2 960; VI-NEXT: v_mov_b32_e32 v1, s5 961; VI-NEXT: v_mov_b32_e32 v3, s3 962; VI-NEXT: flat_load_ubyte v7, v[10:11] 963; VI-NEXT: flat_load_ubyte v0, v[0:1] 964; VI-NEXT: flat_load_ubyte v2, v[2:3] 965; VI-NEXT: v_mov_b32_e32 v1, 0 966; VI-NEXT: s_waitcnt vmcnt(7) 967; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 968; VI-NEXT: s_waitcnt vmcnt(6) 969; VI-NEXT: v_or_b32_e32 v3, v3, v13 970; VI-NEXT: s_waitcnt vmcnt(5) 971; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 972; VI-NEXT: s_waitcnt vmcnt(4) 973; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 974; VI-NEXT: v_or_b32_e32 v3, v4, v3 975; VI-NEXT: s_waitcnt vmcnt(3) 976; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 977; VI-NEXT: v_ffbl_b32_e32 v3, v3 978; VI-NEXT: v_add_u32_e64 v3, s[2:3], v3, 32 clamp 979; VI-NEXT: s_waitcnt vmcnt(2) 980; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 981; VI-NEXT: s_waitcnt vmcnt(1) 982; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 983; VI-NEXT: s_waitcnt vmcnt(0) 984; VI-NEXT: v_or_b32_e32 v0, v0, v2 985; VI-NEXT: v_or_b32_e32 v0, v4, v0 986; VI-NEXT: v_ffbl_b32_e32 v0, v0 987; VI-NEXT: v_min3_u32 v0, v0, v3, 64 988; VI-NEXT: v_mov_b32_e32 v3, s1 989; VI-NEXT: v_mov_b32_e32 v2, s0 990; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 991; VI-NEXT: s_endpgm 992; 993; EG-LABEL: v_cttz_zero_undef_i64_with_select: 994; EG: ; %bb.0: 995; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 996; EG-NEXT: TEX 3 @6 997; EG-NEXT: ALU 15, @15, 
KC0[CB0:0-32], KC1[] 998; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 999; EG-NEXT: CF_END 1000; EG-NEXT: PAD 1001; EG-NEXT: Fetch clause starting at 6: 1002; EG-NEXT: VTX_READ_16 T1.X, T0.X, 6, #1 1003; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 1004; EG-NEXT: VTX_READ_16 T3.X, T0.X, 2, #1 1005; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 1006; EG-NEXT: ALU clause starting at 14: 1007; EG-NEXT: MOV * T0.X, KC0[2].Z, 1008; EG-NEXT: ALU clause starting at 15: 1009; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1010; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1011; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, 1012; EG-NEXT: FFBL_INT T1.W, PV.W, 1013; EG-NEXT: LSHL * T2.W, T3.X, literal.x, 1014; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1015; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W, 1016; EG-NEXT: OR_INT * T1.W, PS, T2.X, 1017; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1018; EG-NEXT: FFBL_INT T2.W, PS, 1019; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 1020; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1021; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W, 1022; EG-NEXT: MOV T0.Y, 0.0, 1023; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1024; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1025; 1026; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: 1027; GFX9-GISEL: ; %bb.0: 1028; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1029; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1030; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1031; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1032; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] 1033; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 1034; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 1035; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 1036; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 1037; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 1038; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:7 1039; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6 1040; 
GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) 1041; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 1042; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) 1043; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1044; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) 1045; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 1046; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0 1047; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) 1048; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5 1049; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) 1050; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v7 1051; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1052; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v8 1053; GFX9-GISEL-NEXT: v_or3_b32 v3, v5, v6, v4 1054; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3 1055; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2 1056; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4 1057; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] 1058; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 1059; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc 1060; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 1061; GFX9-GISEL-NEXT: s_endpgm 1062 %val = load i64, i64 addrspace(1)* %arrayidx, align 1 1063 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone 1064 %cttz_ret = icmp ne i64 %val, 0 1065 %ret = select i1 %cttz_ret, i64 %cttz, i64 64 1066 store i64 %ret, i64 addrspace(1)* %out, align 4 1067 ret void 1068} 1069; cttz with the zero-is-undef flag clear (i1 false) on an align-1 i32 load, then a select that replaces the result with -1 when %val == 0. 1070define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { 1071; SI-LABEL: v_cttz_i32_sel_eq_neg1: 1072; SI: ; %bb.0: 1073; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1074; SI-NEXT: s_mov_b32 s3, 0xf000 1075; SI-NEXT: s_mov_b32 s2, -1 1076; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1077; SI-NEXT: s_mov_b32 s6, s2 1078; SI-NEXT: s_mov_b32 s7, s3 1079; SI-NEXT: s_waitcnt lgkmcnt(0) 1080; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 1081; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 1082; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0
1083; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 1084; SI-NEXT: s_waitcnt vmcnt(3) 1085; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1086; SI-NEXT: s_waitcnt vmcnt(2) 1087; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1088; SI-NEXT: s_waitcnt vmcnt(1) 1089; SI-NEXT: v_or_b32_e32 v0, v0, v2 1090; SI-NEXT: s_waitcnt vmcnt(0) 1091; SI-NEXT: v_or_b32_e32 v1, v1, v3 1092; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1093; SI-NEXT: v_or_b32_e32 v0, v1, v0 1094; SI-NEXT: v_ffbl_b32_e32 v0, v0 1095; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1096; SI-NEXT: s_endpgm 1097; 1098; VI-LABEL: v_cttz_i32_sel_eq_neg1: 1099; VI: ; %bb.0: 1100; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1101; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1102; VI-NEXT: s_waitcnt lgkmcnt(0) 1103; VI-NEXT: s_add_u32 s4, s2, 3 1104; VI-NEXT: s_addc_u32 s5, s3, 0 1105; VI-NEXT: v_mov_b32_e32 v2, s4 1106; VI-NEXT: v_mov_b32_e32 v3, s5 1107; VI-NEXT: s_add_u32 s4, s2, 2 1108; VI-NEXT: v_mov_b32_e32 v0, s2 1109; VI-NEXT: s_addc_u32 s5, s3, 0 1110; VI-NEXT: v_mov_b32_e32 v1, s3 1111; VI-NEXT: s_add_u32 s2, s2, 1 1112; VI-NEXT: s_addc_u32 s3, s3, 0 1113; VI-NEXT: v_mov_b32_e32 v4, s4 1114; VI-NEXT: v_mov_b32_e32 v7, s3 1115; VI-NEXT: v_mov_b32_e32 v5, s5 1116; VI-NEXT: v_mov_b32_e32 v6, s2 1117; VI-NEXT: flat_load_ubyte v2, v[2:3] 1118; VI-NEXT: flat_load_ubyte v3, v[4:5] 1119; VI-NEXT: flat_load_ubyte v4, v[6:7] 1120; VI-NEXT: flat_load_ubyte v0, v[0:1] 1121; VI-NEXT: s_waitcnt vmcnt(3) 1122; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 1123; VI-NEXT: s_waitcnt vmcnt(2) 1124; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1125; VI-NEXT: s_waitcnt vmcnt(1) 1126; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 1127; VI-NEXT: s_waitcnt vmcnt(0) 1128; VI-NEXT: v_or_b32_e32 v0, v2, v0 1129; VI-NEXT: v_or_b32_e32 v0, v1, v0 1130; VI-NEXT: v_ffbl_b32_e32 v2, v0 1131; VI-NEXT: v_mov_b32_e32 v0, s0 1132; VI-NEXT: v_mov_b32_e32 v1, s1 1133; VI-NEXT: flat_store_dword v[0:1], v2 1134; 
VI-NEXT: s_endpgm 1135; 1136; EG-LABEL: v_cttz_i32_sel_eq_neg1: 1137; EG: ; %bb.0: 1138; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1139; EG-NEXT: TEX 1 @6 1140; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[] 1141; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1142; EG-NEXT: CF_END 1143; EG-NEXT: PAD 1144; EG-NEXT: Fetch clause starting at 6: 1145; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 1146; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1147; EG-NEXT: ALU clause starting at 10: 1148; EG-NEXT: MOV * T0.X, KC0[2].Z, 1149; EG-NEXT: ALU clause starting at 11: 1150; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1151; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1152; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, 1153; EG-NEXT: FFBL_INT * T1.W, PV.W, 1154; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W, 1155; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1156; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, 1157; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1158; EG-NEXT: -1(nan), 2(2.802597e-45) 1159; 1160; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: 1161; GFX9-GISEL: ; %bb.0: 1162; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1163; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 1164; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1165; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1166; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 1167; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 1168; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 1169; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 1170; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) 1171; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 1172; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) 1173; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 1174; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1175; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 1176; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 1177; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 1178; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 1179; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1180; 
GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc 1181; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 1182; GFX9-GISEL-NEXT: s_endpgm 1183 %val = load i32, i32 addrspace(1)* %arrayidx, align 1 1184 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 1185 %cmp = icmp eq i32 %val, 0 1186 %sel = select i1 %cmp, i32 -1, i32 %cttz 1187 store i32 %sel, i32 addrspace(1)* %out 1188 ret void 1189} 1190; Same pattern as v_cttz_i32_sel_eq_neg1 with the compare inverted (icmp ne) and the select operands swapped. 1191define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { 1192; SI-LABEL: v_cttz_i32_sel_ne_neg1: 1193; SI: ; %bb.0: 1194; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1195; SI-NEXT: s_mov_b32 s3, 0xf000 1196; SI-NEXT: s_mov_b32 s2, -1 1197; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1198; SI-NEXT: s_mov_b32 s6, s2 1199; SI-NEXT: s_mov_b32 s7, s3 1200; SI-NEXT: s_waitcnt lgkmcnt(0) 1201; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 1202; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 1203; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 1204; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 1205; SI-NEXT: s_waitcnt vmcnt(3) 1206; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1207; SI-NEXT: s_waitcnt vmcnt(2) 1208; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1209; SI-NEXT: s_waitcnt vmcnt(1) 1210; SI-NEXT: v_or_b32_e32 v0, v0, v2 1211; SI-NEXT: s_waitcnt vmcnt(0) 1212; SI-NEXT: v_or_b32_e32 v1, v1, v3 1213; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1214; SI-NEXT: v_or_b32_e32 v0, v1, v0 1215; SI-NEXT: v_ffbl_b32_e32 v0, v0 1216; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1217; SI-NEXT: s_endpgm 1218; 1219; VI-LABEL: v_cttz_i32_sel_ne_neg1: 1220; VI: ; %bb.0: 1221; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1222; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1223; VI-NEXT: s_waitcnt lgkmcnt(0) 1224; VI-NEXT: s_add_u32 s4, s2, 3 1225; VI-NEXT: s_addc_u32 s5, s3, 0 1226; VI-NEXT: v_mov_b32_e32 v2, s4 1227; VI-NEXT: v_mov_b32_e32 v3, s5 1228; VI-NEXT: s_add_u32 s4,
s2, 2 1229; VI-NEXT: v_mov_b32_e32 v0, s2 1230; VI-NEXT: s_addc_u32 s5, s3, 0 1231; VI-NEXT: v_mov_b32_e32 v1, s3 1232; VI-NEXT: s_add_u32 s2, s2, 1 1233; VI-NEXT: s_addc_u32 s3, s3, 0 1234; VI-NEXT: v_mov_b32_e32 v4, s4 1235; VI-NEXT: v_mov_b32_e32 v7, s3 1236; VI-NEXT: v_mov_b32_e32 v5, s5 1237; VI-NEXT: v_mov_b32_e32 v6, s2 1238; VI-NEXT: flat_load_ubyte v2, v[2:3] 1239; VI-NEXT: flat_load_ubyte v3, v[4:5] 1240; VI-NEXT: flat_load_ubyte v4, v[6:7] 1241; VI-NEXT: flat_load_ubyte v0, v[0:1] 1242; VI-NEXT: s_waitcnt vmcnt(3) 1243; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 1244; VI-NEXT: s_waitcnt vmcnt(2) 1245; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1246; VI-NEXT: s_waitcnt vmcnt(1) 1247; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 1248; VI-NEXT: s_waitcnt vmcnt(0) 1249; VI-NEXT: v_or_b32_e32 v0, v2, v0 1250; VI-NEXT: v_or_b32_e32 v0, v1, v0 1251; VI-NEXT: v_ffbl_b32_e32 v2, v0 1252; VI-NEXT: v_mov_b32_e32 v0, s0 1253; VI-NEXT: v_mov_b32_e32 v1, s1 1254; VI-NEXT: flat_store_dword v[0:1], v2 1255; VI-NEXT: s_endpgm 1256; 1257; EG-LABEL: v_cttz_i32_sel_ne_neg1: 1258; EG: ; %bb.0: 1259; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1260; EG-NEXT: TEX 1 @6 1261; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[] 1262; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1263; EG-NEXT: CF_END 1264; EG-NEXT: PAD 1265; EG-NEXT: Fetch clause starting at 6: 1266; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 1267; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1268; EG-NEXT: ALU clause starting at 10: 1269; EG-NEXT: MOV * T0.X, KC0[2].Z, 1270; EG-NEXT: ALU clause starting at 11: 1271; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1272; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1273; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, 1274; EG-NEXT: FFBL_INT * T1.W, PV.W, 1275; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W, 1276; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1277; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, 1278; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 
1279; EG-NEXT: -1(nan), 2(2.802597e-45) 1280; 1281; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: 1282; GFX9-GISEL: ; %bb.0: 1283; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1284; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 1285; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1286; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 1288; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 1289; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 1290; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 1291; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) 1292; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 1293; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) 1294; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 1295; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1296; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 1297; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 1298; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 1299; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 1300; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 1301; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc 1302; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 1303; GFX9-GISEL-NEXT: s_endpgm 1304 %val = load i32, i32 addrspace(1)* %arrayidx, align 1 1305 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 1306 %cmp = icmp ne i32 %val, 0 1307 %sel = select i1 %cmp, i32 %cttz, i32 -1 1308 store i32 %sel, i32 addrspace(1)* %out 1309 ret void 1310} 1311; Compares the cttz result itself (not %val) against 32 and selects -1 when the two are equal. 1312define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { 1313; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: 1314; SI: ; %bb.0: 1315; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1316; SI-NEXT: s_mov_b32 s3, 0xf000 1317; SI-NEXT: s_mov_b32 s2, -1 1318; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1319; SI-NEXT: s_mov_b32 s6, s2 1320; SI-NEXT: s_mov_b32 s7, s3 1321; SI-NEXT: s_waitcnt lgkmcnt(0) 1322; SI-NEXT: buffer_load_ubyte v0, off,
s[4:7], 0 offset:1 1323; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 1324; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 1325; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 1326; SI-NEXT: s_waitcnt vmcnt(3) 1327; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1328; SI-NEXT: s_waitcnt vmcnt(2) 1329; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1330; SI-NEXT: s_waitcnt vmcnt(1) 1331; SI-NEXT: v_or_b32_e32 v0, v0, v2 1332; SI-NEXT: s_waitcnt vmcnt(0) 1333; SI-NEXT: v_or_b32_e32 v1, v1, v3 1334; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1335; SI-NEXT: v_or_b32_e32 v0, v1, v0 1336; SI-NEXT: v_ffbl_b32_e32 v0, v0 1337; SI-NEXT: v_min_u32_e32 v0, 32, v0 1338; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1339; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1340; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1341; SI-NEXT: s_endpgm 1342; 1343; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: 1344; VI: ; %bb.0: 1345; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1346; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1347; VI-NEXT: s_waitcnt lgkmcnt(0) 1348; VI-NEXT: s_add_u32 s4, s2, 3 1349; VI-NEXT: s_addc_u32 s5, s3, 0 1350; VI-NEXT: v_mov_b32_e32 v2, s4 1351; VI-NEXT: v_mov_b32_e32 v3, s5 1352; VI-NEXT: s_add_u32 s4, s2, 2 1353; VI-NEXT: v_mov_b32_e32 v0, s2 1354; VI-NEXT: s_addc_u32 s5, s3, 0 1355; VI-NEXT: v_mov_b32_e32 v1, s3 1356; VI-NEXT: s_add_u32 s2, s2, 1 1357; VI-NEXT: s_addc_u32 s3, s3, 0 1358; VI-NEXT: v_mov_b32_e32 v4, s4 1359; VI-NEXT: v_mov_b32_e32 v7, s3 1360; VI-NEXT: v_mov_b32_e32 v5, s5 1361; VI-NEXT: v_mov_b32_e32 v6, s2 1362; VI-NEXT: flat_load_ubyte v2, v[2:3] 1363; VI-NEXT: flat_load_ubyte v3, v[4:5] 1364; VI-NEXT: flat_load_ubyte v4, v[6:7] 1365; VI-NEXT: flat_load_ubyte v0, v[0:1] 1366; VI-NEXT: s_waitcnt vmcnt(3) 1367; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 1368; VI-NEXT: s_waitcnt vmcnt(2) 1369; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1370; VI-NEXT: s_waitcnt vmcnt(1) 1371; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 1372; 
VI-NEXT: s_waitcnt vmcnt(0) 1373; VI-NEXT: v_or_b32_e32 v0, v2, v0 1374; VI-NEXT: v_or_b32_e32 v0, v1, v0 1375; VI-NEXT: v_ffbl_b32_e32 v0, v0 1376; VI-NEXT: v_min_u32_e32 v0, 32, v0 1377; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1378; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc 1379; VI-NEXT: v_mov_b32_e32 v0, s0 1380; VI-NEXT: v_mov_b32_e32 v1, s1 1381; VI-NEXT: flat_store_dword v[0:1], v2 1382; VI-NEXT: s_endpgm 1383; 1384; EG-LABEL: v_cttz_i32_sel_ne_bitwidth: 1385; EG: ; %bb.0: 1386; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1387; EG-NEXT: TEX 1 @6 1388; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 1389; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1390; EG-NEXT: CF_END 1391; EG-NEXT: PAD 1392; EG-NEXT: Fetch clause starting at 6: 1393; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 1394; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1395; EG-NEXT: ALU clause starting at 10: 1396; EG-NEXT: MOV * T0.X, KC0[2].Z, 1397; EG-NEXT: ALU clause starting at 11: 1398; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1399; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1400; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, 1401; EG-NEXT: FFBL_INT * T1.W, PV.W, 1402; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, 1403; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1404; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x, 1405; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1406; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W, 1407; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1408; EG-NEXT: -1(nan), 2(2.802597e-45) 1409; 1410; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: 1411; GFX9-GISEL: ; %bb.0: 1412; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1413; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 1414; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1415; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 1417; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 1418; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 1419; GFX9-GISEL-NEXT: global_load_ubyte v4, 
v0, s[2:3] offset:2 1420; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) 1421; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 1422; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) 1423; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 1424; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1425; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 1426; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 1427; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 1428; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1429; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1 1430; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc 1431; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 1432; GFX9-GISEL-NEXT: s_endpgm 1433 %val = load i32, i32 addrspace(1)* %arrayidx, align 1 1434 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 1435 %cmp = icmp ne i32 %cttz, 32 1436 %sel = select i1 %cmp, i32 %cttz, i32 -1 1437 store i32 %sel, i32 addrspace(1)* %out 1438 ret void 1439} 1440; i8 form of the eq/-1 pattern, using cttz.i8 with the zero-is-undef flag clear (i1 false) and a select that yields -1 when the loaded i8 is 0. 1441define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { 1442; SI-LABEL: v_cttz_i8_sel_eq_neg1: 1443; SI: ; %bb.0: 1444; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1445; SI-NEXT: s_mov_b32 s3, 0xf000 1446; SI-NEXT: s_mov_b32 s2, -1 1447; SI-NEXT: s_mov_b32 s6, s2 1448; SI-NEXT: s_mov_b32 s7, s3 1449; SI-NEXT: s_waitcnt lgkmcnt(0) 1450; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 1451; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1452; SI-NEXT: s_waitcnt vmcnt(0) 1453; SI-NEXT: v_ffbl_b32_e32 v0, v0 1454; SI-NEXT: s_waitcnt lgkmcnt(0) 1455; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1456; SI-NEXT: s_endpgm 1457; 1458; VI-LABEL: v_cttz_i8_sel_eq_neg1: 1459; VI: ; %bb.0: 1460; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1461; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1462; VI-NEXT: s_waitcnt lgkmcnt(0) 1463; VI-NEXT: v_mov_b32_e32 v0, s2 1464; VI-NEXT: v_mov_b32_e32 v1, s3 1465; VI-NEXT: flat_load_ubyte v0, v[0:1] 1466; VI-NEXT: v_mov_b32_e32 v1, 0xff 1467; VI-NEXT:
s_waitcnt vmcnt(0) 1468; VI-NEXT: v_or_b32_e32 v2, 0x100, v0 1469; VI-NEXT: v_ffbl_b32_e32 v2, v2 1470; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 1471; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc 1472; VI-NEXT: v_mov_b32_e32 v0, s0 1473; VI-NEXT: v_mov_b32_e32 v1, s1 1474; VI-NEXT: flat_store_byte v[0:1], v2 1475; VI-NEXT: s_endpgm 1476; 1477; EG-LABEL: v_cttz_i8_sel_eq_neg1: 1478; EG: ; %bb.0: 1479; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1480; EG-NEXT: TEX 0 @6 1481; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1482; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1483; EG-NEXT: CF_END 1484; EG-NEXT: PAD 1485; EG-NEXT: Fetch clause starting at 6: 1486; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1487; EG-NEXT: ALU clause starting at 8: 1488; EG-NEXT: MOV * T0.X, KC0[2].Z, 1489; EG-NEXT: ALU clause starting at 9: 1490; EG-NEXT: FFBL_INT T0.W, T0.X, 1491; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1492; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1493; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1494; EG-NEXT: LSHL * T1.W, PS, literal.y, 1495; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1496; EG-NEXT: LSHL T0.X, PV.W, PS, 1497; EG-NEXT: LSHL * T0.W, literal.x, PS, 1498; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1499; EG-NEXT: MOV T0.Y, 0.0, 1500; EG-NEXT: MOV * T0.Z, 0.0, 1501; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1502; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1503; 1504; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: 1505; GFX9-GISEL: ; %bb.0: 1506; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1507; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 1508; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1509; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff 1510; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1511; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 1512; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1513; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 1514; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 1515; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 1516; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1517; 
GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 1518; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] 1519; GFX9-GISEL-NEXT: s_endpgm 1520 %val = load i8, i8 addrspace(1)* %arrayidx, align 1 1521 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone 1522 %cmp = icmp eq i8 %val, 0 1523 %sel = select i1 %cmp, i8 -1, i8 %cttz 1524 store i8 %sel, i8 addrspace(1)* %out 1525 ret void 1526} 1527; i16 form of the eq/-1 pattern, using cttz.i16 with the zero-is-undef flag clear (i1 false) on an align-1 load and a select that yields -1 when the loaded i16 is 0. 1528define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { 1529; SI-LABEL: v_cttz_i16_sel_eq_neg1: 1530; SI: ; %bb.0: 1531; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1532; SI-NEXT: s_mov_b32 s3, 0xf000 1533; SI-NEXT: s_mov_b32 s2, -1 1534; SI-NEXT: s_mov_b32 s6, s2 1535; SI-NEXT: s_mov_b32 s7, s3 1536; SI-NEXT: s_waitcnt lgkmcnt(0) 1537; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 1538; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 1539; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1540; SI-NEXT: s_waitcnt vmcnt(1) 1541; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1542; SI-NEXT: s_waitcnt vmcnt(0) 1543; SI-NEXT: v_or_b32_e32 v0, v0, v1 1544; SI-NEXT: v_ffbl_b32_e32 v0, v0 1545; SI-NEXT: s_waitcnt lgkmcnt(0) 1546; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1547; SI-NEXT: s_endpgm 1548; 1549; VI-LABEL: v_cttz_i16_sel_eq_neg1: 1550; VI: ; %bb.0: 1551; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1552; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1553; VI-NEXT: s_waitcnt lgkmcnt(0) 1554; VI-NEXT: s_add_u32 s4, s2, 1 1555; VI-NEXT: s_addc_u32 s5, s3, 0 1556; VI-NEXT: v_mov_b32_e32 v2, s4 1557; VI-NEXT: v_mov_b32_e32 v0, s2 1558; VI-NEXT: v_mov_b32_e32 v3, s5 1559; VI-NEXT: v_mov_b32_e32 v1, s3 1560; VI-NEXT: flat_load_ubyte v2, v[2:3] 1561; VI-NEXT: flat_load_ubyte v0, v[0:1] 1562; VI-NEXT: v_mov_b32_e32 v1, 0xffff 1563; VI-NEXT: s_waitcnt vmcnt(1) 1564; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 1565; VI-NEXT: s_waitcnt vmcnt(0) 1566; VI-NEXT: v_or_b32_e32 v0, v2, v0 1567;
VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 1568; VI-NEXT: v_ffbl_b32_e32 v2, v2 1569; VI-NEXT: v_min_u32_e32 v2, 32, v2 1570; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 1571; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc 1572; VI-NEXT: v_mov_b32_e32 v0, s0 1573; VI-NEXT: v_mov_b32_e32 v1, s1 1574; VI-NEXT: flat_store_short v[0:1], v2 1575; VI-NEXT: s_endpgm 1576; 1577; EG-LABEL: v_cttz_i16_sel_eq_neg1: 1578; EG: ; %bb.0: 1579; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1580; EG-NEXT: TEX 0 @6 1581; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1582; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1583; EG-NEXT: CF_END 1584; EG-NEXT: PAD 1585; EG-NEXT: Fetch clause starting at 6: 1586; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1587; EG-NEXT: ALU clause starting at 8: 1588; EG-NEXT: MOV * T0.X, KC0[2].Z, 1589; EG-NEXT: ALU clause starting at 9: 1590; EG-NEXT: FFBL_INT T0.W, T0.X, 1591; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1592; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1593; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1594; EG-NEXT: LSHL * T1.W, PS, literal.y, 1595; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1596; EG-NEXT: LSHL T0.X, PV.W, PS, 1597; EG-NEXT: LSHL * T0.W, literal.x, PS, 1598; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1599; EG-NEXT: MOV T0.Y, 0.0, 1600; EG-NEXT: MOV * T0.Z, 0.0, 1601; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1602; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1603; 1604; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: 1605; GFX9-GISEL: ; %bb.0: 1606; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1607; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 1608; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1609; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff 1610; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1611; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 1612; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 1613; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1614; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 1615; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 1616; 
GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 1617; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 1618; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1619; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 1620; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] 1621; GFX9-GISEL-NEXT: s_endpgm 1622 %val = load i16, i16 addrspace(1)* %arrayidx, align 1 1623 %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone ; i1 false keeps a zero input well defined 1624 %cmp = icmp eq i16 %val, 0 1625 %sel = select i1 %cmp, i16 -1, i16 %cttz 1626 store i16 %sel, i16 addrspace(1)* %out 1627 ret void 1628} 1629 1630 1631