; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL

declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone

declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone

declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; cttz of an i32 kernel argument, called with the i1 flag set to false.
; The expected code pairs the find-first-set-bit instruction with a min
; (or a CNDE_INT select on the r600 target) against the constant 32.
define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_cttz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ff1_i32_b32 s2, s2
; SI-NEXT:    s_min_u32 s4, s2, 32
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cttz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ff1_i32_b32 s4, s4
; VI-NEXT:    s_min_u32 s4, s4, 32
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_cttz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBL_INT * T0.W, KC0[2].Z,
; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: s_cttz_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ff1_i32_b32 s0, s4
; GFX10-NEXT:    s_min_u32 s0, s0, 32
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_cttz_i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_ff1_i32_b32 s0, s4
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 32
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_endpgm
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; cttz of an i32 loaded from %valptr at the index given by the workitem id;
; the result is stored to %out without indexing.
define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T0.X,
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> variant of v_cttz_i32: the expected code applies the same
; find-first-set-bit + min-with-32 sequence to each of the two elements.
define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v1, v1
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v1, 32, v1
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v1, 32, v1
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T0.Y,
; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:     FFBL_INT * T0.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_v2i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
  store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> variant of v_cttz_i32: the same per-element sequence is expected
; for all four elements.
define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v3, v3
; SI-NEXT:    v_ffbl_b32_e32 v2, v2
; SI-NEXT:    v_ffbl_b32_e32 v1, v1
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v3, 32, v3
; SI-NEXT:    v_min_u32_e32 v2, 32, v2
; SI-NEXT:    v_min_u32_e32 v1, 32, v1
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v3, v3
; VI-NEXT:    v_ffbl_b32_e32 v2, v2
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v3, 32, v3
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_min_u32_e32 v1, 32, v1
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T1.W, T0.W,
; EG-NEXT:     FFBL_INT T2.W, T0.Z,
; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
; EG-NEXT:     FFBL_INT * T1.W, T0.X,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_v4i32:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
  store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; cttz of an i8 loaded directly from %valptr (no workitem indexing).
; The expected code ORs 0x100 (256 on r600) into the zero-extended byte before
; the 32-bit find-first-set-bit, and emits no min/select against the bit width.
define amdgpu_kernel void @v_cttz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, 0x100, v0
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, 0x100, v0
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i8:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     OR_INT * T0.W, T0.X, literal.x,
; EG-NEXT:    256(3.587324e-43), 0(0.000000e+00)
; EG-NEXT:     FFBL_INT T0.W, PV.W,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i8:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %val = load i8, i8 addrspace(1)* %valptr
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  store i8 %cttz, i8 addrspace(1)* %out
  ret void
}

; cttz of an i64 kernel argument that follows an unnamed [8 x i32] argument.
; The SelectionDAG run lines expect two 32-bit find-first-set-bit operations
; combined with a min against 64; the GlobalISel run line expects a single
; s_ff1_i32_b64.
define amdgpu_kernel void @s_cttz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_cttz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ff1_i32_b32 s5, s5
; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
; SI-NEXT:    s_add_i32 s5, s5, 32
; SI-NEXT:    s_ff1_i32_b32 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cttz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ff1_i32_b32 s5, s5
; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s5, 32 clamp
; VI-NEXT:    s_ff1_i32_b32 s4, s4
; VI-NEXT:    v_min3_u32 v0, s4, v0, 64
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_cttz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBL_INT * T0.W, KC0[5].X,
; EG-NEXT:     CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBL_INT T1.W, KC0[4].W,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, KC0[4].W, PS, PV.W,
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: s_cttz_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_cttz_i64:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
; GFX10-GISEL-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-GISEL-NEXT:    s_endpgm
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  store i64 %cttz, i64 addrspace(1)* %out
  ret void
}

; Same as s_cttz_i64 without the padding argument, with the i64 result
; truncated to i32 before the store.
define amdgpu_kernel void @s_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; SI-LABEL: s_cttz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ff1_i32_b32 s5, s5
; SI-NEXT:    s_min_u32 s5, s5, 0xffffffdf
; SI-NEXT:    s_add_i32 s5, s5, 32
; SI-NEXT:    s_ff1_i32_b32 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_min3_u32 v0, s4, v0, 64
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_cttz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ff1_i32_b32 s5, s5
; VI-NEXT:    v_add_u32_e64 v0, s[6:7], s5, 32 clamp
; VI-NEXT:    s_ff1_i32_b32 s4, s4
; VI-NEXT:    v_min3_u32 v0, s4, v0, 64
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_cttz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     FFBL_INT * T0.W, KC0[3].X,
; EG-NEXT:     CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBL_INT T1.W, KC0[2].W,
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, KC0[2].W, PS, PV.W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: s_cttz_i64_trunc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ff1_i32_b32 s0, s3
; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, 32 clamp
; GFX10-NEXT:    s_ff1_i32_b32 s0, s2
; GFX10-NEXT:    v_min3_u32 v0, s0, v0, 64
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_clause 0x1
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT:    s_endpgm
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %cttz to i32
  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

; cttz of an i64 loaded from %in at the workitem-id index, with the result
; stored to %out at the same index.
define amdgpu_kernel void @v_cttz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_cttz_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v3, v3
; SI-NEXT:    v_min_u32_e32 v3, 0xffffffdf, v3
; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
; SI-NEXT:    v_ffbl_b32_e32 v2, v2
; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_add_u32_e64 v1, s[0:1], v1, 32 clamp
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
; EG-NEXT:     CNDE_INT * T1.W, T0.Y, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     FFBL_INT T2.W, T0.X,
; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, T0.X, PS, PV.W,
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i64:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  store i64 %cttz, i64 addrspace(1)* %out.gep
  ret void
}

; Same as v_cttz_i64, with the i64 result truncated to i32 and stored through
; an i32 pointer indexed by the workitem id.
define amdgpu_kernel void @v_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_cttz_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v4
; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT:    v_ffbl_b32_e32 v3, v3
; SI-NEXT:    v_min3_u32 v0, v3, v0, 64
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v2
; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
; VI-NEXT:    v_ffbl_b32_e32 v1, v1
; VI-NEXT:    v_min3_u32 v0, v1, v0, 64
; VI-NEXT:    flat_store_dword v[3:4], v0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     FFBL_INT * T0.W, T1.Y,
; EG-NEXT:     CNDE_INT * T0.W, T1.Y, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
; EG-NEXT:     FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212
; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T0.X, T1.X, PS, PV.W,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i64_trunc:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %val = load i64, i64 addrspace(1)* %in.gep
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %cttz to i32
  store i32 %trunc, i32 addrspace(1)* %out.gep
  ret void
}

898define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 899; SI-LABEL: v_cttz_i32_sel_eq_neg1: 900; SI: ; %bb.0: 901; SI-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0xb 902; SI-NEXT: s_mov_b32 s3, 0xf000 903; SI-NEXT: s_mov_b32 s6, 0 904; SI-NEXT: s_mov_b32 s7, s3 905; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 906; SI-NEXT: v_mov_b32_e32 v1, 0 907; SI-NEXT: s_waitcnt lgkmcnt(0) 908; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 909; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 910; SI-NEXT: s_mov_b32 s2, -1 911; SI-NEXT: s_waitcnt vmcnt(0) 912; SI-NEXT: v_ffbl_b32_e32 v0, v0 913; SI-NEXT: s_waitcnt lgkmcnt(0) 914; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 915; SI-NEXT: s_endpgm 916; 917; VI-LABEL: v_cttz_i32_sel_eq_neg1: 918; VI: ; %bb.0: 919; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 920; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 921; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 922; VI-NEXT: s_waitcnt lgkmcnt(0) 923; VI-NEXT: v_mov_b32_e32 v1, s3 924; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 925; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 926; VI-NEXT: flat_load_dword v0, v[0:1] 927; VI-NEXT: s_mov_b32 s3, 0xf000 928; VI-NEXT: s_mov_b32 s2, -1 929; VI-NEXT: s_waitcnt vmcnt(0) 930; VI-NEXT: v_ffbl_b32_e32 v0, v0 931; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 932; VI-NEXT: s_endpgm 933; 934; EG-LABEL: v_cttz_i32_sel_eq_neg1: 935; EG: ; %bb.0: 936; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 937; EG-NEXT: TEX 0 @6 938; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 939; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 940; EG-NEXT: CF_END 941; EG-NEXT: PAD 942; EG-NEXT: Fetch clause starting at 6: 943; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 944; EG-NEXT: ALU clause starting at 8: 945; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 946; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 947; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 948; EG-NEXT: ALU clause starting at 11: 949; EG-NEXT: FFBL_INT * T0.W, T0.X, 950; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 951; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 952; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 953; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 954; EG-NEXT: 
-1(nan), 2(2.802597e-45) 955; 956; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: 957; GFX10: ; %bb.0: 958; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 959; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 960; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 961; GFX10-NEXT: v_mov_b32_e32 v1, 0 962; GFX10-NEXT: s_waitcnt lgkmcnt(0) 963; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 964; GFX10-NEXT: s_waitcnt vmcnt(0) 965; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 966; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 967; GFX10-NEXT: s_endpgm 968; 969; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: 970; GFX10-GISEL: ; %bb.0: 971; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 972; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 973; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 974; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 975; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 976; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 977; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 978; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 979; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 980; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo 981; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 982; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 983; GFX10-GISEL-NEXT: s_endpgm 984 %tid = call i32 @llvm.amdgcn.workitem.id.x() 985 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 986 %val = load i32, i32 addrspace(1)* %in.gep 987 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 988 %cmp = icmp eq i32 %val, 0 989 %sel = select i1 %cmp, i32 -1, i32 %cttz 990 store i32 %sel, i32 addrspace(1)* %out 991 ret void 992} 993 994define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 995; SI-LABEL: v_cttz_i32_sel_ne_neg1: 996; SI: ; %bb.0: 997; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 998; SI-NEXT: s_mov_b32 s3, 0xf000 999; SI-NEXT: s_mov_b32 s6, 0 1000; SI-NEXT: s_mov_b32 s7, s3 1001; SI-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 1002; SI-NEXT: v_mov_b32_e32 v1, 0 1003; SI-NEXT: s_waitcnt lgkmcnt(0) 1004; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1005; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1006; SI-NEXT: s_mov_b32 s2, -1 1007; SI-NEXT: s_waitcnt vmcnt(0) 1008; SI-NEXT: v_ffbl_b32_e32 v0, v0 1009; SI-NEXT: s_waitcnt lgkmcnt(0) 1010; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1011; SI-NEXT: s_endpgm 1012; 1013; VI-LABEL: v_cttz_i32_sel_ne_neg1: 1014; VI: ; %bb.0: 1015; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1016; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1017; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1018; VI-NEXT: s_waitcnt lgkmcnt(0) 1019; VI-NEXT: v_mov_b32_e32 v1, s3 1020; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1021; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1022; VI-NEXT: flat_load_dword v0, v[0:1] 1023; VI-NEXT: s_mov_b32 s3, 0xf000 1024; VI-NEXT: s_mov_b32 s2, -1 1025; VI-NEXT: s_waitcnt vmcnt(0) 1026; VI-NEXT: v_ffbl_b32_e32 v0, v0 1027; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1028; VI-NEXT: s_endpgm 1029; 1030; EG-LABEL: v_cttz_i32_sel_ne_neg1: 1031; EG: ; %bb.0: 1032; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1033; EG-NEXT: TEX 0 @6 1034; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 1035; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1036; EG-NEXT: CF_END 1037; EG-NEXT: PAD 1038; EG-NEXT: Fetch clause starting at 6: 1039; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1040; EG-NEXT: ALU clause starting at 8: 1041; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1042; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1043; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1044; EG-NEXT: ALU clause starting at 11: 1045; EG-NEXT: FFBL_INT * T0.W, T0.X, 1046; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1047; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1048; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1049; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1050; EG-NEXT: -1(nan), 2(2.802597e-45) 1051; 1052; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: 1053; GFX10: ; %bb.0: 1054; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1055; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1056; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1057; GFX10-NEXT: v_mov_b32_e32 v1, 0 1058; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1059; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1060; GFX10-NEXT: s_waitcnt vmcnt(0) 1061; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 1062; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1063; GFX10-NEXT: s_endpgm 1064; 1065; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: 1066; GFX10-GISEL: ; %bb.0: 1067; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1068; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1069; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1070; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1072; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1073; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 1074; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 1075; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1076; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo 1077; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1078; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1079; GFX10-GISEL-NEXT: s_endpgm 1080 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1081 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1082 %val = load i32, i32 addrspace(1)* %in.gep 1083 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 1084 %cmp = icmp ne i32 %val, 0 1085 %sel = select i1 %cmp, i32 %cttz, i32 -1 1086 store i32 %sel, i32 addrspace(1)* %out 1087 ret void 1088} 1089 1090; TODO: Should be able to eliminate select here as well. 
1091define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1092; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: 1093; SI: ; %bb.0: 1094; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1095; SI-NEXT: s_mov_b32 s3, 0xf000 1096; SI-NEXT: s_mov_b32 s6, 0 1097; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1098; SI-NEXT: v_mov_b32_e32 v1, 0 1099; SI-NEXT: s_mov_b32 s7, s3 1100; SI-NEXT: s_waitcnt lgkmcnt(0) 1101; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1102; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1103; SI-NEXT: s_mov_b32 s2, -1 1104; SI-NEXT: s_waitcnt vmcnt(0) 1105; SI-NEXT: v_ffbl_b32_e32 v0, v0 1106; SI-NEXT: v_min_u32_e32 v0, 32, v0 1107; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1108; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1109; SI-NEXT: s_waitcnt lgkmcnt(0) 1110; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1111; SI-NEXT: s_endpgm 1112; 1113; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: 1114; VI: ; %bb.0: 1115; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1116; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1117; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1118; VI-NEXT: s_waitcnt lgkmcnt(0) 1119; VI-NEXT: v_mov_b32_e32 v1, s3 1120; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1121; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1122; VI-NEXT: flat_load_dword v0, v[0:1] 1123; VI-NEXT: s_mov_b32 s3, 0xf000 1124; VI-NEXT: s_mov_b32 s2, -1 1125; VI-NEXT: s_waitcnt vmcnt(0) 1126; VI-NEXT: v_ffbl_b32_e32 v0, v0 1127; VI-NEXT: v_min_u32_e32 v0, 32, v0 1128; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1129; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1130; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1131; VI-NEXT: s_endpgm 1132; 1133; EG-LABEL: v_cttz_i32_sel_eq_bitwidth: 1134; EG: ; %bb.0: 1135; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1136; EG-NEXT: TEX 0 @6 1137; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] 1138; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1139; EG-NEXT: CF_END 1140; EG-NEXT: PAD 1141; EG-NEXT: 
Fetch clause starting at 6: 1142; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1143; EG-NEXT: ALU clause starting at 8: 1144; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1145; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1146; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1147; EG-NEXT: ALU clause starting at 11: 1148; EG-NEXT: FFBL_INT * T0.W, T0.X, 1149; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1150; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1151; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x, 1152; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1153; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x, 1154; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1155; EG-NEXT: -1(nan), 2(2.802597e-45) 1156; 1157; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: 1158; GFX10: ; %bb.0: 1159; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1160; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1161; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1162; GFX10-NEXT: v_mov_b32_e32 v1, 0 1163; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1164; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1165; GFX10-NEXT: s_waitcnt vmcnt(0) 1166; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 1167; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 1168; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1169; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1170; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1171; GFX10-NEXT: s_endpgm 1172; 1173; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: 1174; GFX10-GISEL: ; %bb.0: 1175; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1176; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1177; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1178; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1179; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1180; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1181; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1182; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 1183; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 1184; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 1185; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo 
1186; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1187; GFX10-GISEL-NEXT: s_endpgm 1188 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1189 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1190 %val = load i32, i32 addrspace(1)* %in.gep 1191 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 1192 %cmp = icmp eq i32 %cttz, 32 1193 %sel = select i1 %cmp, i32 -1, i32 %cttz 1194 store i32 %sel, i32 addrspace(1)* %out 1195 ret void 1196} 1197 1198define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { 1199; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: 1200; SI: ; %bb.0: 1201; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1202; SI-NEXT: s_mov_b32 s3, 0xf000 1203; SI-NEXT: s_mov_b32 s6, 0 1204; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1205; SI-NEXT: v_mov_b32_e32 v1, 0 1206; SI-NEXT: s_mov_b32 s7, s3 1207; SI-NEXT: s_waitcnt lgkmcnt(0) 1208; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1209; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1210; SI-NEXT: s_mov_b32 s2, -1 1211; SI-NEXT: s_waitcnt vmcnt(0) 1212; SI-NEXT: v_ffbl_b32_e32 v0, v0 1213; SI-NEXT: v_min_u32_e32 v0, 32, v0 1214; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1215; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1216; SI-NEXT: s_waitcnt lgkmcnt(0) 1217; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1218; SI-NEXT: s_endpgm 1219; 1220; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: 1221; VI: ; %bb.0: 1222; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1223; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1224; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1225; VI-NEXT: s_waitcnt lgkmcnt(0) 1226; VI-NEXT: v_mov_b32_e32 v1, s3 1227; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1228; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1229; VI-NEXT: flat_load_dword v0, v[0:1] 1230; VI-NEXT: s_mov_b32 s3, 0xf000 1231; VI-NEXT: s_mov_b32 s2, -1 1232; VI-NEXT: s_waitcnt vmcnt(0) 1233; VI-NEXT: v_ffbl_b32_e32 v0, v0 1234; VI-NEXT: v_min_u32_e32 
v0, 32, v0 1235; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1236; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1237; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1238; VI-NEXT: s_endpgm 1239; 1240; EG-LABEL: v_cttz_i32_sel_ne_bitwidth: 1241; EG: ; %bb.0: 1242; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1243; EG-NEXT: TEX 0 @6 1244; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] 1245; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1246; EG-NEXT: CF_END 1247; EG-NEXT: PAD 1248; EG-NEXT: Fetch clause starting at 6: 1249; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1250; EG-NEXT: ALU clause starting at 8: 1251; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1252; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1253; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1254; EG-NEXT: ALU clause starting at 11: 1255; EG-NEXT: FFBL_INT * T0.W, T0.X, 1256; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1257; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1258; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x, 1259; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1260; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W, 1261; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1262; EG-NEXT: -1(nan), 2(2.802597e-45) 1263; 1264; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: 1265; GFX10: ; %bb.0: 1266; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1267; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1268; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1269; GFX10-NEXT: v_mov_b32_e32 v1, 0 1270; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1272; GFX10-NEXT: s_waitcnt vmcnt(0) 1273; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 1274; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 1275; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1276; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1277; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1278; GFX10-NEXT: s_endpgm 1279; 1280; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: 1281; GFX10-GISEL: ; %bb.0: 1282; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1283; GFX10-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 1284; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1285; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1286; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1288; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1289; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 1290; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 1291; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1292; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1293; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1294; GFX10-GISEL-NEXT: s_endpgm 1295 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1296 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid 1297 %val = load i32, i32 addrspace(1)* %in.gep 1298 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 1299 %cmp = icmp ne i32 %cttz, 32 1300 %sel = select i1 %cmp, i32 %cttz, i32 -1 1301 store i32 %sel, i32 addrspace(1)* %out 1302 ret void 1303} 1304 1305 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { 1306; SI-LABEL: v_cttz_i8_sel_eq_neg1: 1307; SI: ; %bb.0: 1308; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1309; SI-NEXT: s_mov_b32 s3, 0xf000 1310; SI-NEXT: v_mov_b32_e32 v1, 0 1311; SI-NEXT: s_mov_b32 s6, 0 1312; SI-NEXT: s_mov_b32 s7, s3 1313; SI-NEXT: s_waitcnt lgkmcnt(0) 1314; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1315; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1316; SI-NEXT: s_mov_b32 s2, -1 1317; SI-NEXT: s_waitcnt vmcnt(0) 1318; SI-NEXT: v_ffbl_b32_e32 v0, v0 1319; SI-NEXT: s_waitcnt lgkmcnt(0) 1320; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1321; SI-NEXT: s_endpgm 1322; 1323; VI-LABEL: v_cttz_i8_sel_eq_neg1: 1324; VI: ; %bb.0: 1325; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1326; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1327; VI-NEXT: s_waitcnt lgkmcnt(0) 1328; VI-NEXT: v_mov_b32_e32 v1, s3 1329; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1330; 
VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1331; VI-NEXT: flat_load_ubyte v0, v[0:1] 1332; VI-NEXT: s_mov_b32 s3, 0xf000 1333; VI-NEXT: s_mov_b32 s2, -1 1334; VI-NEXT: s_waitcnt vmcnt(0) 1335; VI-NEXT: v_ffbl_b32_e32 v0, v0 1336; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1337; VI-NEXT: s_endpgm 1338; 1339; EG-LABEL: v_cttz_i8_sel_eq_neg1: 1340; EG: ; %bb.0: 1341; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1342; EG-NEXT: TEX 0 @6 1343; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1344; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1345; EG-NEXT: CF_END 1346; EG-NEXT: PAD 1347; EG-NEXT: Fetch clause starting at 6: 1348; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1349; EG-NEXT: ALU clause starting at 8: 1350; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1351; EG-NEXT: ALU clause starting at 9: 1352; EG-NEXT: FFBL_INT T0.W, T0.X, 1353; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1354; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1355; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1356; EG-NEXT: LSHL * T1.W, PS, literal.y, 1357; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1358; EG-NEXT: LSHL T0.X, PV.W, PS, 1359; EG-NEXT: LSHL * T0.W, literal.x, PS, 1360; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1361; EG-NEXT: MOV T0.Y, 0.0, 1362; EG-NEXT: MOV * T0.Z, 0.0, 1363; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1364; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1365; 1366; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: 1367; GFX10: ; %bb.0: 1368; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1369; GFX10-NEXT: v_mov_b32_e32 v1, 0 1370; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1371; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 1373; GFX10-NEXT: s_waitcnt vmcnt(0) 1374; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 1375; GFX10-NEXT: global_store_byte v1, v0, s[0:1] 1376; GFX10-NEXT: s_endpgm 1377; 1378; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: 1379; GFX10-GISEL: ; %bb.0: 1380; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1381; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, 
v0 1382; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1383; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1384; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 1385; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 1386; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 1387; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo 1388; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 1389; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off 1390; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1391; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0 1392; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD 1393; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 1394; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2 1395; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] 1396; GFX10-GISEL-NEXT: s_endpgm 1397 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1398 %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid 1399 %val = load i8, i8 addrspace(1)* %valptr.gep 1400 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone 1401 %cmp = icmp eq i8 %val, 0 1402 %sel = select i1 %cmp, i8 -1, i8 %cttz 1403 store i8 %sel, i8 addrspace(1)* %out 1404 ret void 1405} 1406 1407 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { 1408; SI-LABEL: v_cttz_i16_sel_eq_neg1: 1409; SI: ; %bb.0: 1410; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1411; SI-NEXT: s_mov_b32 s3, 0xf000 1412; SI-NEXT: s_mov_b32 s2, -1 1413; SI-NEXT: s_mov_b32 s6, s2 1414; SI-NEXT: s_mov_b32 s7, s3 1415; SI-NEXT: s_waitcnt lgkmcnt(0) 1416; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 1417; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1418; SI-NEXT: s_waitcnt vmcnt(0) 1419; SI-NEXT: v_ffbl_b32_e32 v0, v0 1420; SI-NEXT: s_waitcnt lgkmcnt(0) 1421; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1422; SI-NEXT: s_endpgm 1423; 1424; VI-LABEL: v_cttz_i16_sel_eq_neg1: 1425; VI: ; %bb.0: 1426; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 
1427; VI-NEXT: s_mov_b32 s3, 0xf000 1428; VI-NEXT: s_mov_b32 s2, -1 1429; VI-NEXT: s_mov_b32 s6, s2 1430; VI-NEXT: s_mov_b32 s7, s3 1431; VI-NEXT: s_waitcnt lgkmcnt(0) 1432; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 1433; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1434; VI-NEXT: v_mov_b32_e32 v1, 0xffff 1435; VI-NEXT: s_waitcnt vmcnt(0) 1436; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 1437; VI-NEXT: v_ffbl_b32_e32 v2, v2 1438; VI-NEXT: v_min_u32_e32 v2, 32, v2 1439; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 1440; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 1441; VI-NEXT: s_waitcnt lgkmcnt(0) 1442; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1443; VI-NEXT: s_endpgm 1444; 1445; EG-LABEL: v_cttz_i16_sel_eq_neg1: 1446; EG: ; %bb.0: 1447; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1448; EG-NEXT: TEX 0 @6 1449; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1450; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1451; EG-NEXT: CF_END 1452; EG-NEXT: PAD 1453; EG-NEXT: Fetch clause starting at 6: 1454; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1455; EG-NEXT: ALU clause starting at 8: 1456; EG-NEXT: MOV * T0.X, KC0[2].Z, 1457; EG-NEXT: ALU clause starting at 9: 1458; EG-NEXT: FFBL_INT T0.W, T0.X, 1459; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1460; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1461; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1462; EG-NEXT: LSHL * T1.W, PS, literal.y, 1463; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1464; EG-NEXT: LSHL T0.X, PV.W, PS, 1465; EG-NEXT: LSHL * T0.W, literal.x, PS, 1466; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1467; EG-NEXT: MOV T0.Y, 0.0, 1468; EG-NEXT: MOV * T0.Z, 0.0, 1469; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1470; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1471; 1472; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: 1473; GFX10: ; %bb.0: 1474; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1475; GFX10-NEXT: v_mov_b32_e32 v0, 0 1476; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1477; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1478; GFX10-NEXT: 
global_load_ushort v1, v0, s[2:3] 1479; GFX10-NEXT: s_waitcnt vmcnt(0) 1480; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 1481; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 1482; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 1483; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 1484; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo 1485; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1486; GFX10-NEXT: s_endpgm 1487; 1488; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: 1489; GFX10-GISEL: ; %bb.0: 1490; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1491; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 1492; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1493; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1494; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] 1495; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1496; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 1497; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1498; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 1499; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 1500; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo 1501; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] 1502; GFX10-GISEL-NEXT: s_endpgm 1503 %val = load i16, i16 addrspace(1)* %valptr 1504 %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone 1505 %cmp = icmp eq i16 %val, 0 1506 %sel = select i1 %cmp, i16 -1, i16 %cttz 1507 store i16 %sel, i16 addrspace(1)* %out 1508 ret void 1509} 1510 1511; FIXME: Need to handle non-uniform case. NOTE(review): "load without gep" matches @v_cttz_i16_sel_eq_neg1 above (it loads directly from %valptr); the function below already indexes %valptr by the workitem id, so confirm what is still outstanding.
; Kernel under test: each workitem loads one i7 from %valptr indexed by llvm.amdgcn.workitem.id.x, calls llvm.cttz.i7 with its i1 argument set to false, and stores -1 in place of the cttz result when the loaded value is 0. The store goes to %out directly (not indexed by the workitem id).
; The per-target expected-output lines in this function come from utils/update_llc_test_checks.py (see the note on the first line of the file); regenerate them with that script rather than editing them by hand.
1512define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { 1513; SI-LABEL: v_cttz_i7_sel_eq_neg1: 1514; SI: ; %bb.0: 1515; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1516; SI-NEXT: s_mov_b32 s3, 0xf000 1517; SI-NEXT: v_mov_b32_e32 v1, 0 1518; SI-NEXT: s_mov_b32 s6, 0 1519; SI-NEXT: s_mov_b32 s7, s3 1520; SI-NEXT: s_waitcnt lgkmcnt(0) 1521; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1522; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1523; SI-NEXT: s_mov_b32 s2, -1 1524; SI-NEXT: s_waitcnt vmcnt(0) 1525; SI-NEXT: v_ffbl_b32_e32 v0, v0 1526; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 1527; SI-NEXT: s_waitcnt lgkmcnt(0) 1528; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1529; SI-NEXT: s_endpgm 1530; 1531; VI-LABEL: v_cttz_i7_sel_eq_neg1: 1532; VI: ; %bb.0: 1533; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1534; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1535; VI-NEXT: s_waitcnt lgkmcnt(0) 1536; VI-NEXT: v_mov_b32_e32 v1, s3 1537; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1538; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1539; VI-NEXT: flat_load_ubyte v0, v[0:1] 1540; VI-NEXT: s_mov_b32 s3, 0xf000 1541; VI-NEXT: s_mov_b32 s2, -1 1542; VI-NEXT: s_waitcnt vmcnt(0) 1543; VI-NEXT: v_ffbl_b32_e32 v0, v0 1544; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 1545; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1546; VI-NEXT: s_endpgm 1547; 1548; EG-LABEL: v_cttz_i7_sel_eq_neg1: 1549; EG: ; %bb.0: 1550; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1551; EG-NEXT: TEX 0 @6 1552; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1553; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1554; EG-NEXT: CF_END 1555; EG-NEXT: PAD 1556; EG-NEXT: Fetch clause starting at 6: 1557; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1558; EG-NEXT: ALU clause starting at 8: 1559; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1560; EG-NEXT: ALU clause starting at 9: 1561; EG-NEXT: FFBL_INT T0.W, T0.X, 1562; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1563; EG-NEXT: 
3(4.203895e-45), 0(0.000000e+00) 1564; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1565; EG-NEXT: LSHL * T1.W, PS, literal.y, 1566; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45) 1567; EG-NEXT: LSHL T0.X, PV.W, PS, 1568; EG-NEXT: LSHL * T0.W, literal.x, PS, 1569; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1570; EG-NEXT: MOV T0.Y, 0.0, 1571; EG-NEXT: MOV * T0.Z, 0.0, 1572; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1573; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1574; 1575; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: 1576; GFX10: ; %bb.0: 1577; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1578; GFX10-NEXT: v_mov_b32_e32 v1, 0 1579; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1580; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1581; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 1582; GFX10-NEXT: s_waitcnt vmcnt(0) 1583; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 1584; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 1585; GFX10-NEXT: global_store_byte v1, v0, s[0:1] 1586; GFX10-NEXT: s_endpgm 1587; 1588; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: 1589; GFX10-GISEL: ; %bb.0: 1590; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1591; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 1592; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1593; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1594; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 1595; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 1596; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 1597; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo 1598; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off 1599; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1600; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0 1601; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 1602; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 1603; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1604; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo 1605; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1606; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 1607; GFX10-GISEL-NEXT: global_store_byte v1, v0, 
s[0:1] 1608; GFX10-GISEL-NEXT: s_endpgm 1609 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1610 %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid 1611 %val = load i7, i7 addrspace(1)* %valptr.gep 1612 %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone 1613 %cmp = icmp eq i7 %val, 0 1614 %sel = select i1 %cmp, i7 -1, i7 %cttz 1615 store i7 %sel, i7 addrspace(1)* %out 1616 ret void 1617} 1618