1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s 4; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s 5 6declare i16 @llvm.ctpop.i16(i16) nounwind readnone 7declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone 8declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone 9declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone 10declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone 11 12declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 13 14define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val) nounwind { 15; SI-LABEL: s_ctpop_i16: 16; SI: ; %bb.0: 17; SI-NEXT: s_load_dword s4, s[0:1], 0xb 18; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 19; SI-NEXT: s_mov_b32 s3, 0xf000 20; SI-NEXT: s_mov_b32 s2, -1 21; SI-NEXT: s_waitcnt lgkmcnt(0) 22; SI-NEXT: s_and_b32 s4, s4, 0xffff 23; SI-NEXT: s_bcnt1_i32_b32 s4, s4 24; SI-NEXT: v_mov_b32_e32 v0, s4 25; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 26; SI-NEXT: s_endpgm 27; 28; VI-LABEL: s_ctpop_i16: 29; VI: ; %bb.0: 30; VI-NEXT: s_load_dword s4, s[0:1], 0x2c 31; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 32; VI-NEXT: s_mov_b32 s3, 0xf000 33; VI-NEXT: s_mov_b32 s2, -1 34; VI-NEXT: s_waitcnt lgkmcnt(0) 35; VI-NEXT: s_and_b32 s4, s4, 0xffff 36; VI-NEXT: s_bcnt1_i32_b32 s4, s4 37; VI-NEXT: v_mov_b32_e32 v0, s4 38; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 39; VI-NEXT: s_endpgm 40; 41; EG-LABEL: s_ctpop_i16: 42; EG: ; %bb.0: 43; EG-NEXT: ALU 0, @8, KC0[], KC1[] 44; EG-NEXT: TEX 0 @6 45; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 46; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 47; EG-NEXT: CF_END 48; EG-NEXT: PAD 
49; EG-NEXT: Fetch clause starting at 6: 50; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 51; EG-NEXT: ALU clause starting at 8: 52; EG-NEXT: MOV * T0.X, 0.0, 53; EG-NEXT: ALU clause starting at 9: 54; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 55; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 56; EG-NEXT: BCNT_INT T1.W, T0.X, 57; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 58; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 59; EG-NEXT: LSHL T0.X, PV.W, PS, 60; EG-NEXT: LSHL * T0.W, literal.x, PS, 61; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 62; EG-NEXT: MOV T0.Y, 0.0, 63; EG-NEXT: MOV * T0.Z, 0.0, 64; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 65; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 66 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 67 store i16 %ctpop, i16 addrspace(1)* %out, align 4 68 ret void 69} 70 71; XXX - Why 0 in register? 72define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { 73; SI-LABEL: v_ctpop_i16: 74; SI: ; %bb.0: 75; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 76; SI-NEXT: s_mov_b32 s3, 0xf000 77; SI-NEXT: s_mov_b32 s6, 0 78; SI-NEXT: s_mov_b32 s7, s3 79; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 80; SI-NEXT: v_mov_b32_e32 v1, 0 81; SI-NEXT: s_waitcnt lgkmcnt(0) 82; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 83; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 84; SI-NEXT: s_mov_b32 s2, -1 85; SI-NEXT: s_waitcnt vmcnt(0) 86; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 87; SI-NEXT: s_waitcnt lgkmcnt(0) 88; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 89; SI-NEXT: s_endpgm 90; 91; VI-LABEL: v_ctpop_i16: 92; VI: ; %bb.0: 93; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 94; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 95; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 96; VI-NEXT: s_waitcnt lgkmcnt(0) 97; VI-NEXT: v_mov_b32_e32 v1, s3 98; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 99; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 100; VI-NEXT: flat_load_ushort v0, v[0:1] 101; VI-NEXT: s_mov_b32 s3, 
0xf000 102; VI-NEXT: s_mov_b32 s2, -1 103; VI-NEXT: s_waitcnt vmcnt(0) 104; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 105; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 106; VI-NEXT: s_endpgm 107; 108; EG-LABEL: v_ctpop_i16: 109; EG: ; %bb.0: 110; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 111; EG-NEXT: TEX 0 @6 112; EG-NEXT: ALU 11, @10, KC0[CB0:0-32], KC1[] 113; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 114; EG-NEXT: CF_END 115; EG-NEXT: PAD 116; EG-NEXT: Fetch clause starting at 6: 117; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 118; EG-NEXT: ALU clause starting at 8: 119; EG-NEXT: LSHL * T0.W, T0.X, 1, 120; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 121; EG-NEXT: ALU clause starting at 10: 122; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 123; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 124; EG-NEXT: BCNT_INT T1.W, T0.X, 125; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 126; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 127; EG-NEXT: LSHL T0.X, PV.W, PS, 128; EG-NEXT: LSHL * T0.W, literal.x, PS, 129; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 130; EG-NEXT: MOV T0.Y, 0.0, 131; EG-NEXT: MOV * T0.Z, 0.0, 132; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 133; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 134 %tid = call i32 @llvm.amdgcn.workitem.id.x() 135 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 136 %val = load i16, i16 addrspace(1)* %in.gep, align 4 137 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 138 store i16 %ctpop, i16 addrspace(1)* %out, align 4 139 ret void 140} 141 142define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind { 143; SI-LABEL: v_ctpop_add_chain_i16: 144; SI: ; %bb.0: 145; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 146; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 147; SI-NEXT: s_mov_b32 s3, 0xf000 148; SI-NEXT: s_mov_b32 s6, 0 149; SI-NEXT: s_mov_b32 s7, s3 150; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 151; SI-NEXT: v_mov_b32_e32 v1, 0 152; SI-NEXT: 
s_mov_b64 s[10:11], s[6:7] 153; SI-NEXT: s_waitcnt lgkmcnt(0) 154; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 155; SI-NEXT: s_waitcnt vmcnt(0) 156; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 glc 157; SI-NEXT: s_waitcnt vmcnt(0) 158; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 159; SI-NEXT: s_mov_b32 s2, -1 160; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 161; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 162; SI-NEXT: s_waitcnt lgkmcnt(0) 163; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 164; SI-NEXT: s_endpgm 165; 166; VI-LABEL: v_ctpop_add_chain_i16: 167; VI: ; %bb.0: 168; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 169; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 170; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 171; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 172; VI-NEXT: s_waitcnt lgkmcnt(0) 173; VI-NEXT: v_mov_b32_e32 v1, s3 174; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 175; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 176; VI-NEXT: v_mov_b32_e32 v3, s5 177; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 178; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 179; VI-NEXT: flat_load_ushort v0, v[0:1] glc 180; VI-NEXT: s_waitcnt vmcnt(0) 181; VI-NEXT: flat_load_ushort v1, v[2:3] glc 182; VI-NEXT: s_waitcnt vmcnt(0) 183; VI-NEXT: s_mov_b32 s3, 0xf000 184; VI-NEXT: s_mov_b32 s2, -1 185; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 186; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1 187; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 188; VI-NEXT: s_endpgm 189; 190; EG-LABEL: v_ctpop_add_chain_i16: 191; EG: ; %bb.0: 192; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 193; EG-NEXT: TEX 0 @8 194; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 195; EG-NEXT: TEX 0 @10 196; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[] 197; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 198; EG-NEXT: CF_END 199; EG-NEXT: PAD 200; EG-NEXT: Fetch clause starting at 8: 201; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 202; EG-NEXT: Fetch clause starting at 10: 203; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 204; EG-NEXT: ALU clause starting at 
12: 205; EG-NEXT: LSHL * T0.W, T0.X, 1, 206; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 207; EG-NEXT: ALU clause starting at 14: 208; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W, 209; EG-NEXT: ALU clause starting at 15: 210; EG-NEXT: AND_INT T0.W, T0.X, literal.x, 211; EG-NEXT: AND_INT * T1.W, T1.X, literal.x, 212; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 213; EG-NEXT: BCNT_INT T0.Z, PS, 214; EG-NEXT: BCNT_INT T0.W, PV.W, 215; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 216; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 217; EG-NEXT: ADD_INT T0.W, PV.W, PV.Z, 218; EG-NEXT: LSHL * T1.W, PS, literal.x, 219; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 220; EG-NEXT: LSHL T0.X, PV.W, PS, 221; EG-NEXT: LSHL * T0.W, literal.x, PS, 222; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 223; EG-NEXT: MOV T0.Y, 0.0, 224; EG-NEXT: MOV * T0.Z, 0.0, 225; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 226; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 227 %tid = call i32 @llvm.amdgcn.workitem.id.x() 228 %in0.gep = getelementptr i16, i16 addrspace(1)* %in0, i32 %tid 229 %in1.gep = getelementptr i16, i16 addrspace(1)* %in1, i32 %tid 230 %val0 = load volatile i16, i16 addrspace(1)* %in0.gep, align 4 231 %val1 = load volatile i16, i16 addrspace(1)* %in1.gep, align 4 232 %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone 233 %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone 234 %add = add i16 %ctpop0, %ctpop1 235 store i16 %add, i16 addrspace(1)* %out, align 4 236 ret void 237} 238 239define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind { 240; SI-LABEL: v_ctpop_add_sgpr_i16: 241; SI: ; %bb.0: 242; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 243; SI-NEXT: s_load_dword s8, s[0:1], 0xd 244; SI-NEXT: s_mov_b32 s3, 0xf000 245; SI-NEXT: s_mov_b32 s6, 0 246; SI-NEXT: s_mov_b32 s7, s3 247; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 248; SI-NEXT: v_mov_b32_e32 v1, 0 249; SI-NEXT: s_waitcnt lgkmcnt(0) 250; 
SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 251; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 252; SI-NEXT: s_mov_b32 s2, -1 253; SI-NEXT: s_waitcnt vmcnt(0) 254; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 255; SI-NEXT: s_waitcnt lgkmcnt(0) 256; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 257; SI-NEXT: s_endpgm 258; 259; VI-LABEL: v_ctpop_add_sgpr_i16: 260; VI: ; %bb.0: 261; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 262; VI-NEXT: s_load_dword s4, s[0:1], 0x34 263; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 264; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 265; VI-NEXT: s_waitcnt lgkmcnt(0) 266; VI-NEXT: v_mov_b32_e32 v1, s3 267; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 268; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 269; VI-NEXT: flat_load_ushort v0, v[0:1] 270; VI-NEXT: s_mov_b32 s3, 0xf000 271; VI-NEXT: s_mov_b32 s2, -1 272; VI-NEXT: s_waitcnt vmcnt(0) 273; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 274; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 275; VI-NEXT: s_endpgm 276; 277; EG-LABEL: v_ctpop_add_sgpr_i16: 278; EG: ; %bb.0: 279; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 280; EG-NEXT: TEX 0 @8 281; EG-NEXT: ALU 0, @14, KC0[], KC1[] 282; EG-NEXT: TEX 0 @10 283; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 284; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 285; EG-NEXT: CF_END 286; EG-NEXT: PAD 287; EG-NEXT: Fetch clause starting at 8: 288; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 289; EG-NEXT: Fetch clause starting at 10: 290; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 291; EG-NEXT: ALU clause starting at 12: 292; EG-NEXT: LSHL * T0.W, T0.X, 1, 293; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 294; EG-NEXT: ALU clause starting at 14: 295; EG-NEXT: MOV * T1.X, 0.0, 296; EG-NEXT: ALU clause starting at 15: 297; EG-NEXT: BCNT_INT T0.W, T0.X, 298; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 299; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 300; EG-NEXT: ADD_INT * T0.W, PV.W, T1.X, 301; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 302; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 303; EG-NEXT: 
65535(9.183409e-41), 3(4.203895e-45) 304; EG-NEXT: LSHL T0.X, PV.W, PS, 305; EG-NEXT: LSHL * T0.W, literal.x, PS, 306; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 307; EG-NEXT: MOV T0.Y, 0.0, 308; EG-NEXT: MOV * T0.Z, 0.0, 309; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 310; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 311 %tid = call i32 @llvm.amdgcn.workitem.id.x() 312 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 313 %val = load i16, i16 addrspace(1)* %in.gep, align 4 314 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 315 %add = add i16 %ctpop, %sval 316 store i16 %add, i16 addrspace(1)* %out, align 4 317 ret void 318} 319 320define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind { 321; SI-LABEL: v_ctpop_v2i16: 322; SI: ; %bb.0: 323; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 324; SI-NEXT: s_mov_b32 s3, 0xf000 325; SI-NEXT: s_mov_b32 s6, 0 326; SI-NEXT: s_mov_b32 s7, s3 327; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 328; SI-NEXT: v_mov_b32_e32 v1, 0 329; SI-NEXT: s_waitcnt lgkmcnt(0) 330; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 331; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 332; SI-NEXT: s_mov_b32 s2, -1 333; SI-NEXT: s_waitcnt vmcnt(0) 334; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 335; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 336; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 337; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 338; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 339; SI-NEXT: v_or_b32_e32 v0, v1, v0 340; SI-NEXT: s_waitcnt lgkmcnt(0) 341; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 342; SI-NEXT: s_endpgm 343; 344; VI-LABEL: v_ctpop_v2i16: 345; VI: ; %bb.0: 346; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 347; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 348; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 349; VI-NEXT: s_waitcnt lgkmcnt(0) 350; VI-NEXT: v_mov_b32_e32 v1, s3 351; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 352; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 353; 
VI-NEXT: flat_load_dword v0, v[0:1] 354; VI-NEXT: s_mov_b32 s3, 0xf000 355; VI-NEXT: s_mov_b32 s2, -1 356; VI-NEXT: s_waitcnt vmcnt(0) 357; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 358; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 359; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 360; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 361; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 362; VI-NEXT: v_or_b32_e32 v0, v0, v1 363; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 364; VI-NEXT: s_endpgm 365; 366; EG-LABEL: v_ctpop_v2i16: 367; EG: ; %bb.0: 368; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 369; EG-NEXT: TEX 0 @6 370; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 371; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1 372; EG-NEXT: CF_END 373; EG-NEXT: PAD 374; EG-NEXT: Fetch clause starting at 6: 375; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 376; EG-NEXT: ALU clause starting at 8: 377; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 378; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 379; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 380; EG-NEXT: ALU clause starting at 11: 381; EG-NEXT: LSHR * T0.W, T0.X, literal.x, 382; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 383; EG-NEXT: BCNT_INT T0.W, PV.W, 384; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 385; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 386; EG-NEXT: BCNT_INT T1.W, PS, 387; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 388; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 389; EG-NEXT: OR_INT T0.X, PV.W, PS, 390; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 391; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 392 %tid = call i32 @llvm.amdgcn.workitem.id.x() 393 %in.gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid 394 %val = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep, align 8 395 %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone 396 store <2 x i16> %ctpop, <2 x i16> addrspace(1)* %out, align 8 397 ret void 398} 399 400define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind { 
401; SI-LABEL: v_ctpop_v4i16: 402; SI: ; %bb.0: 403; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 404; SI-NEXT: s_mov_b32 s3, 0xf000 405; SI-NEXT: s_mov_b32 s6, 0 406; SI-NEXT: s_mov_b32 s7, s3 407; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 408; SI-NEXT: v_mov_b32_e32 v1, 0 409; SI-NEXT: s_waitcnt lgkmcnt(0) 410; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 411; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 412; SI-NEXT: s_mov_b32 s4, 0xffff 413; SI-NEXT: s_mov_b32 s2, -1 414; SI-NEXT: s_waitcnt vmcnt(0) 415; SI-NEXT: v_and_b32_e32 v2, s4, v0 416; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 417; SI-NEXT: v_and_b32_e32 v3, s4, v1 418; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 419; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 420; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 421; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 422; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 423; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 424; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 425; SI-NEXT: v_or_b32_e32 v1, v3, v1 426; SI-NEXT: v_or_b32_e32 v0, v2, v0 427; SI-NEXT: s_waitcnt lgkmcnt(0) 428; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 429; SI-NEXT: s_endpgm 430; 431; VI-LABEL: v_ctpop_v4i16: 432; VI: ; %bb.0: 433; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 434; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 435; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 436; VI-NEXT: s_mov_b32 s4, 0xffff 437; VI-NEXT: s_waitcnt lgkmcnt(0) 438; VI-NEXT: v_mov_b32_e32 v1, s3 439; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 440; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 441; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 442; VI-NEXT: s_mov_b32 s3, 0xf000 443; VI-NEXT: s_mov_b32 s2, -1 444; VI-NEXT: s_waitcnt vmcnt(0) 445; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 446; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 447; VI-NEXT: v_and_b32_e32 v1, s4, v1 448; VI-NEXT: v_and_b32_e32 v0, s4, v0 449; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 450; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 451; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 452; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 453; 
VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 454; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 455; VI-NEXT: v_or_b32_e32 v1, v1, v2 456; VI-NEXT: v_or_b32_e32 v0, v0, v3 457; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 458; VI-NEXT: s_endpgm 459; 460; EG-LABEL: v_ctpop_v4i16: 461; EG: ; %bb.0: 462; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 463; EG-NEXT: TEX 0 @6 464; EG-NEXT: ALU 42, @11, KC0[CB0:0-32], KC1[] 465; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1 466; EG-NEXT: CF_END 467; EG-NEXT: PAD 468; EG-NEXT: Fetch clause starting at 6: 469; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 470; EG-NEXT: ALU clause starting at 8: 471; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 472; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 473; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 474; EG-NEXT: ALU clause starting at 11: 475; EG-NEXT: MOV T2.X, T0.X, 476; EG-NEXT: MOV * T3.X, T0.Y, 477; EG-NEXT: MOV T0.X, T4.X, 478; EG-NEXT: MOV * T0.Y, PV.X, 479; EG-NEXT: AND_INT * T0.W, PV.Y, literal.x, 480; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 481; EG-NEXT: BCNT_INT T0.W, PV.W, 482; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 483; EG-NEXT: -65536(nan), 0(0.000000e+00) 484; EG-NEXT: OR_INT * T0.W, PS, PV.W, 485; EG-NEXT: MOV T0.X, T3.X, 486; EG-NEXT: MOV * T4.X, PV.W, 487; EG-NEXT: MOV T0.Z, PS, 488; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, 489; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 490; EG-NEXT: BCNT_INT T0.W, PV.W, 491; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x, 492; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 493; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 494; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 495; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 496; EG-NEXT: MOV T4.X, PV.W, 497; EG-NEXT: MOV T0.Y, T5.X, 498; EG-NEXT: AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 499; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 500; EG-NEXT: BCNT_INT T0.W, PV.W, 501; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, 502; EG-NEXT: -65536(nan), 0(0.000000e+00) 503; EG-NEXT: OR_INT * T0.W, PS, PV.W, 504; EG-NEXT: 
MOV * T5.X, PV.W, 505; EG-NEXT: MOV T0.Y, PV.X, 506; EG-NEXT: LSHR * T0.W, T0.X, literal.x, 507; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 508; EG-NEXT: BCNT_INT T0.W, PV.W, 509; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, 510; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 511; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 512; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 513; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 514; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W, 515; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 516; EG-NEXT: MOV T5.X, PV.Y, 517; EG-NEXT: MOV * T8.X, T4.X, 518 %tid = call i32 @llvm.amdgcn.workitem.id.x() 519 %in.gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid 520 %val = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep, align 16 521 %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone 522 store <4 x i16> %ctpop, <4 x i16> addrspace(1)* %out, align 16 523 ret void 524} 525 526define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind { 527; SI-LABEL: v_ctpop_v8i16: 528; SI: ; %bb.0: 529; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 530; SI-NEXT: s_mov_b32 s3, 0xf000 531; SI-NEXT: s_mov_b32 s6, 0 532; SI-NEXT: s_mov_b32 s7, s3 533; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 534; SI-NEXT: v_mov_b32_e32 v1, 0 535; SI-NEXT: s_waitcnt lgkmcnt(0) 536; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 537; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 538; SI-NEXT: s_mov_b32 s4, 0xffff 539; SI-NEXT: s_mov_b32 s2, -1 540; SI-NEXT: s_waitcnt vmcnt(0) 541; SI-NEXT: v_and_b32_e32 v4, s4, v0 542; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 543; SI-NEXT: v_and_b32_e32 v5, s4, v1 544; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 545; SI-NEXT: v_and_b32_e32 v6, s4, v2 546; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 547; SI-NEXT: v_and_b32_e32 v7, s4, v3 548; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 549; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 550; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 551; 
SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 552; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 553; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 554; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 555; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0 556; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0 557; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 558; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 559; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 560; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 561; SI-NEXT: v_or_b32_e32 v3, v7, v3 562; SI-NEXT: v_or_b32_e32 v2, v6, v2 563; SI-NEXT: v_or_b32_e32 v1, v5, v1 564; SI-NEXT: v_or_b32_e32 v0, v4, v0 565; SI-NEXT: s_waitcnt lgkmcnt(0) 566; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 567; SI-NEXT: s_endpgm 568; 569; VI-LABEL: v_ctpop_v8i16: 570; VI: ; %bb.0: 571; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 572; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 573; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 574; VI-NEXT: s_mov_b32 s4, 0xffff 575; VI-NEXT: s_waitcnt lgkmcnt(0) 576; VI-NEXT: v_mov_b32_e32 v1, s3 577; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 578; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 579; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 580; VI-NEXT: s_mov_b32 s3, 0xf000 581; VI-NEXT: s_mov_b32 s2, -1 582; VI-NEXT: s_waitcnt vmcnt(0) 583; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 584; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 585; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 586; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 587; VI-NEXT: v_and_b32_e32 v3, s4, v3 588; VI-NEXT: v_and_b32_e32 v2, s4, v2 589; VI-NEXT: v_and_b32_e32 v1, s4, v1 590; VI-NEXT: v_and_b32_e32 v0, s4, v0 591; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 592; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 593; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 594; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 595; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 596; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 597; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 598; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 599; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 600; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 601; VI-NEXT: v_lshlrev_b32_e32 v6, 16, 
v6 602; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 603; VI-NEXT: v_or_b32_e32 v3, v3, v4 604; VI-NEXT: v_or_b32_e32 v2, v2, v5 605; VI-NEXT: v_or_b32_e32 v1, v1, v6 606; VI-NEXT: v_or_b32_e32 v0, v0, v7 607; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 608; VI-NEXT: s_endpgm 609; 610; EG-LABEL: v_ctpop_v8i16: 611; EG: ; %bb.0: 612; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] 613; EG-NEXT: TEX 0 @6 614; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[] 615; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1 616; EG-NEXT: CF_END 617; EG-NEXT: PAD 618; EG-NEXT: Fetch clause starting at 6: 619; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1 620; EG-NEXT: ALU clause starting at 8: 621; EG-NEXT: MOV T0.Y, T4.X, 622; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 623; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 624; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 625; EG-NEXT: ALU clause starting at 12: 626; EG-NEXT: LSHR * T0.W, T12.X, literal.x, 627; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 628; EG-NEXT: BCNT_INT * T0.W, PV.W, 629; EG-NEXT: LSHL T0.W, PV.W, literal.x, 630; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 631; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 632; EG-NEXT: OR_INT * T0.W, PS, PV.W, 633; EG-NEXT: MOV * T4.X, PV.W, 634; EG-NEXT: MOV T0.X, PV.X, 635; EG-NEXT: AND_INT * T0.W, T12.X, literal.x, 636; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 637; EG-NEXT: BCNT_INT T0.W, PV.W, 638; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 639; EG-NEXT: -65536(nan), 0(0.000000e+00) 640; EG-NEXT: OR_INT * T0.W, PS, PV.W, 641; EG-NEXT: MOV T4.X, PV.W, 642; EG-NEXT: MOV * T0.X, T5.X, 643; EG-NEXT: LSHR * T0.W, T12.Y, literal.x, 644; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 645; EG-NEXT: BCNT_INT T0.W, PV.W, 646; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 647; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 648; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 649; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 650; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 651; EG-NEXT: MOV * T5.X, PV.W, 652; 
EG-NEXT: MOV T0.X, PV.X, 653; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x, 654; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 655; EG-NEXT: BCNT_INT T0.W, PV.W, 656; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 657; EG-NEXT: -65536(nan), 0(0.000000e+00) 658; EG-NEXT: OR_INT * T0.Y, PS, PV.W, 659; EG-NEXT: MOV T5.X, PV.Y, 660; EG-NEXT: MOV * T0.X, T8.X, 661; EG-NEXT: LSHR * T0.W, T12.Z, literal.x, 662; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 663; EG-NEXT: BCNT_INT T0.W, PV.W, 664; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 665; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 666; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 667; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 668; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 669; EG-NEXT: MOV * T8.X, PV.W, 670; EG-NEXT: MOV T0.X, PV.X, 671; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x, 672; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 673; EG-NEXT: BCNT_INT T0.W, PV.W, 674; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 675; EG-NEXT: -65536(nan), 0(0.000000e+00) 676; EG-NEXT: OR_INT * T0.W, PS, PV.W, 677; EG-NEXT: MOV T8.X, PV.W, 678; EG-NEXT: MOV * T0.X, T9.X, 679; EG-NEXT: LSHR * T0.W, T12.W, literal.x, 680; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 681; EG-NEXT: BCNT_INT T0.W, PV.W, 682; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 683; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 684; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 685; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 686; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 687; EG-NEXT: MOV * T9.X, PV.W, 688; EG-NEXT: MOV T0.X, PV.X, 689; EG-NEXT: AND_INT * T0.W, T12.W, literal.x, 690; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 691; EG-NEXT: BCNT_INT T0.W, PV.W, 692; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 693; EG-NEXT: -65536(nan), 0(0.000000e+00) 694; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x, 695; EG-NEXT: OR_INT * T0.W, PS, PV.W, 696; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 697; EG-NEXT: MOV T9.X, PV.W, 698; EG-NEXT: MOV * T0.X, T4.X, 699; EG-NEXT: MOV * T0.Z, T8.X, 700 %tid = call i32 
@llvm.amdgcn.workitem.id.x() 701 %in.gep = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %in, i32 %tid 702 %val = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep, align 32 703 %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone 704 store <8 x i16> %ctpop, <8 x i16> addrspace(1)* %out, align 32 705 ret void 706} 707 708define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind { 709; SI-LABEL: v_ctpop_v16i16: 710; SI: ; %bb.0: 711; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 712; SI-NEXT: s_mov_b32 s3, 0xf000 713; SI-NEXT: s_mov_b32 s6, 0 714; SI-NEXT: s_mov_b32 s7, s3 715; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 716; SI-NEXT: v_mov_b32_e32 v5, 0 717; SI-NEXT: s_waitcnt lgkmcnt(0) 718; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 719; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 720; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 721; SI-NEXT: s_mov_b32 s4, 0xffff 722; SI-NEXT: s_mov_b32 s2, -1 723; SI-NEXT: s_waitcnt vmcnt(1) 724; SI-NEXT: v_and_b32_e32 v8, s4, v0 725; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 726; SI-NEXT: v_and_b32_e32 v9, s4, v1 727; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 728; SI-NEXT: v_and_b32_e32 v10, s4, v2 729; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 730; SI-NEXT: v_and_b32_e32 v11, s4, v3 731; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 732; SI-NEXT: s_waitcnt vmcnt(0) 733; SI-NEXT: v_and_b32_e32 v12, s4, v4 734; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 735; SI-NEXT: v_and_b32_e32 v13, s4, v5 736; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 737; SI-NEXT: v_and_b32_e32 v14, s4, v6 738; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 739; SI-NEXT: v_and_b32_e32 v15, s4, v7 740; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 741; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 742; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 743; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0 744; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0 745; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 746; SI-NEXT: 
v_bcnt_u32_b32_e64 v2, v2, 0 747; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 748; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 749; SI-NEXT: v_bcnt_u32_b32_e64 v15, v15, 0 750; SI-NEXT: v_bcnt_u32_b32_e64 v14, v14, 0 751; SI-NEXT: v_bcnt_u32_b32_e64 v13, v13, 0 752; SI-NEXT: v_bcnt_u32_b32_e64 v12, v12, 0 753; SI-NEXT: v_bcnt_u32_b32_e64 v11, v11, 0 754; SI-NEXT: v_bcnt_u32_b32_e64 v10, v10, 0 755; SI-NEXT: v_bcnt_u32_b32_e64 v9, v9, 0 756; SI-NEXT: v_bcnt_u32_b32_e64 v8, v8, 0 757; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 758; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 759; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 760; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 761; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 762; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 763; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 764; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 765; SI-NEXT: v_or_b32_e32 v3, v15, v7 766; SI-NEXT: v_or_b32_e32 v2, v14, v6 767; SI-NEXT: v_or_b32_e32 v1, v13, v5 768; SI-NEXT: v_or_b32_e32 v0, v12, v4 769; SI-NEXT: v_or_b32_e32 v7, v11, v16 770; SI-NEXT: v_or_b32_e32 v6, v10, v17 771; SI-NEXT: v_or_b32_e32 v5, v9, v18 772; SI-NEXT: v_or_b32_e32 v4, v8, v19 773; SI-NEXT: s_waitcnt lgkmcnt(0) 774; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 775; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 776; SI-NEXT: s_endpgm 777; 778; VI-LABEL: v_ctpop_v16i16: 779; VI: ; %bb.0: 780; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 781; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 782; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 783; VI-NEXT: s_mov_b32 s4, 0xffff 784; VI-NEXT: s_waitcnt lgkmcnt(0) 785; VI-NEXT: v_mov_b32_e32 v1, s3 786; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 787; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 788; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] 789; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4 790; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 791; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 792; VI-NEXT: s_mov_b32 s3, 0xf000 793; VI-NEXT: s_mov_b32 s2, -1 794; VI-NEXT: s_waitcnt vmcnt(1) 
795; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 796; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 797; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 798; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 799; VI-NEXT: v_and_b32_e32 v3, s4, v3 800; VI-NEXT: v_and_b32_e32 v2, s4, v2 801; VI-NEXT: v_and_b32_e32 v1, s4, v1 802; VI-NEXT: v_and_b32_e32 v0, s4, v0 803; VI-NEXT: s_waitcnt vmcnt(0) 804; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 805; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 806; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 807; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 808; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0 809; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 810; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 811; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 812; VI-NEXT: v_and_b32_e32 v7, s4, v7 813; VI-NEXT: v_and_b32_e32 v6, s4, v6 814; VI-NEXT: v_and_b32_e32 v5, s4, v5 815; VI-NEXT: v_and_b32_e32 v4, s4, v4 816; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 817; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 818; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 819; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 820; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0 821; VI-NEXT: v_bcnt_u32_b32 v13, v13, 0 822; VI-NEXT: v_bcnt_u32_b32 v14, v14, 0 823; VI-NEXT: v_bcnt_u32_b32 v15, v15, 0 824; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 825; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 826; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 827; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 828; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 829; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 830; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 831; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 832; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 833; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 834; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 835; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 836; VI-NEXT: v_or_b32_e32 v3, v3, v8 837; VI-NEXT: v_or_b32_e32 v2, v2, v9 838; VI-NEXT: v_or_b32_e32 v1, v1, v10 839; VI-NEXT: v_or_b32_e32 v0, v0, v11 840; VI-NEXT: v_or_b32_e32 v7, v7, v12 841; VI-NEXT: v_or_b32_e32 v6, v6, v13 842; VI-NEXT: v_or_b32_e32 v5, v5, v14 843; VI-NEXT: v_or_b32_e32 v4, v4, v15 
844; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 845; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 846; VI-NEXT: s_endpgm 847; 848; EG-LABEL: v_ctpop_v16i16: 849; EG: ; %bb.0: 850; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] 851; EG-NEXT: TEX 1 @8 852; EG-NEXT: ALU 114, @16, KC0[], KC1[] 853; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[] 854; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0 855; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1 856; EG-NEXT: CF_END 857; EG-NEXT: PAD 858; EG-NEXT: Fetch clause starting at 8: 859; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1 860; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1 861; EG-NEXT: ALU clause starting at 12: 862; EG-NEXT: MOV T0.Y, T4.X, 863; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 864; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) 865; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 866; EG-NEXT: ALU clause starting at 16: 867; EG-NEXT: LSHR * T0.W, T20.X, literal.x, 868; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 869; EG-NEXT: BCNT_INT * T0.W, PV.W, 870; EG-NEXT: LSHL T0.W, PV.W, literal.x, 871; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 872; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 873; EG-NEXT: OR_INT * T0.W, PS, PV.W, 874; EG-NEXT: MOV * T4.X, PV.W, 875; EG-NEXT: MOV T0.X, PV.X, 876; EG-NEXT: AND_INT * T0.W, T20.X, literal.x, 877; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 878; EG-NEXT: BCNT_INT T0.W, PV.W, 879; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 880; EG-NEXT: -65536(nan), 0(0.000000e+00) 881; EG-NEXT: OR_INT * T0.W, PS, PV.W, 882; EG-NEXT: MOV T4.X, PV.W, 883; EG-NEXT: MOV * T0.X, T5.X, 884; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, 885; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 886; EG-NEXT: BCNT_INT T0.W, PV.W, 887; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 888; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 889; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 890; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 891; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 892; 
EG-NEXT: MOV * T5.X, PV.W, 893; EG-NEXT: MOV T0.X, PV.X, 894; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x, 895; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 896; EG-NEXT: BCNT_INT T0.W, PV.W, 897; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 898; EG-NEXT: -65536(nan), 0(0.000000e+00) 899; EG-NEXT: OR_INT * T0.Y, PS, PV.W, 900; EG-NEXT: MOV T5.X, PV.Y, 901; EG-NEXT: MOV * T0.X, T8.X, 902; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, 903; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 904; EG-NEXT: BCNT_INT T0.W, PV.W, 905; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 906; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 907; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 908; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 909; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 910; EG-NEXT: MOV * T8.X, PV.W, 911; EG-NEXT: MOV T0.X, PV.X, 912; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x, 913; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 914; EG-NEXT: BCNT_INT T0.W, PV.W, 915; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 916; EG-NEXT: -65536(nan), 0(0.000000e+00) 917; EG-NEXT: OR_INT * T0.W, PS, PV.W, 918; EG-NEXT: MOV T8.X, PV.W, 919; EG-NEXT: MOV * T0.X, T9.X, 920; EG-NEXT: LSHR * T0.W, T20.W, literal.x, 921; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 922; EG-NEXT: BCNT_INT T0.W, PV.W, 923; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 924; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 925; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 926; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 927; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 928; EG-NEXT: MOV * T9.X, PV.W, 929; EG-NEXT: MOV T0.X, PV.X, 930; EG-NEXT: AND_INT * T0.W, T20.W, literal.x, 931; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 932; EG-NEXT: BCNT_INT T0.W, PV.W, 933; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 934; EG-NEXT: -65536(nan), 0(0.000000e+00) 935; EG-NEXT: OR_INT * T0.W, PS, PV.W, 936; EG-NEXT: MOV T9.X, PV.W, 937; EG-NEXT: MOV * T0.X, T12.X, 938; EG-NEXT: LSHR * T1.W, T21.X, literal.x, 939; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 940; EG-NEXT: BCNT_INT T1.W, 
PV.W, 941; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 942; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 943; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 944; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 945; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 946; EG-NEXT: MOV * T12.X, PV.W, 947; EG-NEXT: MOV T0.X, PV.X, 948; EG-NEXT: AND_INT * T1.W, T21.X, literal.x, 949; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 950; EG-NEXT: BCNT_INT T1.W, PV.W, 951; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 952; EG-NEXT: -65536(nan), 0(0.000000e+00) 953; EG-NEXT: OR_INT * T1.W, PS, PV.W, 954; EG-NEXT: MOV T12.X, PV.W, 955; EG-NEXT: MOV * T0.X, T13.X, 956; EG-NEXT: LSHR * T1.W, T21.Y, literal.x, 957; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 958; EG-NEXT: BCNT_INT T1.W, PV.W, 959; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 960; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 961; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 962; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 963; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 964; EG-NEXT: MOV * T13.X, PV.W, 965; EG-NEXT: MOV T0.X, PV.X, 966; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x, 967; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 968; EG-NEXT: BCNT_INT T1.W, PV.W, 969; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 970; EG-NEXT: -65536(nan), 0(0.000000e+00) 971; EG-NEXT: OR_INT * T20.Y, PS, PV.W, 972; EG-NEXT: MOV T13.X, PV.Y, 973; EG-NEXT: MOV * T0.X, T16.X, 974; EG-NEXT: LSHR * T1.W, T21.Z, literal.x, 975; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 976; EG-NEXT: BCNT_INT T1.W, PV.W, 977; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 978; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 979; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 980; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 981; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 982; EG-NEXT: ALU clause starting at 131: 983; EG-NEXT: MOV * T16.X, T1.W, 984; EG-NEXT: MOV T0.X, PV.X, 985; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x, 986; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 987; EG-NEXT: BCNT_INT T1.W, PV.W, 988; EG-NEXT: AND_INT * 
T2.W, PV.X, literal.x, 989; EG-NEXT: -65536(nan), 0(0.000000e+00) 990; EG-NEXT: OR_INT * T1.W, PS, PV.W, 991; EG-NEXT: MOV T16.X, PV.W, 992; EG-NEXT: MOV * T0.X, T17.X, 993; EG-NEXT: LSHR * T1.W, T21.W, literal.x, 994; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 995; EG-NEXT: BCNT_INT T1.W, PV.W, 996; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 997; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 998; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 999; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1000; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 1001; EG-NEXT: MOV * T17.X, PV.W, 1002; EG-NEXT: MOV T0.X, PV.X, 1003; EG-NEXT: AND_INT T1.W, T21.W, literal.x, 1004; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y, 1005; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) 1006; EG-NEXT: AND_INT T0.Z, PV.X, literal.x, 1007; EG-NEXT: BCNT_INT T1.W, PV.W, 1008; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 1009; EG-NEXT: -65536(nan), 16(2.242078e-44) 1010; EG-NEXT: LSHR T22.X, PS, literal.x, 1011; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W, 1012; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1013; EG-NEXT: MOV T17.X, PV.W, 1014; EG-NEXT: MOV * T0.X, T4.X, 1015; EG-NEXT: MOV * T0.Z, T8.X, 1016; EG-NEXT: MOV T20.X, T12.X, 1017; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212 1018 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1019 %in.gep = getelementptr <16 x i16>, <16 x i16> addrspace(1)* %in, i32 %tid 1020 %val = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep, align 32 1021 %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone 1022 store <16 x i16> %ctpop, <16 x i16> addrspace(1)* %out, align 32 1023 ret void 1024} 1025 1026define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { 1027; SI-LABEL: v_ctpop_i16_add_inline_constant: 1028; SI: ; %bb.0: 1029; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1030; SI-NEXT: s_mov_b32 s3, 0xf000 1031; SI-NEXT: s_mov_b32 s6, 0 1032; SI-NEXT: s_mov_b32 s7, s3 1033; SI-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 1034; SI-NEXT: v_mov_b32_e32 v1, 0 1035; SI-NEXT: s_waitcnt lgkmcnt(0) 1036; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1037; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1038; SI-NEXT: s_mov_b32 s2, -1 1039; SI-NEXT: s_waitcnt vmcnt(0) 1040; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 1041; SI-NEXT: s_waitcnt lgkmcnt(0) 1042; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1043; SI-NEXT: s_endpgm 1044; 1045; VI-LABEL: v_ctpop_i16_add_inline_constant: 1046; VI: ; %bb.0: 1047; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1048; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1049; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1050; VI-NEXT: s_waitcnt lgkmcnt(0) 1051; VI-NEXT: v_mov_b32_e32 v1, s3 1052; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1053; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1054; VI-NEXT: flat_load_ushort v0, v[0:1] 1055; VI-NEXT: s_mov_b32 s3, 0xf000 1056; VI-NEXT: s_mov_b32 s2, -1 1057; VI-NEXT: s_waitcnt vmcnt(0) 1058; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 1059; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1060; VI-NEXT: s_endpgm 1061; 1062; EG-LABEL: v_ctpop_i16_add_inline_constant: 1063; EG: ; %bb.0: 1064; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1065; EG-NEXT: TEX 0 @6 1066; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1067; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1068; EG-NEXT: CF_END 1069; EG-NEXT: PAD 1070; EG-NEXT: Fetch clause starting at 6: 1071; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1072; EG-NEXT: ALU clause starting at 8: 1073; EG-NEXT: LSHL * T0.W, T0.X, 1, 1074; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1075; EG-NEXT: ALU clause starting at 10: 1076; EG-NEXT: BCNT_INT T0.W, T0.X, 1077; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1078; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1079; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1080; EG-NEXT: LSHL * T1.W, PS, literal.y, 1081; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) 1082; EG-NEXT: LSHL T0.X, PV.W, PS, 1083; EG-NEXT: LSHL * T0.W, literal.x, PS, 1084; EG-NEXT: 
65535(9.183409e-41), 0(0.000000e+00) 1085; EG-NEXT: MOV T0.Y, 0.0, 1086; EG-NEXT: MOV * T0.Z, 0.0, 1087; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1088; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1089 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1090 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1091 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1092 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1093 %add = add i16 %ctpop, 4 1094 store i16 %add, i16 addrspace(1)* %out, align 4 1095 ret void 1096} 1097 1098define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { 1099; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: 1100; SI: ; %bb.0: 1101; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1102; SI-NEXT: s_mov_b32 s3, 0xf000 1103; SI-NEXT: s_mov_b32 s6, 0 1104; SI-NEXT: s_mov_b32 s7, s3 1105; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1106; SI-NEXT: v_mov_b32_e32 v1, 0 1107; SI-NEXT: s_waitcnt lgkmcnt(0) 1108; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1109; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1110; SI-NEXT: s_mov_b32 s2, -1 1111; SI-NEXT: s_waitcnt vmcnt(0) 1112; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 1113; SI-NEXT: s_waitcnt lgkmcnt(0) 1114; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1115; SI-NEXT: s_endpgm 1116; 1117; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: 1118; VI: ; %bb.0: 1119; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1120; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1121; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1122; VI-NEXT: s_waitcnt lgkmcnt(0) 1123; VI-NEXT: v_mov_b32_e32 v1, s3 1124; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1125; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1126; VI-NEXT: flat_load_ushort v0, v[0:1] 1127; VI-NEXT: s_mov_b32 s3, 0xf000 1128; VI-NEXT: s_mov_b32 s2, -1 1129; VI-NEXT: s_waitcnt vmcnt(0) 1130; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 1131; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1132; VI-NEXT: 
s_endpgm 1133; 1134; EG-LABEL: v_ctpop_i16_add_inline_constant_inv: 1135; EG: ; %bb.0: 1136; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1137; EG-NEXT: TEX 0 @6 1138; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1139; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1140; EG-NEXT: CF_END 1141; EG-NEXT: PAD 1142; EG-NEXT: Fetch clause starting at 6: 1143; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1144; EG-NEXT: ALU clause starting at 8: 1145; EG-NEXT: LSHL * T0.W, T0.X, 1, 1146; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1147; EG-NEXT: ALU clause starting at 10: 1148; EG-NEXT: BCNT_INT T0.W, T0.X, 1149; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1150; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1151; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1152; EG-NEXT: LSHL * T1.W, PS, literal.y, 1153; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) 1154; EG-NEXT: LSHL T0.X, PV.W, PS, 1155; EG-NEXT: LSHL * T0.W, literal.x, PS, 1156; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1157; EG-NEXT: MOV T0.Y, 0.0, 1158; EG-NEXT: MOV * T0.Z, 0.0, 1159; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1160; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1161 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1162 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1163 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1164 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1165 %add = add i16 4, %ctpop 1166 store i16 %add, i16 addrspace(1)* %out, align 4 1167 ret void 1168} 1169 1170define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { 1171; SI-LABEL: v_ctpop_i16_add_literal: 1172; SI: ; %bb.0: 1173; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1174; SI-NEXT: s_mov_b32 s3, 0xf000 1175; SI-NEXT: s_mov_b32 s6, 0 1176; SI-NEXT: s_mov_b32 s7, s3 1177; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1178; SI-NEXT: v_mov_b32_e32 v1, 0 1179; SI-NEXT: s_waitcnt lgkmcnt(0) 1180; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1181; SI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 1182; SI-NEXT: s_movk_i32 s4, 0x3e7 1183; SI-NEXT: s_mov_b32 s2, -1 1184; SI-NEXT: s_waitcnt vmcnt(0) 1185; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s4 1186; SI-NEXT: s_waitcnt lgkmcnt(0) 1187; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1188; SI-NEXT: s_endpgm 1189; 1190; VI-LABEL: v_ctpop_i16_add_literal: 1191; VI: ; %bb.0: 1192; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1193; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1194; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1195; VI-NEXT: s_movk_i32 s4, 0x3e7 1196; VI-NEXT: s_waitcnt lgkmcnt(0) 1197; VI-NEXT: v_mov_b32_e32 v1, s3 1198; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1199; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1200; VI-NEXT: flat_load_ushort v0, v[0:1] 1201; VI-NEXT: s_mov_b32 s3, 0xf000 1202; VI-NEXT: s_mov_b32 s2, -1 1203; VI-NEXT: s_waitcnt vmcnt(0) 1204; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1205; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1206; VI-NEXT: s_endpgm 1207; 1208; EG-LABEL: v_ctpop_i16_add_literal: 1209; EG: ; %bb.0: 1210; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1211; EG-NEXT: TEX 0 @6 1212; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1213; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1214; EG-NEXT: CF_END 1215; EG-NEXT: PAD 1216; EG-NEXT: Fetch clause starting at 6: 1217; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1218; EG-NEXT: ALU clause starting at 8: 1219; EG-NEXT: LSHL * T0.W, T0.X, 1, 1220; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1221; EG-NEXT: ALU clause starting at 10: 1222; EG-NEXT: BCNT_INT T0.W, T0.X, 1223; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1224; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1225; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1226; EG-NEXT: LSHL * T1.W, PS, literal.y, 1227; EG-NEXT: 999(1.399897e-42), 3(4.203895e-45) 1228; EG-NEXT: LSHL T0.X, PV.W, PS, 1229; EG-NEXT: LSHL * T0.W, literal.x, PS, 1230; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1231; EG-NEXT: MOV T0.Y, 0.0, 1232; EG-NEXT: MOV * T0.Z, 0.0, 1233; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 
1234; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1235 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1236 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1237 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1238 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1239 %add = add i16 %ctpop, 999 1240 store i16 %add, i16 addrspace(1)* %out, align 4 1241 ret void 1242} 1243 1244define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { 1245; SI-LABEL: v_ctpop_i16_add_var: 1246; SI: ; %bb.0: 1247; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1248; SI-NEXT: s_load_dword s8, s[0:1], 0xd 1249; SI-NEXT: s_mov_b32 s3, 0xf000 1250; SI-NEXT: s_mov_b32 s6, 0 1251; SI-NEXT: s_mov_b32 s7, s3 1252; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1253; SI-NEXT: v_mov_b32_e32 v1, 0 1254; SI-NEXT: s_waitcnt lgkmcnt(0) 1255; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1256; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1257; SI-NEXT: s_mov_b32 s2, -1 1258; SI-NEXT: s_waitcnt vmcnt(0) 1259; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 1260; SI-NEXT: s_waitcnt lgkmcnt(0) 1261; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1262; SI-NEXT: s_endpgm 1263; 1264; VI-LABEL: v_ctpop_i16_add_var: 1265; VI: ; %bb.0: 1266; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1267; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1268; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1269; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1270; VI-NEXT: s_waitcnt lgkmcnt(0) 1271; VI-NEXT: v_mov_b32_e32 v1, s3 1272; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1273; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1274; VI-NEXT: flat_load_ushort v0, v[0:1] 1275; VI-NEXT: s_mov_b32 s3, 0xf000 1276; VI-NEXT: s_mov_b32 s2, -1 1277; VI-NEXT: s_waitcnt vmcnt(0) 1278; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1279; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1280; VI-NEXT: s_endpgm 1281; 1282; EG-LABEL: v_ctpop_i16_add_var: 1283; EG: ; %bb.0: 1284; EG-NEXT: ALU 
1, @12, KC0[CB0:0-32], KC1[] 1285; EG-NEXT: TEX 0 @8 1286; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1287; EG-NEXT: TEX 0 @10 1288; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1289; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1290; EG-NEXT: CF_END 1291; EG-NEXT: PAD 1292; EG-NEXT: Fetch clause starting at 8: 1293; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1294; EG-NEXT: Fetch clause starting at 10: 1295; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 1296; EG-NEXT: ALU clause starting at 12: 1297; EG-NEXT: LSHL * T0.W, T0.X, 1, 1298; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1299; EG-NEXT: ALU clause starting at 14: 1300; EG-NEXT: MOV * T1.X, 0.0, 1301; EG-NEXT: ALU clause starting at 15: 1302; EG-NEXT: BCNT_INT T0.W, T0.X, 1303; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1304; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1305; EG-NEXT: ADD_INT * T0.W, PV.W, T1.X, 1306; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1307; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1308; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1309; EG-NEXT: LSHL T0.X, PV.W, PS, 1310; EG-NEXT: LSHL * T0.W, literal.x, PS, 1311; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1312; EG-NEXT: MOV T0.Y, 0.0, 1313; EG-NEXT: MOV * T0.Z, 0.0, 1314; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1315; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1316 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1317 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1318 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1319 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1320 %add = add i16 %ctpop, %const 1321 store i16 %add, i16 addrspace(1)* %out, align 4 1322 ret void 1323} 1324 1325define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { 1326; SI-LABEL: v_ctpop_i16_add_var_inv: 1327; SI: ; %bb.0: 1328; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1329; SI-NEXT: s_load_dword s8, s[0:1], 0xd 1330; SI-NEXT: s_mov_b32 s3, 0xf000 1331; SI-NEXT: s_mov_b32 s6, 0 
1332; SI-NEXT: s_mov_b32 s7, s3 1333; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1334; SI-NEXT: v_mov_b32_e32 v1, 0 1335; SI-NEXT: s_waitcnt lgkmcnt(0) 1336; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1337; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1338; SI-NEXT: s_mov_b32 s2, -1 1339; SI-NEXT: s_waitcnt vmcnt(0) 1340; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 1341; SI-NEXT: s_waitcnt lgkmcnt(0) 1342; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1343; SI-NEXT: s_endpgm 1344; 1345; VI-LABEL: v_ctpop_i16_add_var_inv: 1346; VI: ; %bb.0: 1347; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1348; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1349; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1350; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1351; VI-NEXT: s_waitcnt lgkmcnt(0) 1352; VI-NEXT: v_mov_b32_e32 v1, s3 1353; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1354; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1355; VI-NEXT: flat_load_ushort v0, v[0:1] 1356; VI-NEXT: s_mov_b32 s3, 0xf000 1357; VI-NEXT: s_mov_b32 s2, -1 1358; VI-NEXT: s_waitcnt vmcnt(0) 1359; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1360; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1361; VI-NEXT: s_endpgm 1362; 1363; EG-LABEL: v_ctpop_i16_add_var_inv: 1364; EG: ; %bb.0: 1365; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1366; EG-NEXT: TEX 0 @8 1367; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1368; EG-NEXT: TEX 0 @10 1369; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1370; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1371; EG-NEXT: CF_END 1372; EG-NEXT: PAD 1373; EG-NEXT: Fetch clause starting at 8: 1374; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1375; EG-NEXT: Fetch clause starting at 10: 1376; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 1377; EG-NEXT: ALU clause starting at 12: 1378; EG-NEXT: LSHL * T0.W, T0.X, 1, 1379; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1380; EG-NEXT: ALU clause starting at 14: 1381; EG-NEXT: MOV * T1.X, 0.0, 1382; EG-NEXT: ALU clause starting at 15: 1383; EG-NEXT: BCNT_INT T0.W, T0.X, 1384; EG-NEXT: AND_INT * T1.W, KC0[2].Y, 
literal.x, 1385; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1386; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W, 1387; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1388; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1389; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1390; EG-NEXT: LSHL T0.X, PV.W, PS, 1391; EG-NEXT: LSHL * T0.W, literal.x, PS, 1392; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1393; EG-NEXT: MOV T0.Y, 0.0, 1394; EG-NEXT: MOV * T0.Z, 0.0, 1395; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1396; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1397 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1398 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1399 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1400 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1401 %add = add i16 %const, %ctpop 1402 store i16 %add, i16 addrspace(1)* %out, align 4 1403 ret void 1404} 1405 1406define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind { 1407; SI-LABEL: v_ctpop_i16_add_vvar_inv: 1408; SI: ; %bb.0: 1409; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1410; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1411; SI-NEXT: s_mov_b32 s3, 0xf000 1412; SI-NEXT: s_mov_b32 s6, 0 1413; SI-NEXT: s_mov_b32 s7, s3 1414; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1415; SI-NEXT: v_mov_b32_e32 v1, 0 1416; SI-NEXT: s_mov_b64 s[10:11], s[6:7] 1417; SI-NEXT: s_waitcnt lgkmcnt(0) 1418; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 1419; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 1420; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1421; SI-NEXT: s_mov_b32 s2, -1 1422; SI-NEXT: s_waitcnt vmcnt(0) 1423; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 1424; SI-NEXT: s_waitcnt lgkmcnt(0) 1425; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1426; SI-NEXT: s_endpgm 1427; 1428; VI-LABEL: v_ctpop_i16_add_vvar_inv: 1429; VI: ; %bb.0: 1430; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1431; 
VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1432; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 1433; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1434; VI-NEXT: s_waitcnt lgkmcnt(0) 1435; VI-NEXT: v_mov_b32_e32 v1, s3 1436; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1437; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1438; VI-NEXT: flat_load_ushort v3, v[0:1] 1439; VI-NEXT: v_mov_b32_e32 v1, s5 1440; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1441; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1442; VI-NEXT: flat_load_ushort v0, v[0:1] 1443; VI-NEXT: s_mov_b32 s3, 0xf000 1444; VI-NEXT: s_mov_b32 s2, -1 1445; VI-NEXT: s_waitcnt vmcnt(0) 1446; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0 1447; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1448; VI-NEXT: s_endpgm 1449; 1450; EG-LABEL: v_ctpop_i16_add_vvar_inv: 1451; EG: ; %bb.0: 1452; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1453; EG-NEXT: TEX 0 @8 1454; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1455; EG-NEXT: TEX 0 @10 1456; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1457; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1458; EG-NEXT: CF_END 1459; EG-NEXT: PAD 1460; EG-NEXT: Fetch clause starting at 8: 1461; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1462; EG-NEXT: Fetch clause starting at 10: 1463; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 1464; EG-NEXT: ALU clause starting at 12: 1465; EG-NEXT: LSHL * T0.W, T0.X, 1, 1466; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1467; EG-NEXT: ALU clause starting at 14: 1468; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W, 1469; EG-NEXT: ALU clause starting at 15: 1470; EG-NEXT: BCNT_INT T0.W, T0.X, 1471; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1472; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1473; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W, 1474; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1475; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1476; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1477; EG-NEXT: LSHL T0.X, PV.W, PS, 1478; EG-NEXT: LSHL * T0.W, literal.x, PS, 1479; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1480; EG-NEXT: MOV T0.Y, 
0.0, 1481; EG-NEXT: MOV * T0.Z, 0.0, 1482; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1483; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1484 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1485 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1486 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1487 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1488 %gep = getelementptr i16, i16 addrspace(1)* %constptr, i32 %tid 1489 %const = load i16, i16 addrspace(1)* %gep, align 4 1490 %add = add i16 %const, %ctpop 1491 store i16 %add, i16 addrspace(1)* %out, align 4 1492 ret void 1493} 1494 1495; FIXME: We currently disallow SALU instructions in all branches, 1496; but there are some cases when they should be allowed. 1497define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %ctpop_arg, i16 %cond) { 1498; SI-LABEL: ctpop_i16_in_br: 1499; SI: ; %bb.0: ; %entry 1500; SI-NEXT: s_load_dword s4, s[0:1], 0xd 1501; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1502; SI-NEXT: s_waitcnt lgkmcnt(0) 1503; SI-NEXT: s_lshr_b32 s5, s4, 16 1504; SI-NEXT: s_cmp_lg_u32 s5, 0 1505; SI-NEXT: s_cbranch_scc0 .LBB14_2 1506; SI-NEXT: ; %bb.1: ; %else 1507; SI-NEXT: s_mov_b32 s11, 0xf000 1508; SI-NEXT: s_mov_b32 s10, -1 1509; SI-NEXT: s_mov_b32 s8, s2 1510; SI-NEXT: s_mov_b32 s9, s3 1511; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 1512; SI-NEXT: s_mov_b64 s[2:3], 0 1513; SI-NEXT: s_cbranch_execz .LBB14_3 1514; SI-NEXT: s_branch .LBB14_4 1515; SI-NEXT: .LBB14_2: 1516; SI-NEXT: s_mov_b64 s[2:3], -1 1517; SI-NEXT: v_mov_b32_e32 v0, 0 1518; SI-NEXT: .LBB14_3: ; %if 1519; SI-NEXT: s_and_b32 s2, s4, 0xffff 1520; SI-NEXT: s_bcnt1_i32_b32 s2, s2 1521; SI-NEXT: s_waitcnt vmcnt(0) 1522; SI-NEXT: v_mov_b32_e32 v0, s2 1523; SI-NEXT: .LBB14_4: ; %endif 1524; SI-NEXT: s_mov_b32 s3, 0xf000 1525; SI-NEXT: s_mov_b32 s2, -1 1526; SI-NEXT: s_waitcnt vmcnt(0) 1527; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1528; SI-NEXT: s_endpgm 1529; 
1530; VI-LABEL: ctpop_i16_in_br: 1531; VI: ; %bb.0: ; %entry 1532; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1533; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1534; VI-NEXT: s_waitcnt lgkmcnt(0) 1535; VI-NEXT: s_lshr_b32 s5, s4, 16 1536; VI-NEXT: v_cmp_ne_u16_e64 s[6:7], s5, 0 1537; VI-NEXT: s_and_b64 vcc, exec, s[6:7] 1538; VI-NEXT: s_cbranch_vccz .LBB14_2 1539; VI-NEXT: ; %bb.1: ; %else 1540; VI-NEXT: s_mov_b32 s11, 0xf000 1541; VI-NEXT: s_mov_b32 s10, -1 1542; VI-NEXT: s_mov_b32 s8, s2 1543; VI-NEXT: s_mov_b32 s9, s3 1544; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 1545; VI-NEXT: s_mov_b64 s[2:3], 0 1546; VI-NEXT: s_cbranch_execz .LBB14_3 1547; VI-NEXT: s_branch .LBB14_4 1548; VI-NEXT: .LBB14_2: 1549; VI-NEXT: s_mov_b64 s[2:3], -1 1550; VI-NEXT: ; implicit-def: $vgpr0 1551; VI-NEXT: .LBB14_3: ; %if 1552; VI-NEXT: s_and_b32 s2, s4, 0xffff 1553; VI-NEXT: s_bcnt1_i32_b32 s2, s2 1554; VI-NEXT: s_waitcnt vmcnt(0) 1555; VI-NEXT: v_mov_b32_e32 v0, s2 1556; VI-NEXT: .LBB14_4: ; %endif 1557; VI-NEXT: s_mov_b32 s3, 0xf000 1558; VI-NEXT: s_mov_b32 s2, -1 1559; VI-NEXT: s_waitcnt vmcnt(0) 1560; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1561; VI-NEXT: s_endpgm 1562; 1563; EG-LABEL: ctpop_i16_in_br: 1564; EG: ; %bb.0: ; %entry 1565; EG-NEXT: ALU 0, @20, KC0[], KC1[] 1566; EG-NEXT: TEX 0 @14 1567; EG-NEXT: ALU_PUSH_BEFORE 6, @21, KC0[], KC1[] 1568; EG-NEXT: JUMP @7 POP:1 1569; EG-NEXT: ALU 0, @28, KC0[CB0:0-32], KC1[] 1570; EG-NEXT: TEX 0 @16 1571; EG-NEXT: ALU_POP_AFTER 1, @29, KC0[], KC1[] 1572; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[] 1573; EG-NEXT: JUMP @11 POP:1 1574; EG-NEXT: TEX 0 @18 1575; EG-NEXT: ALU_POP_AFTER 0, @34, KC0[], KC1[] 1576; EG-NEXT: ALU 11, @35, KC0[], KC1[] 1577; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 1578; EG-NEXT: CF_END 1579; EG-NEXT: Fetch clause starting at 14: 1580; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3 1581; EG-NEXT: Fetch clause starting at 16: 1582; EG-NEXT: VTX_READ_16 T1.X, T1.X, 2, #1 1583; EG-NEXT: Fetch clause 
starting at 18: 1584; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 1585; EG-NEXT: ALU clause starting at 20: 1586; EG-NEXT: MOV * T0.X, 0.0, 1587; EG-NEXT: ALU clause starting at 21: 1588; EG-NEXT: AND_INT * T0.W, T1.X, literal.x, 1589; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1590; EG-NEXT: MOV T1.X, literal.x, 1591; EG-NEXT: MOV T1.W, literal.y, 1592; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, 1593; EG-NEXT: 0(0.000000e+00), 1(1.401298e-45) 1594; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1595; EG-NEXT: ALU clause starting at 28: 1596; EG-NEXT: MOV * T1.X, KC0[2].Z, 1597; EG-NEXT: ALU clause starting at 29: 1598; EG-NEXT: MOV * T1.W, literal.x, 1599; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1600; EG-NEXT: ALU clause starting at 31: 1601; EG-NEXT: MOV T0.W, KC0[2].Y, 1602; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, 1603; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1604; EG-NEXT: ALU clause starting at 34: 1605; EG-NEXT: BCNT_INT * T1.X, T0.X, 1606; EG-NEXT: ALU clause starting at 35: 1607; EG-NEXT: LSHL * T1.W, T0.W, literal.x, 1608; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1609; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1610; EG-NEXT: AND_INT * T2.W, T1.X, literal.y, 1611; EG-NEXT: 24(3.363116e-44), 65535(9.183409e-41) 1612; EG-NEXT: LSHL T1.X, PS, PV.W, 1613; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 1614; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1615; EG-NEXT: MOV T1.Y, 0.0, 1616; EG-NEXT: MOV * T1.Z, 0.0, 1617; EG-NEXT: LSHR * T0.X, T0.W, literal.x, 1618; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1619entry: 1620 %tmp0 = icmp eq i16 %cond, 0 1621 br i1 %tmp0, label %if, label %else 1622 1623if: 1624 %tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg) 1625 br label %endif 1626 1627else: 1628 %tmp3 = getelementptr i16, i16 addrspace(1)* %in, i16 1 1629 %tmp4 = load i16, i16 addrspace(1)* %tmp3 1630 br label %endif 1631 1632endif: 1633 %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else] 1634 store i16 %tmp5, i16 addrspace(1)* %out 1635 
ret void 1636} 1637