; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s

; Codegen checks for the llvm.ctpop intrinsic on i16 and on vectors of i16.
; Three llc configurations are run (amdgcn/tahiti, amdgcn/tonga with
; -flat-for-global, r600/cypress); their expected output is listed under the
; SI, VI and EG check prefixes respectively.

declare i16 @llvm.ctpop.i16(i16) nounwind readnone
declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; ctpop of an i16 kernel argument, stored to %out. The amdgcn checks expect
; the argument to be masked to 16 bits and counted with s_bcnt1_i32_b32.
define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
; SI-LABEL: s_ctpop_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s4, 0xffff
; SI-NEXT:    s_bcnt1_i32_b32 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctpop_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s4, s4, 0xffff
; VI-NEXT:    s_bcnt1_i32_b32 s4, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctpop_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    AND_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T1.W, T0.X,
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  store i16 %ctpop, i16 addrspace(1)* %out, align 4
  ret void
}

; XXX - Why 0 in register?
; ctpop of an i16 loaded from %in at index workitem.id.x, stored to %out.
; The amdgcn checks expect one v_bcnt_u32_b32 whose second source operand is 0.
define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctpop_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 11, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, 1,
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    AND_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T1.W, T0.X,
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  store i16 %ctpop, i16 addrspace(1)* %out, align 4
  ret void
}

; Sum of the ctpops of two volatile i16 loads (%in0 and %in1, both indexed by
; workitem.id.x). The amdgcn checks expect no separate add: the second
; v_bcnt_u32_b32 takes the first count as its second source operand.
define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind {
; SI-LABEL: v_ctpop_add_chain_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_add_chain_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_ushort v1, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctpop_add_chain_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 16, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:    VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    LSHL * T0.W, T0.X, 1,
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    ADD_INT * T1.X, KC0[2].W, T0.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    AND_INT T0.W, T0.X, literal.x,
; EG-NEXT:    AND_INT * T1.W, T1.X, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.Z, PS,
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT T0.W, PV.W, PV.Z,
; EG-NEXT:    LSHL * T1.W, PS, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i16, i16 addrspace(1)* %in0, i32 %tid
  %in1.gep = getelementptr i16, i16 addrspace(1)* %in1, i32 %tid
  %val0 = load volatile i16, i16 addrspace(1)* %in0.gep, align 4
  %val1 = load volatile i16, i16 addrspace(1)* %in1.gep, align 4
  %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone
  %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone
  %add = add i16 %ctpop0, %ctpop1
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a loaded i16 plus the kernel argument %sval. The amdgcn checks
; expect the scalar register holding the loaded argument dword to be used
; directly as the second source operand of v_bcnt_u32_b32.
define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind {
; SI-LABEL: v_ctpop_add_sgpr_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s8
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_add_sgpr_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctpop_add_sgpr_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:    VTX_READ_16 T1.X, T1.X, 44, #3
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    LSHL * T0.W, T0.X, 1,
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    MOV * T1.X, 0.0,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    BCNT_INT T0.W, T0.X,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.W, PV.W, T1.X,
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %add = add i16 %ctpop, %sval
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a <2 x i16> loaded per workitem. The amdgcn checks expect the two
; 16-bit halves of the loaded dword to be separated (and / lshr), counted
; individually, and repacked with a shift and an or.
define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctpop_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctpop_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    LSHR * T0.W, T0.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T1.W, PS,
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT T0.X, PV.W, PS,
; EG-NEXT:    LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  %val = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep, align 8
  %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone
  store <2 x i16> %ctpop, <2 x i16> addrspace(1)* %out, align 8
  ret void
}

; ctpop of a <4 x i16> loaded per workitem (address is %in indexed by
; workitem.id.x); the result vector is stored to %out.
define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctpop_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT:    v_or_b32_e32 v1, v1, v2
; VI-NEXT:    v_or_b32_e32 v0, v0, v3
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctpop_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 42, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    MOV T2.X, T0.X,
; EG-NEXT:    MOV * T3.X, T0.Y,
; EG-NEXT:    MOV T0.X, T4.X,
; EG-NEXT:    MOV * T0.Y, PV.X,
; EG-NEXT:    AND_INT * T0.W, PV.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV T0.X, T3.X,
; EG-NEXT:    MOV * T4.X, PV.W,
; EG-NEXT:    MOV T0.Z, PS,
; EG-NEXT:    LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, PV.Z, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV T4.X, PV.W,
; EG-NEXT:    MOV T0.Y, T5.X,
; EG-NEXT:    AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV * T5.X, PV.W,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    LSHR * T0.W, T0.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    OR_INT * T8.Y, T1.W, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MOV T5.X, PV.Y,
; EG-NEXT:    MOV * T8.X, T4.X,
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
  %val = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep, align 16
  %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone
  store <4 x i16> %ctpop, <4 x i16> addrspace(1)* %out, align 16
  ret void
}

; ctpop of an <8 x i16> loaded per workitem (address is %in indexed by
; workitem.id.x); the result vector is stored to %out. The amdgcn checks
; expect one dwordx4 load, eight v_bcnt_u32_b32 (one per 16-bit element) and
; one dwordx4 store.
define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctpop_v8i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v2
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v3, v7, v3
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v1, v5, v1
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_v8i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT:    v_or_b32_e32 v3, v3, v4
; VI-NEXT:    v_or_b32_e32 v2, v2, v5
; VI-NEXT:    v_or_b32_e32 v1, v1, v6
; VI-NEXT:    v_or_b32_e32 v0, v0, v7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctpop_v8i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 73, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T12.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV T0.Y, T4.X,
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    LSHR * T0.W, T12.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT * T0.W, PV.W,
; EG-NEXT:    LSHL T0.W, PV.W, literal.x,
; EG-NEXT:    AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV * T4.X, PV.W,
; EG-NEXT:    MOV T0.X, PV.X,
; EG-NEXT:    AND_INT * T0.W, T12.X, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV T4.X, PV.W,
; EG-NEXT:    MOV * T0.X, T5.X,
; EG-NEXT:    LSHR * T0.W, T12.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV * T5.X, PV.W,
; EG-NEXT:    MOV T0.X, PV.X,
; EG-NEXT:    AND_INT * T0.W, T12.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.Y, PS, PV.W,
; EG-NEXT:    MOV T5.X, PV.Y,
; EG-NEXT:    MOV * T0.X, T8.X,
; EG-NEXT:    LSHR * T0.W, T12.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV * T8.X, PV.W,
; EG-NEXT:    MOV T0.X, PV.X,
; EG-NEXT:    AND_INT * T0.W, T12.Z, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV T8.X, PV.W,
; EG-NEXT:    MOV * T0.X, T9.X,
; EG-NEXT:    LSHR * T0.W, T12.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV * T9.X, PV.W,
; EG-NEXT:    MOV T0.X, PV.X,
; EG-NEXT:    AND_INT * T0.W, T12.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    BCNT_INT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, PV.X, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    LSHR T12.X, KC0[2].Y, literal.x,
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MOV T9.X, PV.W,
; EG-NEXT:    MOV * T0.X, T4.X,
; EG-NEXT:    MOV * T0.Z, T8.X,
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %in, i32 %tid
  %val = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep, align 32
  %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone
  store <8 x i16> %ctpop, <8 x i16> addrspace(1)* %out, align 32
  ret void
}

define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctpop_v16i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
; SI-NEXT:    v_mov_b32_e32 v5, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v2
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v4
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v5
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v6
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v7
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT:
v_bcnt_u32_b32_e64 v0, v0, 0 744; SI-NEXT: v_bcnt_u32_b32_e64 v15, v15, 0 745; SI-NEXT: v_bcnt_u32_b32_e64 v14, v14, 0 746; SI-NEXT: v_bcnt_u32_b32_e64 v13, v13, 0 747; SI-NEXT: v_bcnt_u32_b32_e64 v12, v12, 0 748; SI-NEXT: v_bcnt_u32_b32_e64 v11, v11, 0 749; SI-NEXT: v_bcnt_u32_b32_e64 v10, v10, 0 750; SI-NEXT: v_bcnt_u32_b32_e64 v9, v9, 0 751; SI-NEXT: v_bcnt_u32_b32_e64 v8, v8, 0 752; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 753; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 754; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 755; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 756; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 757; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 758; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 759; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 760; SI-NEXT: v_or_b32_e32 v3, v15, v7 761; SI-NEXT: v_or_b32_e32 v2, v14, v6 762; SI-NEXT: v_or_b32_e32 v1, v13, v5 763; SI-NEXT: v_or_b32_e32 v0, v12, v4 764; SI-NEXT: v_or_b32_e32 v7, v11, v16 765; SI-NEXT: v_or_b32_e32 v6, v10, v17 766; SI-NEXT: v_or_b32_e32 v5, v9, v18 767; SI-NEXT: v_or_b32_e32 v4, v8, v19 768; SI-NEXT: s_waitcnt lgkmcnt(0) 769; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 770; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 771; SI-NEXT: s_endpgm 772; 773; VI-LABEL: v_ctpop_v16i16: 774; VI: ; %bb.0: 775; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 776; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 777; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 778; VI-NEXT: s_waitcnt lgkmcnt(0) 779; VI-NEXT: v_mov_b32_e32 v1, s3 780; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 781; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 782; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] 783; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4 784; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 785; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 786; VI-NEXT: s_mov_b32 s3, 0xf000 787; VI-NEXT: s_mov_b32 s2, -1 788; VI-NEXT: s_waitcnt vmcnt(1) 789; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 790; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 791; VI-NEXT: v_lshrrev_b32_e32 v10, 
16, v1 792; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 793; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 794; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 795; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 796; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 797; VI-NEXT: s_waitcnt vmcnt(0) 798; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 799; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 800; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 801; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 802; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0 803; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 804; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 805; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 806; VI-NEXT: v_and_b32_e32 v7, 0xffff, v7 807; VI-NEXT: v_and_b32_e32 v6, 0xffff, v6 808; VI-NEXT: v_and_b32_e32 v5, 0xffff, v5 809; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 810; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 811; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 812; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 813; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 814; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0 815; VI-NEXT: v_bcnt_u32_b32 v13, v13, 0 816; VI-NEXT: v_bcnt_u32_b32 v14, v14, 0 817; VI-NEXT: v_bcnt_u32_b32 v15, v15, 0 818; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 819; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 820; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 821; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 822; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 823; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 824; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 825; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 826; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 827; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 828; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 829; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 830; VI-NEXT: v_or_b32_e32 v3, v3, v8 831; VI-NEXT: v_or_b32_e32 v2, v2, v9 832; VI-NEXT: v_or_b32_e32 v1, v1, v10 833; VI-NEXT: v_or_b32_e32 v0, v0, v11 834; VI-NEXT: v_or_b32_e32 v7, v7, v12 835; VI-NEXT: v_or_b32_e32 v6, v6, v13 836; VI-NEXT: v_or_b32_e32 v5, v5, v14 837; VI-NEXT: v_or_b32_e32 v4, v4, v15 838; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 839; VI-NEXT: buffer_store_dwordx4 
v[4:7], off, s[0:3], 0 offset:16 840; VI-NEXT: s_endpgm 841; 842; EG-LABEL: v_ctpop_v16i16: 843; EG: ; %bb.0: 844; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] 845; EG-NEXT: TEX 1 @8 846; EG-NEXT: ALU 114, @16, KC0[], KC1[] 847; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[] 848; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0 849; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1 850; EG-NEXT: CF_END 851; EG-NEXT: PAD 852; EG-NEXT: Fetch clause starting at 8: 853; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1 854; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1 855; EG-NEXT: ALU clause starting at 12: 856; EG-NEXT: MOV T0.Y, T4.X, 857; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 858; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) 859; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 860; EG-NEXT: ALU clause starting at 16: 861; EG-NEXT: LSHR * T0.W, T20.X, literal.x, 862; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 863; EG-NEXT: BCNT_INT * T0.W, PV.W, 864; EG-NEXT: LSHL T0.W, PV.W, literal.x, 865; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 866; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 867; EG-NEXT: OR_INT * T0.W, PS, PV.W, 868; EG-NEXT: MOV * T4.X, PV.W, 869; EG-NEXT: MOV T0.X, PV.X, 870; EG-NEXT: AND_INT * T0.W, T20.X, literal.x, 871; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 872; EG-NEXT: BCNT_INT T0.W, PV.W, 873; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 874; EG-NEXT: -65536(nan), 0(0.000000e+00) 875; EG-NEXT: OR_INT * T0.W, PS, PV.W, 876; EG-NEXT: MOV T4.X, PV.W, 877; EG-NEXT: MOV * T0.X, T5.X, 878; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, 879; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 880; EG-NEXT: BCNT_INT T0.W, PV.W, 881; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 882; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 883; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 884; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 885; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 886; EG-NEXT: MOV * T5.X, PV.W, 887; EG-NEXT: MOV T0.X, PV.X, 888; EG-NEXT: AND_INT * T0.W, T20.Y, 
literal.x, 889; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 890; EG-NEXT: BCNT_INT T0.W, PV.W, 891; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 892; EG-NEXT: -65536(nan), 0(0.000000e+00) 893; EG-NEXT: OR_INT * T0.Y, PS, PV.W, 894; EG-NEXT: MOV T5.X, PV.Y, 895; EG-NEXT: MOV * T0.X, T8.X, 896; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, 897; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 898; EG-NEXT: BCNT_INT T0.W, PV.W, 899; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 900; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 901; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 902; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 903; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 904; EG-NEXT: MOV * T8.X, PV.W, 905; EG-NEXT: MOV T0.X, PV.X, 906; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x, 907; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 908; EG-NEXT: BCNT_INT T0.W, PV.W, 909; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 910; EG-NEXT: -65536(nan), 0(0.000000e+00) 911; EG-NEXT: OR_INT * T0.W, PS, PV.W, 912; EG-NEXT: MOV T8.X, PV.W, 913; EG-NEXT: MOV * T0.X, T9.X, 914; EG-NEXT: LSHR * T0.W, T20.W, literal.x, 915; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 916; EG-NEXT: BCNT_INT T0.W, PV.W, 917; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 918; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 919; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 920; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 921; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 922; EG-NEXT: MOV * T9.X, PV.W, 923; EG-NEXT: MOV T0.X, PV.X, 924; EG-NEXT: AND_INT * T0.W, T20.W, literal.x, 925; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 926; EG-NEXT: BCNT_INT T0.W, PV.W, 927; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 928; EG-NEXT: -65536(nan), 0(0.000000e+00) 929; EG-NEXT: OR_INT * T0.W, PS, PV.W, 930; EG-NEXT: MOV T9.X, PV.W, 931; EG-NEXT: MOV * T0.X, T12.X, 932; EG-NEXT: LSHR * T1.W, T21.X, literal.x, 933; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 934; EG-NEXT: BCNT_INT T1.W, PV.W, 935; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 936; EG-NEXT: 65535(9.183409e-41), 
0(0.000000e+00) 937; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 938; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 939; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 940; EG-NEXT: MOV * T12.X, PV.W, 941; EG-NEXT: MOV T0.X, PV.X, 942; EG-NEXT: AND_INT * T1.W, T21.X, literal.x, 943; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 944; EG-NEXT: BCNT_INT T1.W, PV.W, 945; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 946; EG-NEXT: -65536(nan), 0(0.000000e+00) 947; EG-NEXT: OR_INT * T1.W, PS, PV.W, 948; EG-NEXT: MOV T12.X, PV.W, 949; EG-NEXT: MOV * T0.X, T13.X, 950; EG-NEXT: LSHR * T1.W, T21.Y, literal.x, 951; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 952; EG-NEXT: BCNT_INT T1.W, PV.W, 953; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 954; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 955; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 956; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 957; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 958; EG-NEXT: MOV * T13.X, PV.W, 959; EG-NEXT: MOV T0.X, PV.X, 960; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x, 961; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 962; EG-NEXT: BCNT_INT T1.W, PV.W, 963; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 964; EG-NEXT: -65536(nan), 0(0.000000e+00) 965; EG-NEXT: OR_INT * T20.Y, PS, PV.W, 966; EG-NEXT: MOV T13.X, PV.Y, 967; EG-NEXT: MOV * T0.X, T16.X, 968; EG-NEXT: LSHR * T1.W, T21.Z, literal.x, 969; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 970; EG-NEXT: BCNT_INT T1.W, PV.W, 971; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 972; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 973; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 974; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 975; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 976; EG-NEXT: ALU clause starting at 131: 977; EG-NEXT: MOV * T16.X, T1.W, 978; EG-NEXT: MOV T0.X, PV.X, 979; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x, 980; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 981; EG-NEXT: BCNT_INT T1.W, PV.W, 982; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 983; EG-NEXT: -65536(nan), 0(0.000000e+00) 984; EG-NEXT: OR_INT * 
T1.W, PS, PV.W, 985; EG-NEXT: MOV T16.X, PV.W, 986; EG-NEXT: MOV * T0.X, T17.X, 987; EG-NEXT: LSHR * T1.W, T21.W, literal.x, 988; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 989; EG-NEXT: BCNT_INT T1.W, PV.W, 990; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 991; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 992; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 993; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 994; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 995; EG-NEXT: MOV * T17.X, PV.W, 996; EG-NEXT: MOV T0.X, PV.X, 997; EG-NEXT: AND_INT T1.W, T21.W, literal.x, 998; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y, 999; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) 1000; EG-NEXT: AND_INT T0.Z, PV.X, literal.x, 1001; EG-NEXT: BCNT_INT T1.W, PV.W, 1002; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 1003; EG-NEXT: -65536(nan), 16(2.242078e-44) 1004; EG-NEXT: LSHR T22.X, PS, literal.x, 1005; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W, 1006; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1007; EG-NEXT: MOV T17.X, PV.W, 1008; EG-NEXT: MOV * T0.X, T4.X, 1009; EG-NEXT: MOV * T0.Z, T8.X, 1010; EG-NEXT: MOV T20.X, T12.X, 1011; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212 1012 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1013 %in.gep = getelementptr <16 x i16>, <16 x i16> addrspace(1)* %in, i32 %tid 1014 %val = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep, align 32 1015 %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone 1016 store <16 x i16> %ctpop, <16 x i16> addrspace(1)* %out, align 32 1017 ret void 1018} 1019 1020define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { 1021; SI-LABEL: v_ctpop_i16_add_inline_constant: 1022; SI: ; %bb.0: 1023; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1024; SI-NEXT: s_mov_b32 s3, 0xf000 1025; SI-NEXT: s_mov_b32 s6, 0 1026; SI-NEXT: s_mov_b32 s7, s3 1027; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1028; SI-NEXT: v_mov_b32_e32 v1, 0 1029; SI-NEXT: s_waitcnt lgkmcnt(0) 1030; 
SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1031; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1032; SI-NEXT: s_mov_b32 s2, -1 1033; SI-NEXT: s_waitcnt vmcnt(0) 1034; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 1035; SI-NEXT: s_waitcnt lgkmcnt(0) 1036; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1037; SI-NEXT: s_endpgm 1038; 1039; VI-LABEL: v_ctpop_i16_add_inline_constant: 1040; VI: ; %bb.0: 1041; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1042; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1043; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1044; VI-NEXT: s_waitcnt lgkmcnt(0) 1045; VI-NEXT: v_mov_b32_e32 v1, s3 1046; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1047; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1048; VI-NEXT: flat_load_ushort v0, v[0:1] 1049; VI-NEXT: s_mov_b32 s3, 0xf000 1050; VI-NEXT: s_mov_b32 s2, -1 1051; VI-NEXT: s_waitcnt vmcnt(0) 1052; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 1053; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1054; VI-NEXT: s_endpgm 1055; 1056; EG-LABEL: v_ctpop_i16_add_inline_constant: 1057; EG: ; %bb.0: 1058; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1059; EG-NEXT: TEX 0 @6 1060; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1061; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1062; EG-NEXT: CF_END 1063; EG-NEXT: PAD 1064; EG-NEXT: Fetch clause starting at 6: 1065; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1066; EG-NEXT: ALU clause starting at 8: 1067; EG-NEXT: LSHL * T0.W, T0.X, 1, 1068; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1069; EG-NEXT: ALU clause starting at 10: 1070; EG-NEXT: BCNT_INT T0.W, T0.X, 1071; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1072; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1073; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1074; EG-NEXT: LSHL * T1.W, PS, literal.y, 1075; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) 1076; EG-NEXT: LSHL T0.X, PV.W, PS, 1077; EG-NEXT: LSHL * T0.W, literal.x, PS, 1078; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1079; EG-NEXT: MOV T0.Y, 0.0, 1080; EG-NEXT: MOV * T0.Z, 0.0, 1081; EG-NEXT: LSHR * 
T1.X, KC0[2].Y, literal.x, 1082; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1083 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1084 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1085 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1086 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1087 %add = add i16 %ctpop, 4 1088 store i16 %add, i16 addrspace(1)* %out, align 4 1089 ret void 1090} 1091 1092define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { 1093; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: 1094; SI: ; %bb.0: 1095; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1096; SI-NEXT: s_mov_b32 s3, 0xf000 1097; SI-NEXT: s_mov_b32 s6, 0 1098; SI-NEXT: s_mov_b32 s7, s3 1099; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1100; SI-NEXT: v_mov_b32_e32 v1, 0 1101; SI-NEXT: s_waitcnt lgkmcnt(0) 1102; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1103; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1104; SI-NEXT: s_mov_b32 s2, -1 1105; SI-NEXT: s_waitcnt vmcnt(0) 1106; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 1107; SI-NEXT: s_waitcnt lgkmcnt(0) 1108; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1109; SI-NEXT: s_endpgm 1110; 1111; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: 1112; VI: ; %bb.0: 1113; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1114; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1115; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1116; VI-NEXT: s_waitcnt lgkmcnt(0) 1117; VI-NEXT: v_mov_b32_e32 v1, s3 1118; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1119; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1120; VI-NEXT: flat_load_ushort v0, v[0:1] 1121; VI-NEXT: s_mov_b32 s3, 0xf000 1122; VI-NEXT: s_mov_b32 s2, -1 1123; VI-NEXT: s_waitcnt vmcnt(0) 1124; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 1125; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1126; VI-NEXT: s_endpgm 1127; 1128; EG-LABEL: v_ctpop_i16_add_inline_constant_inv: 1129; EG: ; %bb.0: 1130; EG-NEXT: ALU 1, @8, 
KC0[CB0:0-32], KC1[] 1131; EG-NEXT: TEX 0 @6 1132; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1133; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1134; EG-NEXT: CF_END 1135; EG-NEXT: PAD 1136; EG-NEXT: Fetch clause starting at 6: 1137; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1138; EG-NEXT: ALU clause starting at 8: 1139; EG-NEXT: LSHL * T0.W, T0.X, 1, 1140; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1141; EG-NEXT: ALU clause starting at 10: 1142; EG-NEXT: BCNT_INT T0.W, T0.X, 1143; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1144; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1145; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1146; EG-NEXT: LSHL * T1.W, PS, literal.y, 1147; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) 1148; EG-NEXT: LSHL T0.X, PV.W, PS, 1149; EG-NEXT: LSHL * T0.W, literal.x, PS, 1150; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1151; EG-NEXT: MOV T0.Y, 0.0, 1152; EG-NEXT: MOV * T0.Z, 0.0, 1153; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1154; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1155 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1156 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1157 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1158 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1159 %add = add i16 4, %ctpop 1160 store i16 %add, i16 addrspace(1)* %out, align 4 1161 ret void 1162} 1163 1164define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { 1165; SI-LABEL: v_ctpop_i16_add_literal: 1166; SI: ; %bb.0: 1167; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1168; SI-NEXT: s_mov_b32 s3, 0xf000 1169; SI-NEXT: s_mov_b32 s6, 0 1170; SI-NEXT: s_mov_b32 s7, s3 1171; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1172; SI-NEXT: v_mov_b32_e32 v1, 0 1173; SI-NEXT: s_waitcnt lgkmcnt(0) 1174; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1175; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1176; SI-NEXT: s_movk_i32 s4, 0x3e7 1177; SI-NEXT: s_mov_b32 s2, -1 1178; SI-NEXT: s_waitcnt 
vmcnt(0) 1179; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s4 1180; SI-NEXT: s_waitcnt lgkmcnt(0) 1181; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1182; SI-NEXT: s_endpgm 1183; 1184; VI-LABEL: v_ctpop_i16_add_literal: 1185; VI: ; %bb.0: 1186; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1187; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1188; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1189; VI-NEXT: s_movk_i32 s4, 0x3e7 1190; VI-NEXT: s_waitcnt lgkmcnt(0) 1191; VI-NEXT: v_mov_b32_e32 v1, s3 1192; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1193; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1194; VI-NEXT: flat_load_ushort v0, v[0:1] 1195; VI-NEXT: s_mov_b32 s3, 0xf000 1196; VI-NEXT: s_mov_b32 s2, -1 1197; VI-NEXT: s_waitcnt vmcnt(0) 1198; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1199; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1200; VI-NEXT: s_endpgm 1201; 1202; EG-LABEL: v_ctpop_i16_add_literal: 1203; EG: ; %bb.0: 1204; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1205; EG-NEXT: TEX 0 @6 1206; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1207; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1208; EG-NEXT: CF_END 1209; EG-NEXT: PAD 1210; EG-NEXT: Fetch clause starting at 6: 1211; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1212; EG-NEXT: ALU clause starting at 8: 1213; EG-NEXT: LSHL * T0.W, T0.X, 1, 1214; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1215; EG-NEXT: ALU clause starting at 10: 1216; EG-NEXT: BCNT_INT T0.W, T0.X, 1217; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1218; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1219; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1220; EG-NEXT: LSHL * T1.W, PS, literal.y, 1221; EG-NEXT: 999(1.399897e-42), 3(4.203895e-45) 1222; EG-NEXT: LSHL T0.X, PV.W, PS, 1223; EG-NEXT: LSHL * T0.W, literal.x, PS, 1224; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1225; EG-NEXT: MOV T0.Y, 0.0, 1226; EG-NEXT: MOV * T0.Z, 0.0, 1227; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1228; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1229 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1230 %in.gep = 
getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1231 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1232 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1233 %add = add i16 %ctpop, 999 1234 store i16 %add, i16 addrspace(1)* %out, align 4 1235 ret void 1236} 1237 1238define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { 1239; SI-LABEL: v_ctpop_i16_add_var: 1240; SI: ; %bb.0: 1241; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1242; SI-NEXT: s_load_dword s8, s[0:1], 0xd 1243; SI-NEXT: s_mov_b32 s3, 0xf000 1244; SI-NEXT: s_mov_b32 s6, 0 1245; SI-NEXT: s_mov_b32 s7, s3 1246; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1247; SI-NEXT: v_mov_b32_e32 v1, 0 1248; SI-NEXT: s_waitcnt lgkmcnt(0) 1249; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1250; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1251; SI-NEXT: s_mov_b32 s2, -1 1252; SI-NEXT: s_waitcnt vmcnt(0) 1253; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 1254; SI-NEXT: s_waitcnt lgkmcnt(0) 1255; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1256; SI-NEXT: s_endpgm 1257; 1258; VI-LABEL: v_ctpop_i16_add_var: 1259; VI: ; %bb.0: 1260; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1261; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1262; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1263; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1264; VI-NEXT: s_waitcnt lgkmcnt(0) 1265; VI-NEXT: v_mov_b32_e32 v1, s3 1266; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1267; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1268; VI-NEXT: flat_load_ushort v0, v[0:1] 1269; VI-NEXT: s_mov_b32 s3, 0xf000 1270; VI-NEXT: s_mov_b32 s2, -1 1271; VI-NEXT: s_waitcnt vmcnt(0) 1272; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1273; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1274; VI-NEXT: s_endpgm 1275; 1276; EG-LABEL: v_ctpop_i16_add_var: 1277; EG: ; %bb.0: 1278; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1279; EG-NEXT: TEX 0 @8 1280; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1281; EG-NEXT: TEX 0 
@10 1282; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1283; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1284; EG-NEXT: CF_END 1285; EG-NEXT: PAD 1286; EG-NEXT: Fetch clause starting at 8: 1287; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1288; EG-NEXT: Fetch clause starting at 10: 1289; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 1290; EG-NEXT: ALU clause starting at 12: 1291; EG-NEXT: LSHL * T0.W, T0.X, 1, 1292; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1293; EG-NEXT: ALU clause starting at 14: 1294; EG-NEXT: MOV * T1.X, 0.0, 1295; EG-NEXT: ALU clause starting at 15: 1296; EG-NEXT: BCNT_INT T0.W, T0.X, 1297; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1298; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1299; EG-NEXT: ADD_INT * T0.W, PV.W, T1.X, 1300; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1301; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1302; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1303; EG-NEXT: LSHL T0.X, PV.W, PS, 1304; EG-NEXT: LSHL * T0.W, literal.x, PS, 1305; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1306; EG-NEXT: MOV T0.Y, 0.0, 1307; EG-NEXT: MOV * T0.Z, 0.0, 1308; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1309; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1310 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1311 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1312 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1313 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1314 %add = add i16 %ctpop, %const 1315 store i16 %add, i16 addrspace(1)* %out, align 4 1316 ret void 1317} 1318 1319define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { 1320; SI-LABEL: v_ctpop_i16_add_var_inv: 1321; SI: ; %bb.0: 1322; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1323; SI-NEXT: s_load_dword s8, s[0:1], 0xd 1324; SI-NEXT: s_mov_b32 s3, 0xf000 1325; SI-NEXT: s_mov_b32 s6, 0 1326; SI-NEXT: s_mov_b32 s7, s3 1327; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1328; SI-NEXT: v_mov_b32_e32 v1, 0 1329; 
SI-NEXT: s_waitcnt lgkmcnt(0) 1330; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1331; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1332; SI-NEXT: s_mov_b32 s2, -1 1333; SI-NEXT: s_waitcnt vmcnt(0) 1334; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 1335; SI-NEXT: s_waitcnt lgkmcnt(0) 1336; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1337; SI-NEXT: s_endpgm 1338; 1339; VI-LABEL: v_ctpop_i16_add_var_inv: 1340; VI: ; %bb.0: 1341; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1342; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1343; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1344; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1345; VI-NEXT: s_waitcnt lgkmcnt(0) 1346; VI-NEXT: v_mov_b32_e32 v1, s3 1347; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1348; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1349; VI-NEXT: flat_load_ushort v0, v[0:1] 1350; VI-NEXT: s_mov_b32 s3, 0xf000 1351; VI-NEXT: s_mov_b32 s2, -1 1352; VI-NEXT: s_waitcnt vmcnt(0) 1353; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1354; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1355; VI-NEXT: s_endpgm 1356; 1357; EG-LABEL: v_ctpop_i16_add_var_inv: 1358; EG: ; %bb.0: 1359; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1360; EG-NEXT: TEX 0 @8 1361; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1362; EG-NEXT: TEX 0 @10 1363; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1364; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1365; EG-NEXT: CF_END 1366; EG-NEXT: PAD 1367; EG-NEXT: Fetch clause starting at 8: 1368; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1369; EG-NEXT: Fetch clause starting at 10: 1370; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 1371; EG-NEXT: ALU clause starting at 12: 1372; EG-NEXT: LSHL * T0.W, T0.X, 1, 1373; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1374; EG-NEXT: ALU clause starting at 14: 1375; EG-NEXT: MOV * T1.X, 0.0, 1376; EG-NEXT: ALU clause starting at 15: 1377; EG-NEXT: BCNT_INT T0.W, T0.X, 1378; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1379; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1380; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W, 1381; 
EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1382; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1383; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1384; EG-NEXT: LSHL T0.X, PV.W, PS, 1385; EG-NEXT: LSHL * T0.W, literal.x, PS, 1386; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1387; EG-NEXT: MOV T0.Y, 0.0, 1388; EG-NEXT: MOV * T0.Z, 0.0, 1389; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1390; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1391 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1392 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1393 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1394 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1395 %add = add i16 %const, %ctpop 1396 store i16 %add, i16 addrspace(1)* %out, align 4 1397 ret void 1398} 1399 1400define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind { 1401; SI-LABEL: v_ctpop_i16_add_vvar_inv: 1402; SI: ; %bb.0: 1403; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1404; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1405; SI-NEXT: s_mov_b32 s3, 0xf000 1406; SI-NEXT: s_mov_b32 s6, 0 1407; SI-NEXT: s_mov_b32 s7, s3 1408; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1409; SI-NEXT: v_mov_b32_e32 v1, 0 1410; SI-NEXT: s_mov_b64 s[10:11], s[6:7] 1411; SI-NEXT: s_waitcnt lgkmcnt(0) 1412; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 1413; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 1414; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1415; SI-NEXT: s_mov_b32 s2, -1 1416; SI-NEXT: s_waitcnt vmcnt(0) 1417; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 1418; SI-NEXT: s_waitcnt lgkmcnt(0) 1419; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1420; SI-NEXT: s_endpgm 1421; 1422; VI-LABEL: v_ctpop_i16_add_vvar_inv: 1423; VI: ; %bb.0: 1424; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 1425; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1426; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 1427; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 1428; VI-NEXT: s_waitcnt lgkmcnt(0) 1429; VI-NEXT: v_mov_b32_e32 v1, s3 1430; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1431; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1432; VI-NEXT: flat_load_ushort v3, v[0:1] 1433; VI-NEXT: v_mov_b32_e32 v1, s5 1434; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1435; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1436; VI-NEXT: flat_load_ushort v0, v[0:1] 1437; VI-NEXT: s_mov_b32 s3, 0xf000 1438; VI-NEXT: s_mov_b32 s2, -1 1439; VI-NEXT: s_waitcnt vmcnt(0) 1440; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0 1441; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1442; VI-NEXT: s_endpgm 1443; 1444; EG-LABEL: v_ctpop_i16_add_vvar_inv: 1445; EG: ; %bb.0: 1446; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1447; EG-NEXT: TEX 0 @8 1448; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1449; EG-NEXT: TEX 0 @10 1450; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1451; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1452; EG-NEXT: CF_END 1453; EG-NEXT: PAD 1454; EG-NEXT: Fetch clause starting at 8: 1455; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1456; EG-NEXT: Fetch clause starting at 10: 1457; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 1458; EG-NEXT: ALU clause starting at 12: 1459; EG-NEXT: LSHL * T0.W, T0.X, 1, 1460; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1461; EG-NEXT: ALU clause starting at 14: 1462; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W, 1463; EG-NEXT: ALU clause starting at 15: 1464; EG-NEXT: BCNT_INT T0.W, T0.X, 1465; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1466; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1467; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W, 1468; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1469; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1470; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1471; EG-NEXT: LSHL T0.X, PV.W, PS, 1472; EG-NEXT: LSHL * T0.W, literal.x, PS, 1473; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1474; EG-NEXT: MOV T0.Y, 0.0, 1475; EG-NEXT: MOV * T0.Z, 0.0, 1476; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1477; EG-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 1478 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1479 %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid 1480 %val = load i16, i16 addrspace(1)* %in.gep, align 4 1481 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1482 %gep = getelementptr i16, i16 addrspace(1)* %constptr, i32 %tid 1483 %const = load i16, i16 addrspace(1)* %gep, align 4 1484 %add = add i16 %const, %ctpop 1485 store i16 %add, i16 addrspace(1)* %out, align 4 1486 ret void 1487} 1488 1489; FIXME: We currently disallow SALU instructions in all branches, 1490; but there are some cases when they should be allowed. 1491define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %ctpop_arg, i16 %cond) { 1492; SI-LABEL: ctpop_i16_in_br: 1493; SI: ; %bb.0: ; %entry 1494; SI-NEXT: s_load_dword s4, s[0:1], 
0x34 1527; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1528; VI-NEXT: s_waitcnt lgkmcnt(0) 1529; VI-NEXT: s_lshr_b32 s5, s4, 16 1530; VI-NEXT: v_cmp_ne_u16_e64 s[6:7], s5, 0 1531; VI-NEXT: s_and_b64 vcc, exec, s[6:7] 1532; VI-NEXT: s_cbranch_vccz .LBB14_4 1533; VI-NEXT: ; %bb.1: ; %else 1534; VI-NEXT: s_mov_b32 s11, 0xf000 1535; VI-NEXT: s_mov_b32 s10, -1 1536; VI-NEXT: s_mov_b32 s8, s2 1537; VI-NEXT: s_mov_b32 s9, s3 1538; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 1539; VI-NEXT: s_mov_b64 s[2:3], 0 1540; VI-NEXT: s_cbranch_execnz .LBB14_3 1541; VI-NEXT: .LBB14_2: ; %if 1542; VI-NEXT: s_and_b32 s2, s4, 0xffff 1543; VI-NEXT: s_bcnt1_i32_b32 s2, s2 1544; VI-NEXT: s_waitcnt vmcnt(0) 1545; VI-NEXT: v_mov_b32_e32 v0, s2 1546; VI-NEXT: .LBB14_3: ; %endif 1547; VI-NEXT: s_mov_b32 s3, 0xf000 1548; VI-NEXT: s_mov_b32 s2, -1 1549; VI-NEXT: s_waitcnt vmcnt(0) 1550; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1551; VI-NEXT: s_endpgm 1552; VI-NEXT: .LBB14_4: 1553; VI-NEXT: s_mov_b64 s[2:3], -1 1554; VI-NEXT: ; implicit-def: $vgpr0 1555; VI-NEXT: s_branch .LBB14_2 1556; 1557; EG-LABEL: ctpop_i16_in_br: 1558; EG: ; %bb.0: ; %entry 1559; EG-NEXT: ALU 0, @20, KC0[], KC1[] 1560; EG-NEXT: TEX 0 @14 1561; EG-NEXT: ALU_PUSH_BEFORE 6, @21, KC0[], KC1[] 1562; EG-NEXT: JUMP @7 POP:1 1563; EG-NEXT: ALU 0, @28, KC0[CB0:0-32], KC1[] 1564; EG-NEXT: TEX 0 @16 1565; EG-NEXT: ALU_POP_AFTER 1, @29, KC0[], KC1[] 1566; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[] 1567; EG-NEXT: JUMP @11 POP:1 1568; EG-NEXT: TEX 0 @18 1569; EG-NEXT: ALU_POP_AFTER 0, @34, KC0[], KC1[] 1570; EG-NEXT: ALU 11, @35, KC0[], KC1[] 1571; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 1572; EG-NEXT: CF_END 1573; EG-NEXT: Fetch clause starting at 14: 1574; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3 1575; EG-NEXT: Fetch clause starting at 16: 1576; EG-NEXT: VTX_READ_16 T1.X, T1.X, 2, #1 1577; EG-NEXT: Fetch clause starting at 18: 1578; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 1579; EG-NEXT: ALU clause starting at 
20: 1580; EG-NEXT: MOV * T0.X, 0.0, 1581; EG-NEXT: ALU clause starting at 21: 1582; EG-NEXT: AND_INT * T0.W, T1.X, literal.x, 1583; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1584; EG-NEXT: MOV T1.X, literal.x, 1585; EG-NEXT: MOV T1.W, literal.y, 1586; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, 1587; EG-NEXT: 0(0.000000e+00), 1(1.401298e-45) 1588; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1589; EG-NEXT: ALU clause starting at 28: 1590; EG-NEXT: MOV * T1.X, KC0[2].Z, 1591; EG-NEXT: ALU clause starting at 29: 1592; EG-NEXT: MOV * T1.W, literal.x, 1593; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1594; EG-NEXT: ALU clause starting at 31: 1595; EG-NEXT: MOV T0.W, KC0[2].Y, 1596; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, 1597; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1598; EG-NEXT: ALU clause starting at 34: 1599; EG-NEXT: BCNT_INT * T1.X, T0.X, 1600; EG-NEXT: ALU clause starting at 35: 1601; EG-NEXT: LSHL * T1.W, T0.W, literal.x, 1602; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1603; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1604; EG-NEXT: AND_INT * T2.W, T1.X, literal.y, 1605; EG-NEXT: 24(3.363116e-44), 65535(9.183409e-41) 1606; EG-NEXT: LSHL T1.X, PS, PV.W, 1607; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 1608; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1609; EG-NEXT: MOV T1.Y, 0.0, 1610; EG-NEXT: MOV * T1.Z, 0.0, 1611; EG-NEXT: LSHR * T0.X, T0.W, literal.x, 1612; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1613entry: 1614 %tmp0 = icmp eq i16 %cond, 0 1615 br i1 %tmp0, label %if, label %else 1616 1617if: 1618 %tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg) 1619 br label %endif 1620 1621else: 1622 %tmp3 = getelementptr i16, i16 addrspace(1)* %in, i16 1 1623 %tmp4 = load i16, i16 addrspace(1)* %tmp3 1624 br label %endif 1625 1626endif: 1627 %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else] 1628 store i16 %tmp5, i16 addrspace(1)* %out 1629 ret void 1630} 1631