1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
5
; Population-count intrinsics exercised by this test: scalar i16 and the
; 2-, 4-, 8- and 16-element i16 vector forms.
6declare i16 @llvm.ctpop.i16(i16) nounwind readnone
7declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone
8declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
9declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
10declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone
11
; Work-item id in the x dimension; the v_* kernels use it to index their input pointers.
12declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
13
; s_ctpop_i16: ctpop of an i16 passed directly as a kernel argument (%val);
; the result is stored to %out.
; The SI/VI checks expect the argument to be masked to 16 bits
; (s_and_b32 ..., 0xffff) and counted with the scalar s_bcnt1_i32_b32;
; the EG checks expect BCNT_INT.
14define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
15; SI-LABEL: s_ctpop_i16:
16; SI:       ; %bb.0:
17; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
18; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
19; SI-NEXT:    s_mov_b32 s3, 0xf000
20; SI-NEXT:    s_mov_b32 s2, -1
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    s_and_b32 s4, s4, 0xffff
23; SI-NEXT:    s_bcnt1_i32_b32 s4, s4
24; SI-NEXT:    v_mov_b32_e32 v0, s4
25; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: s_ctpop_i16:
29; VI:       ; %bb.0:
30; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
31; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
32; VI-NEXT:    s_mov_b32 s3, 0xf000
33; VI-NEXT:    s_mov_b32 s2, -1
34; VI-NEXT:    s_waitcnt lgkmcnt(0)
35; VI-NEXT:    s_and_b32 s4, s4, 0xffff
36; VI-NEXT:    s_bcnt1_i32_b32 s4, s4
37; VI-NEXT:    v_mov_b32_e32 v0, s4
38; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
39; VI-NEXT:    s_endpgm
40;
41; EG-LABEL: s_ctpop_i16:
42; EG:       ; %bb.0:
43; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
44; EG-NEXT:    TEX 0 @6
45; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
46; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
47; EG-NEXT:    CF_END
48; EG-NEXT:    PAD
49; EG-NEXT:    Fetch clause starting at 6:
50; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
51; EG-NEXT:    ALU clause starting at 8:
52; EG-NEXT:     MOV * T0.X, 0.0,
53; EG-NEXT:    ALU clause starting at 9:
54; EG-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
55; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
56; EG-NEXT:     BCNT_INT T1.W, T0.X,
57; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
58; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
59; EG-NEXT:     LSHL T0.X, PV.W, PS,
60; EG-NEXT:     LSHL * T0.W, literal.x, PS,
61; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
62; EG-NEXT:     MOV T0.Y, 0.0,
63; EG-NEXT:     MOV * T0.Z, 0.0,
64; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
65; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
66  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
67  store i16 %ctpop, i16 addrspace(1)* %out, align 4
68  ret void
69}
70
71; XXX - Why 0 in register?
; v_ctpop_i16: each work item loads the i16 at %in[workitem.id.x], takes its
; ctpop and stores the result to %out.
; The SI/VI checks expect an unsigned-short load (buffer_load_ushort /
; flat_load_ushort) feeding v_bcnt_u32_b32 with a literal 0 as the second
; operand; the EG checks expect BCNT_INT.
72define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
73; SI-LABEL: v_ctpop_i16:
74; SI:       ; %bb.0:
75; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
76; SI-NEXT:    s_mov_b32 s3, 0xf000
77; SI-NEXT:    s_mov_b32 s6, 0
78; SI-NEXT:    s_mov_b32 s7, s3
79; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
80; SI-NEXT:    v_mov_b32_e32 v1, 0
81; SI-NEXT:    s_waitcnt lgkmcnt(0)
82; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
83; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
84; SI-NEXT:    s_mov_b32 s2, -1
85; SI-NEXT:    s_waitcnt vmcnt(0)
86; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
87; SI-NEXT:    s_waitcnt lgkmcnt(0)
88; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
89; SI-NEXT:    s_endpgm
90;
91; VI-LABEL: v_ctpop_i16:
92; VI:       ; %bb.0:
93; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
94; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
95; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
96; VI-NEXT:    s_waitcnt lgkmcnt(0)
97; VI-NEXT:    v_mov_b32_e32 v1, s3
98; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
99; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
100; VI-NEXT:    flat_load_ushort v0, v[0:1]
101; VI-NEXT:    s_mov_b32 s3, 0xf000
102; VI-NEXT:    s_mov_b32 s2, -1
103; VI-NEXT:    s_waitcnt vmcnt(0)
104; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
105; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
106; VI-NEXT:    s_endpgm
107;
108; EG-LABEL: v_ctpop_i16:
109; EG:       ; %bb.0:
110; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
111; EG-NEXT:    TEX 0 @6
112; EG-NEXT:    ALU 11, @10, KC0[CB0:0-32], KC1[]
113; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
114; EG-NEXT:    CF_END
115; EG-NEXT:    PAD
116; EG-NEXT:    Fetch clause starting at 6:
117; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
118; EG-NEXT:    ALU clause starting at 8:
119; EG-NEXT:     LSHL * T0.W, T0.X, 1,
120; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
121; EG-NEXT:    ALU clause starting at 10:
122; EG-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
123; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
124; EG-NEXT:     BCNT_INT T1.W, T0.X,
125; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
126; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
127; EG-NEXT:     LSHL T0.X, PV.W, PS,
128; EG-NEXT:     LSHL * T0.W, literal.x, PS,
129; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
130; EG-NEXT:     MOV T0.Y, 0.0,
131; EG-NEXT:     MOV * T0.Z, 0.0,
132; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
133; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
134  %tid = call i32 @llvm.amdgcn.workitem.id.x()
135  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  ; NOTE(review): %in.gep advances in 2-byte steps with %tid, so the align 4
  ; claimed on this load looks over-stated for odd %tid - confirm it is
  ; intentional; changing it would require regenerating the checks above.
136  %val = load i16, i16 addrspace(1)* %in.gep, align 4
137  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
138  store i16 %ctpop, i16 addrspace(1)* %out, align 4
139  ret void
140}
141
; v_ctpop_add_chain_i16: each work item volatile-loads one i16 from %in0 and one
; from %in1 (both indexed by workitem.id.x), takes the ctpop of each, adds the
; two counts and stores the sum to %out.
; The SI/VI checks expect no separate add instruction: the first count is
; passed as the second operand of the second v_bcnt_u32_b32. Both loads are
; expected with glc and an s_waitcnt vmcnt(0) after each. The EG checks expect
; two BCNT_INT results combined with ADD_INT.
142define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind {
143; SI-LABEL: v_ctpop_add_chain_i16:
144; SI:       ; %bb.0:
145; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
146; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
147; SI-NEXT:    s_mov_b32 s3, 0xf000
148; SI-NEXT:    s_mov_b32 s6, 0
149; SI-NEXT:    s_mov_b32 s7, s3
150; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
151; SI-NEXT:    v_mov_b32_e32 v1, 0
152; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
153; SI-NEXT:    s_waitcnt lgkmcnt(0)
154; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
155; SI-NEXT:    s_waitcnt vmcnt(0)
156; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 glc
157; SI-NEXT:    s_waitcnt vmcnt(0)
158; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
159; SI-NEXT:    s_mov_b32 s2, -1
160; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
161; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v2, v0
162; SI-NEXT:    s_waitcnt lgkmcnt(0)
163; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
164; SI-NEXT:    s_endpgm
165;
166; VI-LABEL: v_ctpop_add_chain_i16:
167; VI:       ; %bb.0:
168; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
169; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
170; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
171; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
172; VI-NEXT:    s_waitcnt lgkmcnt(0)
173; VI-NEXT:    v_mov_b32_e32 v1, s3
174; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
175; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
176; VI-NEXT:    v_mov_b32_e32 v3, s5
177; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
178; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
179; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
180; VI-NEXT:    s_waitcnt vmcnt(0)
181; VI-NEXT:    flat_load_ushort v1, v[2:3] glc
182; VI-NEXT:    s_waitcnt vmcnt(0)
183; VI-NEXT:    s_mov_b32 s3, 0xf000
184; VI-NEXT:    s_mov_b32 s2, -1
185; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
186; VI-NEXT:    v_bcnt_u32_b32 v0, v0, v1
187; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
188; VI-NEXT:    s_endpgm
189;
190; EG-LABEL: v_ctpop_add_chain_i16:
191; EG:       ; %bb.0:
192; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
193; EG-NEXT:    TEX 0 @8
194; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
195; EG-NEXT:    TEX 0 @10
196; EG-NEXT:    ALU 16, @15, KC0[CB0:0-32], KC1[]
197; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
198; EG-NEXT:    CF_END
199; EG-NEXT:    PAD
200; EG-NEXT:    Fetch clause starting at 8:
201; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
202; EG-NEXT:    Fetch clause starting at 10:
203; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
204; EG-NEXT:    ALU clause starting at 12:
205; EG-NEXT:     LSHL * T0.W, T0.X, 1,
206; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
207; EG-NEXT:    ALU clause starting at 14:
208; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, T0.W,
209; EG-NEXT:    ALU clause starting at 15:
210; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
211; EG-NEXT:     AND_INT * T1.W, T1.X, literal.x,
212; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
213; EG-NEXT:     BCNT_INT T0.Z, PS,
214; EG-NEXT:     BCNT_INT T0.W, PV.W,
215; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
216; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
217; EG-NEXT:     ADD_INT T0.W, PV.W, PV.Z,
218; EG-NEXT:     LSHL * T1.W, PS, literal.x,
219; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
220; EG-NEXT:     LSHL T0.X, PV.W, PS,
221; EG-NEXT:     LSHL * T0.W, literal.x, PS,
222; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
223; EG-NEXT:     MOV T0.Y, 0.0,
224; EG-NEXT:     MOV * T0.Z, 0.0,
225; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
226; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
227  %tid = call i32 @llvm.amdgcn.workitem.id.x()
228  %in0.gep = getelementptr i16, i16 addrspace(1)* %in0, i32 %tid
229  %in1.gep = getelementptr i16, i16 addrspace(1)* %in1, i32 %tid
  ; NOTE(review): both addresses advance in 2-byte steps with %tid, so the
  ; align 4 claimed on these loads looks over-stated for odd %tid - confirm it
  ; is intentional; changing it would require regenerating the checks above.
230  %val0 = load volatile i16, i16 addrspace(1)* %in0.gep, align 4
231  %val1 = load volatile i16, i16 addrspace(1)* %in1.gep, align 4
232  %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone
233  %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone
234  %add = add i16 %ctpop0, %ctpop1
235  store i16 %add, i16 addrspace(1)* %out, align 4
236  ret void
237}
238
; v_ctpop_add_sgpr_i16: each work item loads the i16 at %in[workitem.id.x],
; takes its ctpop, adds the kernel argument %sval and stores the sum to %out.
; The SI/VI checks expect no separate add instruction: the SGPR loaded from the
; kernel arguments (s8 on SI, s4 on VI) is the second operand of
; v_bcnt_u32_b32. The EG checks expect BCNT_INT followed by ADD_INT.
239define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind {
240; SI-LABEL: v_ctpop_add_sgpr_i16:
241; SI:       ; %bb.0:
242; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
243; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
244; SI-NEXT:    s_mov_b32 s3, 0xf000
245; SI-NEXT:    s_mov_b32 s6, 0
246; SI-NEXT:    s_mov_b32 s7, s3
247; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
248; SI-NEXT:    v_mov_b32_e32 v1, 0
249; SI-NEXT:    s_waitcnt lgkmcnt(0)
250; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
251; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
252; SI-NEXT:    s_mov_b32 s2, -1
253; SI-NEXT:    s_waitcnt vmcnt(0)
254; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s8
255; SI-NEXT:    s_waitcnt lgkmcnt(0)
256; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
257; SI-NEXT:    s_endpgm
258;
259; VI-LABEL: v_ctpop_add_sgpr_i16:
260; VI:       ; %bb.0:
261; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
262; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
263; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
264; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
265; VI-NEXT:    s_waitcnt lgkmcnt(0)
266; VI-NEXT:    v_mov_b32_e32 v1, s3
267; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
268; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
269; VI-NEXT:    flat_load_ushort v0, v[0:1]
270; VI-NEXT:    s_mov_b32 s3, 0xf000
271; VI-NEXT:    s_mov_b32 s2, -1
272; VI-NEXT:    s_waitcnt vmcnt(0)
273; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
274; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
275; VI-NEXT:    s_endpgm
276;
277; EG-LABEL: v_ctpop_add_sgpr_i16:
278; EG:       ; %bb.0:
279; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
280; EG-NEXT:    TEX 0 @8
281; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
282; EG-NEXT:    TEX 0 @10
283; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
284; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
285; EG-NEXT:    CF_END
286; EG-NEXT:    PAD
287; EG-NEXT:    Fetch clause starting at 8:
288; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
289; EG-NEXT:    Fetch clause starting at 10:
290; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
291; EG-NEXT:    ALU clause starting at 12:
292; EG-NEXT:     LSHL * T0.W, T0.X, 1,
293; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
294; EG-NEXT:    ALU clause starting at 14:
295; EG-NEXT:     MOV * T1.X, 0.0,
296; EG-NEXT:    ALU clause starting at 15:
297; EG-NEXT:     BCNT_INT T0.W, T0.X,
298; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
299; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
300; EG-NEXT:     ADD_INT * T0.W, PV.W, T1.X,
301; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
302; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
303; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
304; EG-NEXT:     LSHL T0.X, PV.W, PS,
305; EG-NEXT:     LSHL * T0.W, literal.x, PS,
306; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
307; EG-NEXT:     MOV T0.Y, 0.0,
308; EG-NEXT:     MOV * T0.Z, 0.0,
309; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
310; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
311  %tid = call i32 @llvm.amdgcn.workitem.id.x()
312  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  ; NOTE(review): %in.gep advances in 2-byte steps with %tid, so the align 4
  ; claimed on this load looks over-stated for odd %tid - confirm it is
  ; intentional; changing it would require regenerating the checks above.
313  %val = load i16, i16 addrspace(1)* %in.gep, align 4
314  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
315  %add = add i16 %ctpop, %sval
316  store i16 %add, i16 addrspace(1)* %out, align 4
317  ret void
318}
319
; v_ctpop_v2i16: each work item loads the <2 x i16> at %in[workitem.id.x],
; takes the per-element ctpop and stores the vector result to %out.
; The SI/VI checks expect one dword load, then the two halves split apart
; (and with 0xffff for the low half, shift right by 16 for the high half),
; counted separately with v_bcnt_u32_b32, and recombined with a shift left by
; 16 plus an or. The EG checks expect the same shape using LSHR / AND_INT /
; BCNT_INT / LSHL / OR_INT.
320define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind {
321; SI-LABEL: v_ctpop_v2i16:
322; SI:       ; %bb.0:
323; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
324; SI-NEXT:    s_mov_b32 s3, 0xf000
325; SI-NEXT:    s_mov_b32 s6, 0
326; SI-NEXT:    s_mov_b32 s7, s3
327; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
328; SI-NEXT:    v_mov_b32_e32 v1, 0
329; SI-NEXT:    s_waitcnt lgkmcnt(0)
330; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
331; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
332; SI-NEXT:    s_mov_b32 s2, -1
333; SI-NEXT:    s_waitcnt vmcnt(0)
334; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
335; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
336; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
337; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
338; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
339; SI-NEXT:    v_or_b32_e32 v0, v1, v0
340; SI-NEXT:    s_waitcnt lgkmcnt(0)
341; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
342; SI-NEXT:    s_endpgm
343;
344; VI-LABEL: v_ctpop_v2i16:
345; VI:       ; %bb.0:
346; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
347; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
348; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
349; VI-NEXT:    s_waitcnt lgkmcnt(0)
350; VI-NEXT:    v_mov_b32_e32 v1, s3
351; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
352; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
353; VI-NEXT:    flat_load_dword v0, v[0:1]
354; VI-NEXT:    s_mov_b32 s3, 0xf000
355; VI-NEXT:    s_mov_b32 s2, -1
356; VI-NEXT:    s_waitcnt vmcnt(0)
357; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
358; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
359; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
360; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
361; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
362; VI-NEXT:    v_or_b32_e32 v0, v0, v1
363; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
364; VI-NEXT:    s_endpgm
365;
366; EG-LABEL: v_ctpop_v2i16:
367; EG:       ; %bb.0:
368; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
369; EG-NEXT:    TEX 0 @6
370; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
371; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
372; EG-NEXT:    CF_END
373; EG-NEXT:    PAD
374; EG-NEXT:    Fetch clause starting at 6:
375; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
376; EG-NEXT:    ALU clause starting at 8:
377; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
378; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
379; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
380; EG-NEXT:    ALU clause starting at 11:
381; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
382; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
383; EG-NEXT:     BCNT_INT T0.W, PV.W,
384; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
385; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
386; EG-NEXT:     BCNT_INT T1.W, PS,
387; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
388; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
389; EG-NEXT:     OR_INT T0.X, PV.W, PS,
390; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
391; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
392  %tid = call i32 @llvm.amdgcn.workitem.id.x()
393  %in.gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  ; NOTE(review): %in.gep advances in 4-byte steps with %tid, so the align 8
  ; claimed on this load looks over-stated for odd %tid - confirm it is
  ; intentional; changing it would require regenerating the checks above.
394  %val = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep, align 8
395  %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone
396  store <2 x i16> %ctpop, <2 x i16> addrspace(1)* %out, align 8
397  ret void
398}
399
; v_ctpop_v4i16: each work item loads the <4 x i16> at %in[workitem.id.x],
; takes the per-element ctpop and stores the vector result to %out.
; The SI/VI checks expect one dwordx2 load, the 0xffff mask held in s4, each
; dword split into low/high 16-bit halves, four v_bcnt_u32_b32 counts, and the
; halves recombined with shift-left-by-16 plus or before a dwordx2 store.
; The EG checks expect four BCNT_INT operations with mask/shift/or packing.
400define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind {
401; SI-LABEL: v_ctpop_v4i16:
402; SI:       ; %bb.0:
403; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
404; SI-NEXT:    s_mov_b32 s3, 0xf000
405; SI-NEXT:    s_mov_b32 s6, 0
406; SI-NEXT:    s_mov_b32 s7, s3
407; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
408; SI-NEXT:    v_mov_b32_e32 v1, 0
409; SI-NEXT:    s_waitcnt lgkmcnt(0)
410; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
411; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
412; SI-NEXT:    s_mov_b32 s4, 0xffff
413; SI-NEXT:    s_mov_b32 s2, -1
414; SI-NEXT:    s_waitcnt vmcnt(0)
415; SI-NEXT:    v_and_b32_e32 v2, s4, v0
416; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
417; SI-NEXT:    v_and_b32_e32 v3, s4, v1
418; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
419; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
420; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
421; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
422; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
423; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
424; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
425; SI-NEXT:    v_or_b32_e32 v1, v3, v1
426; SI-NEXT:    v_or_b32_e32 v0, v2, v0
427; SI-NEXT:    s_waitcnt lgkmcnt(0)
428; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
429; SI-NEXT:    s_endpgm
430;
431; VI-LABEL: v_ctpop_v4i16:
432; VI:       ; %bb.0:
433; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
434; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
435; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
436; VI-NEXT:    s_mov_b32 s4, 0xffff
437; VI-NEXT:    s_waitcnt lgkmcnt(0)
438; VI-NEXT:    v_mov_b32_e32 v1, s3
439; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
440; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
441; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
442; VI-NEXT:    s_mov_b32 s3, 0xf000
443; VI-NEXT:    s_mov_b32 s2, -1
444; VI-NEXT:    s_waitcnt vmcnt(0)
445; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
446; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
447; VI-NEXT:    v_and_b32_e32 v1, s4, v1
448; VI-NEXT:    v_and_b32_e32 v0, s4, v0
449; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
450; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
451; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
452; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
453; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
454; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
455; VI-NEXT:    v_or_b32_e32 v1, v1, v2
456; VI-NEXT:    v_or_b32_e32 v0, v0, v3
457; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
458; VI-NEXT:    s_endpgm
459;
460; EG-LABEL: v_ctpop_v4i16:
461; EG:       ; %bb.0:
462; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
463; EG-NEXT:    TEX 0 @6
464; EG-NEXT:    ALU 42, @11, KC0[CB0:0-32], KC1[]
465; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
466; EG-NEXT:    CF_END
467; EG-NEXT:    PAD
468; EG-NEXT:    Fetch clause starting at 6:
469; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
470; EG-NEXT:    ALU clause starting at 8:
471; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
472; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
473; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
474; EG-NEXT:    ALU clause starting at 11:
475; EG-NEXT:     MOV T2.X, T0.X,
476; EG-NEXT:     MOV * T3.X, T0.Y,
477; EG-NEXT:     MOV T0.X, T4.X,
478; EG-NEXT:     MOV * T0.Y, PV.X,
479; EG-NEXT:     AND_INT * T0.W, PV.Y, literal.x,
480; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
481; EG-NEXT:     BCNT_INT T0.W, PV.W,
482; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
483; EG-NEXT:    -65536(nan), 0(0.000000e+00)
484; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
485; EG-NEXT:     MOV T0.X, T3.X,
486; EG-NEXT:     MOV * T4.X, PV.W,
487; EG-NEXT:     MOV T0.Z, PS,
488; EG-NEXT:     LSHR * T0.W, T0.Y, literal.x,
489; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
490; EG-NEXT:     BCNT_INT T0.W, PV.W,
491; EG-NEXT:     AND_INT * T1.W, PV.Z, literal.x,
492; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
493; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
494; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
495; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
496; EG-NEXT:     MOV T4.X, PV.W,
497; EG-NEXT:     MOV T0.Y, T5.X,
498; EG-NEXT:     AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
499; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
500; EG-NEXT:     BCNT_INT T0.W, PV.W,
501; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
502; EG-NEXT:    -65536(nan), 0(0.000000e+00)
503; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
504; EG-NEXT:     MOV * T5.X, PV.W,
505; EG-NEXT:     MOV T0.Y, PV.X,
506; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
507; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
508; EG-NEXT:     BCNT_INT T0.W, PV.W,
509; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
510; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
511; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
512; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
513; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
514; EG-NEXT:     OR_INT * T8.Y, T1.W, PV.W,
515; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
516; EG-NEXT:     MOV T5.X, PV.Y,
517; EG-NEXT:     MOV * T8.X, T4.X,
518  %tid = call i32 @llvm.amdgcn.workitem.id.x()
519  %in.gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
  ; NOTE(review): %in.gep advances in 8-byte steps with %tid, so the align 16
  ; claimed on this load looks over-stated for odd %tid - confirm it is
  ; intentional; changing it would require regenerating the checks above.
520  %val = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep, align 16
521  %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone
522  store <4 x i16> %ctpop, <4 x i16> addrspace(1)* %out, align 16
523  ret void
524}
525
; v_ctpop_v8i16: each work item loads the <8 x i16> at %in[workitem.id.x],
; takes the per-element ctpop and stores the vector result to %out.
; The SI/VI checks expect one dwordx4 load, the 0xffff mask held in s4, each of
; the four dwords split into low/high 16-bit halves, eight v_bcnt_u32_b32
; counts, and the halves recombined with shift-left-by-16 plus or before a
; dwordx4 store. The EG checks expect eight BCNT_INT operations with
; mask/shift/or packing.
526define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind {
527; SI-LABEL: v_ctpop_v8i16:
528; SI:       ; %bb.0:
529; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
530; SI-NEXT:    s_mov_b32 s3, 0xf000
531; SI-NEXT:    s_mov_b32 s6, 0
532; SI-NEXT:    s_mov_b32 s7, s3
533; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
534; SI-NEXT:    v_mov_b32_e32 v1, 0
535; SI-NEXT:    s_waitcnt lgkmcnt(0)
536; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
537; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
538; SI-NEXT:    s_mov_b32 s4, 0xffff
539; SI-NEXT:    s_mov_b32 s2, -1
540; SI-NEXT:    s_waitcnt vmcnt(0)
541; SI-NEXT:    v_and_b32_e32 v4, s4, v0
542; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
543; SI-NEXT:    v_and_b32_e32 v5, s4, v1
544; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
545; SI-NEXT:    v_and_b32_e32 v6, s4, v2
546; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
547; SI-NEXT:    v_and_b32_e32 v7, s4, v3
548; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
549; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
550; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
551; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
552; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
553; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
554; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
555; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
556; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
557; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
558; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
559; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
560; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
561; SI-NEXT:    v_or_b32_e32 v3, v7, v3
562; SI-NEXT:    v_or_b32_e32 v2, v6, v2
563; SI-NEXT:    v_or_b32_e32 v1, v5, v1
564; SI-NEXT:    v_or_b32_e32 v0, v4, v0
565; SI-NEXT:    s_waitcnt lgkmcnt(0)
566; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
567; SI-NEXT:    s_endpgm
568;
569; VI-LABEL: v_ctpop_v8i16:
570; VI:       ; %bb.0:
571; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
572; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
573; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
574; VI-NEXT:    s_mov_b32 s4, 0xffff
575; VI-NEXT:    s_waitcnt lgkmcnt(0)
576; VI-NEXT:    v_mov_b32_e32 v1, s3
577; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
578; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
579; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
580; VI-NEXT:    s_mov_b32 s3, 0xf000
581; VI-NEXT:    s_mov_b32 s2, -1
582; VI-NEXT:    s_waitcnt vmcnt(0)
583; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
584; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
585; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
586; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
587; VI-NEXT:    v_and_b32_e32 v3, s4, v3
588; VI-NEXT:    v_and_b32_e32 v2, s4, v2
589; VI-NEXT:    v_and_b32_e32 v1, s4, v1
590; VI-NEXT:    v_and_b32_e32 v0, s4, v0
591; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
592; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
593; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
594; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
595; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
596; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
597; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
598; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
599; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
600; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
601; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
602; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
603; VI-NEXT:    v_or_b32_e32 v3, v3, v4
604; VI-NEXT:    v_or_b32_e32 v2, v2, v5
605; VI-NEXT:    v_or_b32_e32 v1, v1, v6
606; VI-NEXT:    v_or_b32_e32 v0, v0, v7
607; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
608; VI-NEXT:    s_endpgm
609;
610; EG-LABEL: v_ctpop_v8i16:
611; EG:       ; %bb.0:
612; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
613; EG-NEXT:    TEX 0 @6
614; EG-NEXT:    ALU 73, @12, KC0[CB0:0-32], KC1[]
615; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
616; EG-NEXT:    CF_END
617; EG-NEXT:    PAD
618; EG-NEXT:    Fetch clause starting at 6:
619; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 0, #1
620; EG-NEXT:    ALU clause starting at 8:
621; EG-NEXT:     MOV T0.Y, T4.X,
622; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
623; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
624; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
625; EG-NEXT:    ALU clause starting at 12:
626; EG-NEXT:     LSHR * T0.W, T12.X, literal.x,
627; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
628; EG-NEXT:     BCNT_INT * T0.W, PV.W,
629; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
630; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
631; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
632; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
633; EG-NEXT:     MOV * T4.X, PV.W,
634; EG-NEXT:     MOV T0.X, PV.X,
635; EG-NEXT:     AND_INT * T0.W, T12.X, literal.x,
636; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
637; EG-NEXT:     BCNT_INT T0.W, PV.W,
638; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
639; EG-NEXT:    -65536(nan), 0(0.000000e+00)
640; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
641; EG-NEXT:     MOV T4.X, PV.W,
642; EG-NEXT:     MOV * T0.X, T5.X,
643; EG-NEXT:     LSHR * T0.W, T12.Y, literal.x,
644; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
645; EG-NEXT:     BCNT_INT T0.W, PV.W,
646; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
647; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
648; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
649; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
650; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
651; EG-NEXT:     MOV * T5.X, PV.W,
652; EG-NEXT:     MOV T0.X, PV.X,
653; EG-NEXT:     AND_INT * T0.W, T12.Y, literal.x,
654; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
655; EG-NEXT:     BCNT_INT T0.W, PV.W,
656; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
657; EG-NEXT:    -65536(nan), 0(0.000000e+00)
658; EG-NEXT:     OR_INT * T0.Y, PS, PV.W,
659; EG-NEXT:     MOV T5.X, PV.Y,
660; EG-NEXT:     MOV * T0.X, T8.X,
661; EG-NEXT:     LSHR * T0.W, T12.Z, literal.x,
662; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
663; EG-NEXT:     BCNT_INT T0.W, PV.W,
664; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
665; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
666; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
667; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
668; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
669; EG-NEXT:     MOV * T8.X, PV.W,
670; EG-NEXT:     MOV T0.X, PV.X,
671; EG-NEXT:     AND_INT * T0.W, T12.Z, literal.x,
672; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
673; EG-NEXT:     BCNT_INT T0.W, PV.W,
674; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
675; EG-NEXT:    -65536(nan), 0(0.000000e+00)
676; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
677; EG-NEXT:     MOV T8.X, PV.W,
678; EG-NEXT:     MOV * T0.X, T9.X,
679; EG-NEXT:     LSHR * T0.W, T12.W, literal.x,
680; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
681; EG-NEXT:     BCNT_INT T0.W, PV.W,
682; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
683; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
684; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
685; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
686; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
687; EG-NEXT:     MOV * T9.X, PV.W,
688; EG-NEXT:     MOV T0.X, PV.X,
689; EG-NEXT:     AND_INT * T0.W, T12.W, literal.x,
690; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
691; EG-NEXT:     BCNT_INT T0.W, PV.W,
692; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
693; EG-NEXT:    -65536(nan), 0(0.000000e+00)
694; EG-NEXT:     LSHR T12.X, KC0[2].Y, literal.x,
695; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
696; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
697; EG-NEXT:     MOV T9.X, PV.W,
698; EG-NEXT:     MOV * T0.X, T4.X,
699; EG-NEXT:     MOV * T0.Z, T8.X,
700  %tid = call i32 @llvm.amdgcn.workitem.id.x()
701  %in.gep = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %in, i32 %tid
  ; NOTE(review): %in.gep advances in 16-byte steps with %tid, so the align 32
  ; claimed on this load looks over-stated for odd %tid - confirm it is
  ; intentional; changing it would require regenerating the checks above.
702  %val = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep, align 32
703  %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone
704  store <8 x i16> %ctpop, <8 x i16> addrspace(1)* %out, align 32
705  ret void
706}
707
; <16 x i16> population count with a per-work-item input address: the vector is
; loaded from %in indexed by the workitem id, passed to llvm.ctpop.v16i16, and
; the result is stored to %out (the store address is not indexed).
; In the expected GCN output each loaded dword is split into its low half
; (and with 0xffff) and its high half (shift right by 16), each half is counted
; with v_bcnt_u32_b32, and the two counts are repacked with a shift and an or.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing them by hand.
708define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind {
709; SI-LABEL: v_ctpop_v16i16:
710; SI:       ; %bb.0:
711; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
712; SI-NEXT:    s_mov_b32 s3, 0xf000
713; SI-NEXT:    s_mov_b32 s6, 0
714; SI-NEXT:    s_mov_b32 s7, s3
715; SI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
716; SI-NEXT:    v_mov_b32_e32 v5, 0
717; SI-NEXT:    s_waitcnt lgkmcnt(0)
718; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
719; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
720; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
721; SI-NEXT:    s_mov_b32 s4, 0xffff
722; SI-NEXT:    s_mov_b32 s2, -1
723; SI-NEXT:    s_waitcnt vmcnt(1)
724; SI-NEXT:    v_and_b32_e32 v8, s4, v0
725; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
726; SI-NEXT:    v_and_b32_e32 v9, s4, v1
727; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
728; SI-NEXT:    v_and_b32_e32 v10, s4, v2
729; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
730; SI-NEXT:    v_and_b32_e32 v11, s4, v3
731; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
732; SI-NEXT:    s_waitcnt vmcnt(0)
733; SI-NEXT:    v_and_b32_e32 v12, s4, v4
734; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
735; SI-NEXT:    v_and_b32_e32 v13, s4, v5
736; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
737; SI-NEXT:    v_and_b32_e32 v14, s4, v6
738; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
739; SI-NEXT:    v_and_b32_e32 v15, s4, v7
740; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
741; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
742; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
743; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
744; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
745; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
746; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
747; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
748; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
749; SI-NEXT:    v_bcnt_u32_b32_e64 v15, v15, 0
750; SI-NEXT:    v_bcnt_u32_b32_e64 v14, v14, 0
751; SI-NEXT:    v_bcnt_u32_b32_e64 v13, v13, 0
752; SI-NEXT:    v_bcnt_u32_b32_e64 v12, v12, 0
753; SI-NEXT:    v_bcnt_u32_b32_e64 v11, v11, 0
754; SI-NEXT:    v_bcnt_u32_b32_e64 v10, v10, 0
755; SI-NEXT:    v_bcnt_u32_b32_e64 v9, v9, 0
756; SI-NEXT:    v_bcnt_u32_b32_e64 v8, v8, 0
757; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
758; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
759; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
760; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
761; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
762; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
763; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
764; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
765; SI-NEXT:    v_or_b32_e32 v3, v15, v7
766; SI-NEXT:    v_or_b32_e32 v2, v14, v6
767; SI-NEXT:    v_or_b32_e32 v1, v13, v5
768; SI-NEXT:    v_or_b32_e32 v0, v12, v4
769; SI-NEXT:    v_or_b32_e32 v7, v11, v16
770; SI-NEXT:    v_or_b32_e32 v6, v10, v17
771; SI-NEXT:    v_or_b32_e32 v5, v9, v18
772; SI-NEXT:    v_or_b32_e32 v4, v8, v19
773; SI-NEXT:    s_waitcnt lgkmcnt(0)
774; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
775; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
776; SI-NEXT:    s_endpgm
777;
778; VI-LABEL: v_ctpop_v16i16:
779; VI:       ; %bb.0:
780; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
781; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
782; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
783; VI-NEXT:    s_mov_b32 s4, 0xffff
784; VI-NEXT:    s_waitcnt lgkmcnt(0)
785; VI-NEXT:    v_mov_b32_e32 v1, s3
786; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
787; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
788; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
789; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
790; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
791; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
792; VI-NEXT:    s_mov_b32 s3, 0xf000
793; VI-NEXT:    s_mov_b32 s2, -1
794; VI-NEXT:    s_waitcnt vmcnt(1)
795; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
796; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
797; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
798; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
799; VI-NEXT:    v_and_b32_e32 v3, s4, v3
800; VI-NEXT:    v_and_b32_e32 v2, s4, v2
801; VI-NEXT:    v_and_b32_e32 v1, s4, v1
802; VI-NEXT:    v_and_b32_e32 v0, s4, v0
803; VI-NEXT:    s_waitcnt vmcnt(0)
804; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
805; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
806; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
807; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
808; VI-NEXT:    v_bcnt_u32_b32 v8, v8, 0
809; VI-NEXT:    v_bcnt_u32_b32 v9, v9, 0
810; VI-NEXT:    v_bcnt_u32_b32 v10, v10, 0
811; VI-NEXT:    v_bcnt_u32_b32 v11, v11, 0
812; VI-NEXT:    v_and_b32_e32 v7, s4, v7
813; VI-NEXT:    v_and_b32_e32 v6, s4, v6
814; VI-NEXT:    v_and_b32_e32 v5, s4, v5
815; VI-NEXT:    v_and_b32_e32 v4, s4, v4
816; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
817; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
818; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
819; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
820; VI-NEXT:    v_bcnt_u32_b32 v12, v12, 0
821; VI-NEXT:    v_bcnt_u32_b32 v13, v13, 0
822; VI-NEXT:    v_bcnt_u32_b32 v14, v14, 0
823; VI-NEXT:    v_bcnt_u32_b32 v15, v15, 0
824; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
825; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
826; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
827; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
828; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
829; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
830; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
831; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
832; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
833; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
834; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
835; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
836; VI-NEXT:    v_or_b32_e32 v3, v3, v8
837; VI-NEXT:    v_or_b32_e32 v2, v2, v9
838; VI-NEXT:    v_or_b32_e32 v1, v1, v10
839; VI-NEXT:    v_or_b32_e32 v0, v0, v11
840; VI-NEXT:    v_or_b32_e32 v7, v7, v12
841; VI-NEXT:    v_or_b32_e32 v6, v6, v13
842; VI-NEXT:    v_or_b32_e32 v5, v5, v14
843; VI-NEXT:    v_or_b32_e32 v4, v4, v15
844; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
845; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
846; VI-NEXT:    s_endpgm
847;
848; EG-LABEL: v_ctpop_v16i16:
849; EG:       ; %bb.0:
850; EG-NEXT:    ALU 3, @12, KC0[CB0:0-32], KC1[]
851; EG-NEXT:    TEX 1 @8
852; EG-NEXT:    ALU 114, @16, KC0[], KC1[]
853; EG-NEXT:    ALU 34, @131, KC0[CB0:0-32], KC1[]
854; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
855; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
856; EG-NEXT:    CF_END
857; EG-NEXT:    PAD
858; EG-NEXT:    Fetch clause starting at 8:
859; EG-NEXT:     VTX_READ_128 T20.XYZW, T0.X, 16, #1
860; EG-NEXT:     VTX_READ_128 T21.XYZW, T0.X, 0, #1
861; EG-NEXT:    ALU clause starting at 12:
862; EG-NEXT:     MOV T0.Y, T4.X,
863; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
864; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
865; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
866; EG-NEXT:    ALU clause starting at 16:
867; EG-NEXT:     LSHR * T0.W, T20.X, literal.x,
868; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
869; EG-NEXT:     BCNT_INT * T0.W, PV.W,
870; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
871; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
872; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
873; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
874; EG-NEXT:     MOV * T4.X, PV.W,
875; EG-NEXT:     MOV T0.X, PV.X,
876; EG-NEXT:     AND_INT * T0.W, T20.X, literal.x,
877; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
878; EG-NEXT:     BCNT_INT T0.W, PV.W,
879; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
880; EG-NEXT:    -65536(nan), 0(0.000000e+00)
881; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
882; EG-NEXT:     MOV T4.X, PV.W,
883; EG-NEXT:     MOV * T0.X, T5.X,
884; EG-NEXT:     LSHR * T0.W, T20.Y, literal.x,
885; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
886; EG-NEXT:     BCNT_INT T0.W, PV.W,
887; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
888; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
889; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
890; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
891; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
892; EG-NEXT:     MOV * T5.X, PV.W,
893; EG-NEXT:     MOV T0.X, PV.X,
894; EG-NEXT:     AND_INT * T0.W, T20.Y, literal.x,
895; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
896; EG-NEXT:     BCNT_INT T0.W, PV.W,
897; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
898; EG-NEXT:    -65536(nan), 0(0.000000e+00)
899; EG-NEXT:     OR_INT * T0.Y, PS, PV.W,
900; EG-NEXT:     MOV T5.X, PV.Y,
901; EG-NEXT:     MOV * T0.X, T8.X,
902; EG-NEXT:     LSHR * T0.W, T20.Z, literal.x,
903; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
904; EG-NEXT:     BCNT_INT T0.W, PV.W,
905; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
906; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
907; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
908; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
909; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
910; EG-NEXT:     MOV * T8.X, PV.W,
911; EG-NEXT:     MOV T0.X, PV.X,
912; EG-NEXT:     AND_INT * T0.W, T20.Z, literal.x,
913; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
914; EG-NEXT:     BCNT_INT T0.W, PV.W,
915; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
916; EG-NEXT:    -65536(nan), 0(0.000000e+00)
917; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
918; EG-NEXT:     MOV T8.X, PV.W,
919; EG-NEXT:     MOV * T0.X, T9.X,
920; EG-NEXT:     LSHR * T0.W, T20.W, literal.x,
921; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
922; EG-NEXT:     BCNT_INT T0.W, PV.W,
923; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
924; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
925; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
926; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
927; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
928; EG-NEXT:     MOV * T9.X, PV.W,
929; EG-NEXT:     MOV T0.X, PV.X,
930; EG-NEXT:     AND_INT * T0.W, T20.W, literal.x,
931; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
932; EG-NEXT:     BCNT_INT T0.W, PV.W,
933; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
934; EG-NEXT:    -65536(nan), 0(0.000000e+00)
935; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
936; EG-NEXT:     MOV T9.X, PV.W,
937; EG-NEXT:     MOV * T0.X, T12.X,
938; EG-NEXT:     LSHR * T1.W, T21.X, literal.x,
939; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
940; EG-NEXT:     BCNT_INT T1.W, PV.W,
941; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
942; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
943; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
944; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
945; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
946; EG-NEXT:     MOV * T12.X, PV.W,
947; EG-NEXT:     MOV T0.X, PV.X,
948; EG-NEXT:     AND_INT * T1.W, T21.X, literal.x,
949; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
950; EG-NEXT:     BCNT_INT T1.W, PV.W,
951; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
952; EG-NEXT:    -65536(nan), 0(0.000000e+00)
953; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
954; EG-NEXT:     MOV T12.X, PV.W,
955; EG-NEXT:     MOV * T0.X, T13.X,
956; EG-NEXT:     LSHR * T1.W, T21.Y, literal.x,
957; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
958; EG-NEXT:     BCNT_INT T1.W, PV.W,
959; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
960; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
961; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
962; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
963; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
964; EG-NEXT:     MOV * T13.X, PV.W,
965; EG-NEXT:     MOV T0.X, PV.X,
966; EG-NEXT:     AND_INT * T1.W, T21.Y, literal.x,
967; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
968; EG-NEXT:     BCNT_INT T1.W, PV.W,
969; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
970; EG-NEXT:    -65536(nan), 0(0.000000e+00)
971; EG-NEXT:     OR_INT * T20.Y, PS, PV.W,
972; EG-NEXT:     MOV T13.X, PV.Y,
973; EG-NEXT:     MOV * T0.X, T16.X,
974; EG-NEXT:     LSHR * T1.W, T21.Z, literal.x,
975; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
976; EG-NEXT:     BCNT_INT T1.W, PV.W,
977; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
978; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
979; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
980; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
981; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
982; EG-NEXT:    ALU clause starting at 131:
983; EG-NEXT:     MOV * T16.X, T1.W,
984; EG-NEXT:     MOV T0.X, PV.X,
985; EG-NEXT:     AND_INT * T1.W, T21.Z, literal.x,
986; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
987; EG-NEXT:     BCNT_INT T1.W, PV.W,
988; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
989; EG-NEXT:    -65536(nan), 0(0.000000e+00)
990; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
991; EG-NEXT:     MOV T16.X, PV.W,
992; EG-NEXT:     MOV * T0.X, T17.X,
993; EG-NEXT:     LSHR * T1.W, T21.W, literal.x,
994; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
995; EG-NEXT:     BCNT_INT T1.W, PV.W,
996; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
997; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
998; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
999; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1000; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
1001; EG-NEXT:     MOV * T17.X, PV.W,
1002; EG-NEXT:     MOV T0.X, PV.X,
1003; EG-NEXT:     AND_INT T1.W, T21.W, literal.x,
1004; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.y,
1005; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
1006; EG-NEXT:     AND_INT T0.Z, PV.X, literal.x,
1007; EG-NEXT:     BCNT_INT T1.W, PV.W,
1008; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
1009; EG-NEXT:    -65536(nan), 16(2.242078e-44)
1010; EG-NEXT:     LSHR T22.X, PS, literal.x,
1011; EG-NEXT:     OR_INT * T20.W, PV.Z, PV.W,
1012; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1013; EG-NEXT:     MOV T17.X, PV.W,
1014; EG-NEXT:     MOV * T0.X, T4.X,
1015; EG-NEXT:     MOV * T0.Z, T8.X,
1016; EG-NEXT:     MOV T20.X, T12.X,
1017; EG-NEXT:     MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
1018  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1019  %in.gep = getelementptr <16 x i16>, <16 x i16> addrspace(1)* %in, i32 %tid
1020  %val = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep, align 32
1021  %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone
1022  store <16 x i16> %ctpop, <16 x i16> addrspace(1)* %out, align 32
1023  ret void
1024}
1025
; ctpop of an i16 loaded from %in (indexed by the workitem id) plus the
; constant 4, stored to %out. In the expected GCN output there is no separate
; add instruction: the 4 appears as the last operand of v_bcnt_u32_b32. The
; expected r600 output keeps an explicit ADD_INT with the literal 4.
; NOTE(review): the load and store are marked align 4 although the GEP steps
; by i16 elements - presumably intentional for this test; confirm before
; changing, since the autogenerated assertions may depend on it.
1026define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
1027; SI-LABEL: v_ctpop_i16_add_inline_constant:
1028; SI:       ; %bb.0:
1029; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1030; SI-NEXT:    s_mov_b32 s3, 0xf000
1031; SI-NEXT:    s_mov_b32 s6, 0
1032; SI-NEXT:    s_mov_b32 s7, s3
1033; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1034; SI-NEXT:    v_mov_b32_e32 v1, 0
1035; SI-NEXT:    s_waitcnt lgkmcnt(0)
1036; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1037; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1038; SI-NEXT:    s_mov_b32 s2, -1
1039; SI-NEXT:    s_waitcnt vmcnt(0)
1040; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 4
1041; SI-NEXT:    s_waitcnt lgkmcnt(0)
1042; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1043; SI-NEXT:    s_endpgm
1044;
1045; VI-LABEL: v_ctpop_i16_add_inline_constant:
1046; VI:       ; %bb.0:
1047; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1048; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1049; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1050; VI-NEXT:    s_waitcnt lgkmcnt(0)
1051; VI-NEXT:    v_mov_b32_e32 v1, s3
1052; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1053; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1054; VI-NEXT:    flat_load_ushort v0, v[0:1]
1055; VI-NEXT:    s_mov_b32 s3, 0xf000
1056; VI-NEXT:    s_mov_b32 s2, -1
1057; VI-NEXT:    s_waitcnt vmcnt(0)
1058; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
1059; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1060; VI-NEXT:    s_endpgm
1061;
1062; EG-LABEL: v_ctpop_i16_add_inline_constant:
1063; EG:       ; %bb.0:
1064; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1065; EG-NEXT:    TEX 0 @6
1066; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1067; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1068; EG-NEXT:    CF_END
1069; EG-NEXT:    PAD
1070; EG-NEXT:    Fetch clause starting at 6:
1071; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1072; EG-NEXT:    ALU clause starting at 8:
1073; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1074; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1075; EG-NEXT:    ALU clause starting at 10:
1076; EG-NEXT:     BCNT_INT T0.W, T0.X,
1077; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1078; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1079; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1080; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1081; EG-NEXT:    4(5.605194e-45), 3(4.203895e-45)
1082; EG-NEXT:     LSHL T0.X, PV.W, PS,
1083; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1084; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1085; EG-NEXT:     MOV T0.Y, 0.0,
1086; EG-NEXT:     MOV * T0.Z, 0.0,
1087; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1088; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1089  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1090  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1091  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1092  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  ; The constant is the right-hand operand of the add in this variant.
1093  %add = add i16 %ctpop, 4
1094  store i16 %add, i16 addrspace(1)* %out, align 4
1095  ret void
1096}
1097
; Commuted form of the ctpop-plus-4 test: the add is written as
; "add i16 4, %ctpop" (constant first). The expected GCN output still has no
; separate add instruction and shows the 4 as the last operand of
; v_bcnt_u32_b32; the expected r600 output keeps an explicit ADD_INT with the
; literal 4.
; NOTE(review): the load and store are marked align 4 although the GEP steps
; by i16 elements - presumably intentional for this test; confirm before
; changing, since the autogenerated assertions may depend on it.
1098define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
1099; SI-LABEL: v_ctpop_i16_add_inline_constant_inv:
1100; SI:       ; %bb.0:
1101; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1102; SI-NEXT:    s_mov_b32 s3, 0xf000
1103; SI-NEXT:    s_mov_b32 s6, 0
1104; SI-NEXT:    s_mov_b32 s7, s3
1105; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1106; SI-NEXT:    v_mov_b32_e32 v1, 0
1107; SI-NEXT:    s_waitcnt lgkmcnt(0)
1108; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1109; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1110; SI-NEXT:    s_mov_b32 s2, -1
1111; SI-NEXT:    s_waitcnt vmcnt(0)
1112; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 4
1113; SI-NEXT:    s_waitcnt lgkmcnt(0)
1114; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1115; SI-NEXT:    s_endpgm
1116;
1117; VI-LABEL: v_ctpop_i16_add_inline_constant_inv:
1118; VI:       ; %bb.0:
1119; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1120; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1121; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1122; VI-NEXT:    s_waitcnt lgkmcnt(0)
1123; VI-NEXT:    v_mov_b32_e32 v1, s3
1124; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1125; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1126; VI-NEXT:    flat_load_ushort v0, v[0:1]
1127; VI-NEXT:    s_mov_b32 s3, 0xf000
1128; VI-NEXT:    s_mov_b32 s2, -1
1129; VI-NEXT:    s_waitcnt vmcnt(0)
1130; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
1131; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1132; VI-NEXT:    s_endpgm
1133;
1134; EG-LABEL: v_ctpop_i16_add_inline_constant_inv:
1135; EG:       ; %bb.0:
1136; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1137; EG-NEXT:    TEX 0 @6
1138; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1139; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1140; EG-NEXT:    CF_END
1141; EG-NEXT:    PAD
1142; EG-NEXT:    Fetch clause starting at 6:
1143; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1144; EG-NEXT:    ALU clause starting at 8:
1145; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1146; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1147; EG-NEXT:    ALU clause starting at 10:
1148; EG-NEXT:     BCNT_INT T0.W, T0.X,
1149; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1150; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1151; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1152; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1153; EG-NEXT:    4(5.605194e-45), 3(4.203895e-45)
1154; EG-NEXT:     LSHL T0.X, PV.W, PS,
1155; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1156; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1157; EG-NEXT:     MOV T0.Y, 0.0,
1158; EG-NEXT:     MOV * T0.Z, 0.0,
1159; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1160; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1161  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1162  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1163  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1164  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  ; The constant is the left-hand operand of the add in this variant.
1165  %add = add i16 4, %ctpop
1166  store i16 %add, i16 addrspace(1)* %out, align 4
1167  ret void
1168}
1169
; ctpop of an i16 loaded from %in (indexed by the workitem id) plus the
; constant 999, stored to %out. In the expected GCN output the constant is
; first placed in an SGPR (s_movk_i32 s4, 0x3e7) and that SGPR is the last
; operand of v_bcnt_u32_b32; no separate add instruction is expected. The
; expected r600 output keeps an explicit ADD_INT with the literal 999.
; NOTE(review): the load and store are marked align 4 although the GEP steps
; by i16 elements - presumably intentional for this test; confirm before
; changing, since the autogenerated assertions may depend on it.
1170define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
1171; SI-LABEL: v_ctpop_i16_add_literal:
1172; SI:       ; %bb.0:
1173; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1174; SI-NEXT:    s_mov_b32 s3, 0xf000
1175; SI-NEXT:    s_mov_b32 s6, 0
1176; SI-NEXT:    s_mov_b32 s7, s3
1177; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1178; SI-NEXT:    v_mov_b32_e32 v1, 0
1179; SI-NEXT:    s_waitcnt lgkmcnt(0)
1180; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1181; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1182; SI-NEXT:    s_movk_i32 s4, 0x3e7
1183; SI-NEXT:    s_mov_b32 s2, -1
1184; SI-NEXT:    s_waitcnt vmcnt(0)
1185; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s4
1186; SI-NEXT:    s_waitcnt lgkmcnt(0)
1187; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1188; SI-NEXT:    s_endpgm
1189;
1190; VI-LABEL: v_ctpop_i16_add_literal:
1191; VI:       ; %bb.0:
1192; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1193; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1194; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1195; VI-NEXT:    s_movk_i32 s4, 0x3e7
1196; VI-NEXT:    s_waitcnt lgkmcnt(0)
1197; VI-NEXT:    v_mov_b32_e32 v1, s3
1198; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1199; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1200; VI-NEXT:    flat_load_ushort v0, v[0:1]
1201; VI-NEXT:    s_mov_b32 s3, 0xf000
1202; VI-NEXT:    s_mov_b32 s2, -1
1203; VI-NEXT:    s_waitcnt vmcnt(0)
1204; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1205; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1206; VI-NEXT:    s_endpgm
1207;
1208; EG-LABEL: v_ctpop_i16_add_literal:
1209; EG:       ; %bb.0:
1210; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1211; EG-NEXT:    TEX 0 @6
1212; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1213; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1214; EG-NEXT:    CF_END
1215; EG-NEXT:    PAD
1216; EG-NEXT:    Fetch clause starting at 6:
1217; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1218; EG-NEXT:    ALU clause starting at 8:
1219; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1220; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1221; EG-NEXT:    ALU clause starting at 10:
1222; EG-NEXT:     BCNT_INT T0.W, T0.X,
1223; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1224; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1225; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1226; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1227; EG-NEXT:    999(1.399897e-42), 3(4.203895e-45)
1228; EG-NEXT:     LSHL T0.X, PV.W, PS,
1229; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1230; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1231; EG-NEXT:     MOV T0.Y, 0.0,
1232; EG-NEXT:     MOV * T0.Z, 0.0,
1233; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1234; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1235  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1236  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1237  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1238  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  ; 999 shows up as 0x3e7 in the expected GCN output above.
1239  %add = add i16 %ctpop, 999
1240  store i16 %add, i16 addrspace(1)* %out, align 4
1241  ret void
1242}
1243
; ctpop of an i16 loaded from %in (indexed by the workitem id) plus the i16
; kernel argument %const, stored to %out. In the expected GCN output the
; argument is read with s_load_dword and that SGPR is the last operand of
; v_bcnt_u32_b32; no separate add instruction is expected. The expected r600
; output reads the argument with a second VTX_READ_16 and uses an explicit
; ADD_INT followed by an AND_INT with 65535.
; NOTE(review): the load and store are marked align 4 although the GEP steps
; by i16 elements - presumably intentional for this test; confirm before
; changing, since the autogenerated assertions may depend on it.
1244define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
1245; SI-LABEL: v_ctpop_i16_add_var:
1246; SI:       ; %bb.0:
1247; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1248; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
1249; SI-NEXT:    s_mov_b32 s3, 0xf000
1250; SI-NEXT:    s_mov_b32 s6, 0
1251; SI-NEXT:    s_mov_b32 s7, s3
1252; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1253; SI-NEXT:    v_mov_b32_e32 v1, 0
1254; SI-NEXT:    s_waitcnt lgkmcnt(0)
1255; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1256; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1257; SI-NEXT:    s_mov_b32 s2, -1
1258; SI-NEXT:    s_waitcnt vmcnt(0)
1259; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s8
1260; SI-NEXT:    s_waitcnt lgkmcnt(0)
1261; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1262; SI-NEXT:    s_endpgm
1263;
1264; VI-LABEL: v_ctpop_i16_add_var:
1265; VI:       ; %bb.0:
1266; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1267; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1268; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1269; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1270; VI-NEXT:    s_waitcnt lgkmcnt(0)
1271; VI-NEXT:    v_mov_b32_e32 v1, s3
1272; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1273; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1274; VI-NEXT:    flat_load_ushort v0, v[0:1]
1275; VI-NEXT:    s_mov_b32 s3, 0xf000
1276; VI-NEXT:    s_mov_b32 s2, -1
1277; VI-NEXT:    s_waitcnt vmcnt(0)
1278; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1279; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1280; VI-NEXT:    s_endpgm
1281;
1282; EG-LABEL: v_ctpop_i16_add_var:
1283; EG:       ; %bb.0:
1284; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1285; EG-NEXT:    TEX 0 @8
1286; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1287; EG-NEXT:    TEX 0 @10
1288; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1289; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1290; EG-NEXT:    CF_END
1291; EG-NEXT:    PAD
1292; EG-NEXT:    Fetch clause starting at 8:
1293; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1294; EG-NEXT:    Fetch clause starting at 10:
1295; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
1296; EG-NEXT:    ALU clause starting at 12:
1297; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1298; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1299; EG-NEXT:    ALU clause starting at 14:
1300; EG-NEXT:     MOV * T1.X, 0.0,
1301; EG-NEXT:    ALU clause starting at 15:
1302; EG-NEXT:     BCNT_INT T0.W, T0.X,
1303; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1304; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1305; EG-NEXT:     ADD_INT * T0.W, PV.W, T1.X,
1306; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1307; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1308; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1309; EG-NEXT:     LSHL T0.X, PV.W, PS,
1310; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1311; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1312; EG-NEXT:     MOV T0.Y, 0.0,
1313; EG-NEXT:     MOV * T0.Z, 0.0,
1314; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1315; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1316  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1317  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1318  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1319  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  ; The kernel argument is the right-hand operand of the add in this variant.
1320  %add = add i16 %ctpop, %const
1321  store i16 %add, i16 addrspace(1)* %out, align 4
1322  ret void
1323}
1324
; Commuted form of the ctpop-plus-kernel-argument test: the add is written as
; "add i16 %const, %ctpop" (argument first). The expected GCN output still has
; no separate add instruction and passes the SGPR holding the argument as the
; last operand of v_bcnt_u32_b32. In the expected r600 output the ADD_INT
; operands appear in the swapped order (T1.X, PV.W).
; NOTE(review): the load and store are marked align 4 although the GEP steps
; by i16 elements - presumably intentional for this test; confirm before
; changing, since the autogenerated assertions may depend on it.
1325define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
1326; SI-LABEL: v_ctpop_i16_add_var_inv:
1327; SI:       ; %bb.0:
1328; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1329; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
1330; SI-NEXT:    s_mov_b32 s3, 0xf000
1331; SI-NEXT:    s_mov_b32 s6, 0
1332; SI-NEXT:    s_mov_b32 s7, s3
1333; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1334; SI-NEXT:    v_mov_b32_e32 v1, 0
1335; SI-NEXT:    s_waitcnt lgkmcnt(0)
1336; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1337; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1338; SI-NEXT:    s_mov_b32 s2, -1
1339; SI-NEXT:    s_waitcnt vmcnt(0)
1340; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s8
1341; SI-NEXT:    s_waitcnt lgkmcnt(0)
1342; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1343; SI-NEXT:    s_endpgm
1344;
1345; VI-LABEL: v_ctpop_i16_add_var_inv:
1346; VI:       ; %bb.0:
1347; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1348; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1349; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1350; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1351; VI-NEXT:    s_waitcnt lgkmcnt(0)
1352; VI-NEXT:    v_mov_b32_e32 v1, s3
1353; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1354; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1355; VI-NEXT:    flat_load_ushort v0, v[0:1]
1356; VI-NEXT:    s_mov_b32 s3, 0xf000
1357; VI-NEXT:    s_mov_b32 s2, -1
1358; VI-NEXT:    s_waitcnt vmcnt(0)
1359; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1360; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1361; VI-NEXT:    s_endpgm
1362;
1363; EG-LABEL: v_ctpop_i16_add_var_inv:
1364; EG:       ; %bb.0:
1365; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1366; EG-NEXT:    TEX 0 @8
1367; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1368; EG-NEXT:    TEX 0 @10
1369; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1370; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1371; EG-NEXT:    CF_END
1372; EG-NEXT:    PAD
1373; EG-NEXT:    Fetch clause starting at 8:
1374; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1375; EG-NEXT:    Fetch clause starting at 10:
1376; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
1377; EG-NEXT:    ALU clause starting at 12:
1378; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1379; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1380; EG-NEXT:    ALU clause starting at 14:
1381; EG-NEXT:     MOV * T1.X, 0.0,
1382; EG-NEXT:    ALU clause starting at 15:
1383; EG-NEXT:     BCNT_INT T0.W, T0.X,
1384; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1385; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1386; EG-NEXT:     ADD_INT * T0.W, T1.X, PV.W,
1387; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1388; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1389; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1390; EG-NEXT:     LSHL T0.X, PV.W, PS,
1391; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1392; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1393; EG-NEXT:     MOV T0.Y, 0.0,
1394; EG-NEXT:     MOV * T0.Z, 0.0,
1395; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1396; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1397  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1398  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1399  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1400  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  ; The kernel argument is the left-hand operand of the add in this variant.
1401  %add = add i16 %const, %ctpop
1402  store i16 %add, i16 addrspace(1)* %out, align 4
1403  ret void
1404}
1405
; ctpop of an i16 loaded from %in plus a second i16 loaded from %constptr,
; both indexed by the workitem id, with the loaded addend as the left-hand
; operand of the add; the sum is stored to %out. In the expected GCN output
; both loaded values are operands of a single v_bcnt_u32_b32 and no separate
; add instruction is expected. The expected r600 output uses an explicit
; ADD_INT followed by an AND_INT with 65535.
; NOTE(review): the loads and the store are marked align 4 although the GEPs
; step by i16 elements - presumably intentional for this test; confirm before
; changing, since the autogenerated assertions may depend on it.
1406define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind {
1407; SI-LABEL: v_ctpop_i16_add_vvar_inv:
1408; SI:       ; %bb.0:
1409; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1410; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1411; SI-NEXT:    s_mov_b32 s3, 0xf000
1412; SI-NEXT:    s_mov_b32 s6, 0
1413; SI-NEXT:    s_mov_b32 s7, s3
1414; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1415; SI-NEXT:    v_mov_b32_e32 v1, 0
1416; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
1417; SI-NEXT:    s_waitcnt lgkmcnt(0)
1418; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
1419; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
1420; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1421; SI-NEXT:    s_mov_b32 s2, -1
1422; SI-NEXT:    s_waitcnt vmcnt(0)
1423; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v2, v0
1424; SI-NEXT:    s_waitcnt lgkmcnt(0)
1425; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1426; SI-NEXT:    s_endpgm
1427;
1428; VI-LABEL: v_ctpop_i16_add_vvar_inv:
1429; VI:       ; %bb.0:
1430; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1431; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1432; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1433; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1434; VI-NEXT:    s_waitcnt lgkmcnt(0)
1435; VI-NEXT:    v_mov_b32_e32 v1, s3
1436; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1437; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1438; VI-NEXT:    flat_load_ushort v3, v[0:1]
1439; VI-NEXT:    v_mov_b32_e32 v1, s5
1440; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1441; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1442; VI-NEXT:    flat_load_ushort v0, v[0:1]
1443; VI-NEXT:    s_mov_b32 s3, 0xf000
1444; VI-NEXT:    s_mov_b32 s2, -1
1445; VI-NEXT:    s_waitcnt vmcnt(0)
1446; VI-NEXT:    v_bcnt_u32_b32 v0, v3, v0
1447; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1448; VI-NEXT:    s_endpgm
1449;
1450; EG-LABEL: v_ctpop_i16_add_vvar_inv:
1451; EG:       ; %bb.0:
1452; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1453; EG-NEXT:    TEX 0 @8
1454; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1455; EG-NEXT:    TEX 0 @10
1456; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1457; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1458; EG-NEXT:    CF_END
1459; EG-NEXT:    PAD
1460; EG-NEXT:    Fetch clause starting at 8:
1461; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1462; EG-NEXT:    Fetch clause starting at 10:
1463; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
1464; EG-NEXT:    ALU clause starting at 12:
1465; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1466; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1467; EG-NEXT:    ALU clause starting at 14:
1468; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, T0.W,
1469; EG-NEXT:    ALU clause starting at 15:
1470; EG-NEXT:     BCNT_INT T0.W, T0.X,
1471; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1472; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1473; EG-NEXT:     ADD_INT * T0.W, T1.X, PV.W,
1474; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1475; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1476; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1477; EG-NEXT:     LSHL T0.X, PV.W, PS,
1478; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1479; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1480; EG-NEXT:     MOV T0.Y, 0.0,
1481; EG-NEXT:     MOV * T0.Z, 0.0,
1482; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1483; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1484  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1485  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1486  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1487  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  ; The addend is loaded per work-item from %constptr rather than passed as a
  ; kernel argument.
1488  %gep = getelementptr i16, i16 addrspace(1)* %constptr, i32 %tid
1489  %const = load i16, i16 addrspace(1)* %gep, align 4
1490  %add = add i16 %const, %ctpop
1491  store i16 %add, i16 addrspace(1)* %out, align 4
1492  ret void
1493}
1494
1495; FIXME: We currently disallow SALU instructions in all branches,
1496; but there are some cases when they should be allowed.
1497define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %ctpop_arg, i16 %cond) {
1498; SI-LABEL: ctpop_i16_in_br:
1499; SI:       ; %bb.0: ; %entry
1500; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1501; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1502; SI-NEXT:    s_waitcnt lgkmcnt(0)
1503; SI-NEXT:    s_lshr_b32 s5, s4, 16
1504; SI-NEXT:    s_cmp_lg_u32 s5, 0
1505; SI-NEXT:    s_cbranch_scc0 .LBB14_2
1506; SI-NEXT:  ; %bb.1: ; %else
1507; SI-NEXT:    s_mov_b32 s11, 0xf000
1508; SI-NEXT:    s_mov_b32 s10, -1
1509; SI-NEXT:    s_mov_b32 s8, s2
1510; SI-NEXT:    s_mov_b32 s9, s3
1511; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1512; SI-NEXT:    s_mov_b64 s[2:3], 0
1513; SI-NEXT:    s_cbranch_execz .LBB14_3
1514; SI-NEXT:    s_branch .LBB14_4
1515; SI-NEXT:  .LBB14_2:
1516; SI-NEXT:    s_mov_b64 s[2:3], -1
1517; SI-NEXT:    v_mov_b32_e32 v0, 0
1518; SI-NEXT:  .LBB14_3: ; %if
1519; SI-NEXT:    s_and_b32 s2, s4, 0xffff
1520; SI-NEXT:    s_bcnt1_i32_b32 s2, s2
1521; SI-NEXT:    s_waitcnt vmcnt(0)
1522; SI-NEXT:    v_mov_b32_e32 v0, s2
1523; SI-NEXT:  .LBB14_4: ; %endif
1524; SI-NEXT:    s_mov_b32 s3, 0xf000
1525; SI-NEXT:    s_mov_b32 s2, -1
1526; SI-NEXT:    s_waitcnt vmcnt(0)
1527; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1528; SI-NEXT:    s_endpgm
1529;
1530; VI-LABEL: ctpop_i16_in_br:
1531; VI:       ; %bb.0: ; %entry
1532; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1533; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1534; VI-NEXT:    s_waitcnt lgkmcnt(0)
1535; VI-NEXT:    s_lshr_b32 s5, s4, 16
1536; VI-NEXT:    v_cmp_ne_u16_e64 s[6:7], s5, 0
1537; VI-NEXT:    s_and_b64 vcc, exec, s[6:7]
1538; VI-NEXT:    s_cbranch_vccz .LBB14_2
1539; VI-NEXT:  ; %bb.1: ; %else
1540; VI-NEXT:    s_mov_b32 s11, 0xf000
1541; VI-NEXT:    s_mov_b32 s10, -1
1542; VI-NEXT:    s_mov_b32 s8, s2
1543; VI-NEXT:    s_mov_b32 s9, s3
1544; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1545; VI-NEXT:    s_mov_b64 s[2:3], 0
1546; VI-NEXT:    s_cbranch_execz .LBB14_3
1547; VI-NEXT:    s_branch .LBB14_4
1548; VI-NEXT:  .LBB14_2:
1549; VI-NEXT:    s_mov_b64 s[2:3], -1
1550; VI-NEXT:    ; implicit-def: $vgpr0
1551; VI-NEXT:  .LBB14_3: ; %if
1552; VI-NEXT:    s_and_b32 s2, s4, 0xffff
1553; VI-NEXT:    s_bcnt1_i32_b32 s2, s2
1554; VI-NEXT:    s_waitcnt vmcnt(0)
1555; VI-NEXT:    v_mov_b32_e32 v0, s2
1556; VI-NEXT:  .LBB14_4: ; %endif
1557; VI-NEXT:    s_mov_b32 s3, 0xf000
1558; VI-NEXT:    s_mov_b32 s2, -1
1559; VI-NEXT:    s_waitcnt vmcnt(0)
1560; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1561; VI-NEXT:    s_endpgm
1562;
1563; EG-LABEL: ctpop_i16_in_br:
1564; EG:       ; %bb.0: ; %entry
1565; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
1566; EG-NEXT:    TEX 0 @14
1567; EG-NEXT:    ALU_PUSH_BEFORE 6, @21, KC0[], KC1[]
1568; EG-NEXT:    JUMP @7 POP:1
1569; EG-NEXT:    ALU 0, @28, KC0[CB0:0-32], KC1[]
1570; EG-NEXT:    TEX 0 @16
1571; EG-NEXT:    ALU_POP_AFTER 1, @29, KC0[], KC1[]
1572; EG-NEXT:    ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
1573; EG-NEXT:    JUMP @11 POP:1
1574; EG-NEXT:    TEX 0 @18
1575; EG-NEXT:    ALU_POP_AFTER 0, @34, KC0[], KC1[]
1576; EG-NEXT:    ALU 11, @35, KC0[], KC1[]
1577; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
1578; EG-NEXT:    CF_END
1579; EG-NEXT:    Fetch clause starting at 14:
1580; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 46, #3
1581; EG-NEXT:    Fetch clause starting at 16:
1582; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 2, #1
1583; EG-NEXT:    Fetch clause starting at 18:
1584; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
1585; EG-NEXT:    ALU clause starting at 20:
1586; EG-NEXT:     MOV * T0.X, 0.0,
1587; EG-NEXT:    ALU clause starting at 21:
1588; EG-NEXT:     AND_INT * T0.W, T1.X, literal.x,
1589; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1590; EG-NEXT:     MOV T1.X, literal.x,
1591; EG-NEXT:     MOV T1.W, literal.y,
1592; EG-NEXT:     SETNE_INT * T0.W, PV.W, 0.0,
1593; EG-NEXT:    0(0.000000e+00), 1(1.401298e-45)
1594; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1595; EG-NEXT:    ALU clause starting at 28:
1596; EG-NEXT:     MOV * T1.X, KC0[2].Z,
1597; EG-NEXT:    ALU clause starting at 29:
1598; EG-NEXT:     MOV * T1.W, literal.x,
1599; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1600; EG-NEXT:    ALU clause starting at 31:
1601; EG-NEXT:     MOV T0.W, KC0[2].Y,
1602; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
1603; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1604; EG-NEXT:    ALU clause starting at 34:
1605; EG-NEXT:     BCNT_INT * T1.X, T0.X,
1606; EG-NEXT:    ALU clause starting at 35:
1607; EG-NEXT:     LSHL * T1.W, T0.W, literal.x,
1608; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1609; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1610; EG-NEXT:     AND_INT * T2.W, T1.X, literal.y,
1611; EG-NEXT:    24(3.363116e-44), 65535(9.183409e-41)
1612; EG-NEXT:     LSHL T1.X, PS, PV.W,
1613; EG-NEXT:     LSHL * T1.W, literal.x, PV.W,
1614; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1615; EG-NEXT:     MOV T1.Y, 0.0,
1616; EG-NEXT:     MOV * T1.Z, 0.0,
1617; EG-NEXT:     LSHR * T0.X, T0.W, literal.x,
1618; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Kernel body: choose between ctpop(%ctpop_arg) and an i16 loaded from %in,
; depending on whether %cond is zero, then store the chosen value to %out.
; In the SI and VI check lines above, the ctpop in %if shows up as an
; s_and_b32 with 0xffff followed by s_bcnt1_i32_b32; the EG check lines
; use BCNT_INT instead.
1619entry:
  ; %cond equal to zero takes the %if path (ctpop); otherwise %else (load).
1620  %tmp0 = icmp eq i16 %cond, 0
1621  br i1 %tmp0, label %if, label %else
1622
1623if:
  ; Population count of the %ctpop_arg kernel argument.
1624  %tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg)
1625  br label %endif
1626
1627else:
  ; Load the i16 element at index 1 of %in.
1628  %tmp3 = getelementptr i16, i16 addrspace(1)* %in, i16 1
1629  %tmp4 = load i16, i16 addrspace(1)* %tmp3
1630  br label %endif
1631
1632endif:
  ; Merge the value produced by whichever predecessor ran and store it to %out.
1633  %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else]
1634  store i16 %tmp5, i16 addrspace(1)* %out
1635  ret void
1636}
1637