1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
5
6declare i16 @llvm.ctpop.i16(i16) nounwind readnone
7declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone
8declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
9declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
10declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone
11
12declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
13
; Population count of an i16 kernel argument (%val); the count is stored to %out.
; The SI and VI check lines expect the argument dword to be masked to its low
; 16 bits (s_and_b32 with 0xffff) and then counted with s_bcnt1_i32_b32.
14define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
15; SI-LABEL: s_ctpop_i16:
16; SI:       ; %bb.0:
17; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
18; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
19; SI-NEXT:    s_mov_b32 s3, 0xf000
20; SI-NEXT:    s_mov_b32 s2, -1
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    s_and_b32 s4, s4, 0xffff
23; SI-NEXT:    s_bcnt1_i32_b32 s4, s4
24; SI-NEXT:    v_mov_b32_e32 v0, s4
25; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: s_ctpop_i16:
29; VI:       ; %bb.0:
30; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
31; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
32; VI-NEXT:    s_mov_b32 s3, 0xf000
33; VI-NEXT:    s_mov_b32 s2, -1
34; VI-NEXT:    s_waitcnt lgkmcnt(0)
35; VI-NEXT:    s_and_b32 s4, s4, 0xffff
36; VI-NEXT:    s_bcnt1_i32_b32 s4, s4
37; VI-NEXT:    v_mov_b32_e32 v0, s4
38; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
39; VI-NEXT:    s_endpgm
40;
41; EG-LABEL: s_ctpop_i16:
42; EG:       ; %bb.0:
43; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
44; EG-NEXT:    TEX 0 @6
45; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
46; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
47; EG-NEXT:    CF_END
48; EG-NEXT:    PAD
49; EG-NEXT:    Fetch clause starting at 6:
50; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
51; EG-NEXT:    ALU clause starting at 8:
52; EG-NEXT:     MOV * T0.X, 0.0,
53; EG-NEXT:    ALU clause starting at 9:
54; EG-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
55; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
56; EG-NEXT:     BCNT_INT T1.W, T0.X,
57; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
58; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
59; EG-NEXT:     LSHL T0.X, PV.W, PS,
60; EG-NEXT:     LSHL * T0.W, literal.x, PS,
61; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
62; EG-NEXT:     MOV T0.Y, 0.0,
63; EG-NEXT:     MOV * T0.Z, 0.0,
64; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
65; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Count the set bits of the argument and write the i16 result to %out.
66  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
67  store i16 %ctpop, i16 addrspace(1)* %out, align 4
68  ret void
69}
70
71; XXX - Why is 0 materialized in a register?
; Population count of an i16 loaded from %in at the element selected by the
; workitem id; the count is stored to %out (the store is not indexed by the
; workitem id). The SI and VI check lines expect a single v_bcnt_u32_b32 whose
; second source operand is the literal 0.
72define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
73; SI-LABEL: v_ctpop_i16:
74; SI:       ; %bb.0:
75; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
76; SI-NEXT:    s_mov_b32 s3, 0xf000
77; SI-NEXT:    s_mov_b32 s6, 0
78; SI-NEXT:    s_mov_b32 s7, s3
79; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
80; SI-NEXT:    v_mov_b32_e32 v1, 0
81; SI-NEXT:    s_waitcnt lgkmcnt(0)
82; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
83; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
84; SI-NEXT:    s_mov_b32 s2, -1
85; SI-NEXT:    s_waitcnt vmcnt(0)
86; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
87; SI-NEXT:    s_waitcnt lgkmcnt(0)
88; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
89; SI-NEXT:    s_endpgm
90;
91; VI-LABEL: v_ctpop_i16:
92; VI:       ; %bb.0:
93; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
94; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
95; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
96; VI-NEXT:    s_waitcnt lgkmcnt(0)
97; VI-NEXT:    v_mov_b32_e32 v1, s3
98; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
99; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
100; VI-NEXT:    flat_load_ushort v0, v[0:1]
101; VI-NEXT:    s_mov_b32 s3, 0xf000
102; VI-NEXT:    s_mov_b32 s2, -1
103; VI-NEXT:    s_waitcnt vmcnt(0)
104; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
105; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
106; VI-NEXT:    s_endpgm
107;
108; EG-LABEL: v_ctpop_i16:
109; EG:       ; %bb.0:
110; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
111; EG-NEXT:    TEX 0 @6
112; EG-NEXT:    ALU 11, @10, KC0[CB0:0-32], KC1[]
113; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
114; EG-NEXT:    CF_END
115; EG-NEXT:    PAD
116; EG-NEXT:    Fetch clause starting at 6:
117; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
118; EG-NEXT:    ALU clause starting at 8:
119; EG-NEXT:     LSHL * T0.W, T0.X, 1,
120; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
121; EG-NEXT:    ALU clause starting at 10:
122; EG-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
123; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
124; EG-NEXT:     BCNT_INT T1.W, T0.X,
125; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
126; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
127; EG-NEXT:     LSHL T0.X, PV.W, PS,
128; EG-NEXT:     LSHL * T0.W, literal.x, PS,
129; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
130; EG-NEXT:     MOV T0.Y, 0.0,
131; EG-NEXT:     MOV * T0.Z, 0.0,
132; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
133; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Per-workitem input element: %in[workitem id x].
134  %tid = call i32 @llvm.amdgcn.workitem.id.x()
135  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
136  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  ; Count the set bits and write the i16 result to the base of %out.
137  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
138  store i16 %ctpop, i16 addrspace(1)* %out, align 4
139  ret void
140}
141
; Sum of the population counts of two i16 values, loaded (volatile) from %in0
; and %in1 at the element selected by the workitem id; the sum is stored to %out.
; In the SI and VI check lines the second v_bcnt_u32_b32 takes the result of the
; first one as its second source operand, and no separate instruction adds the
; two counts. Each load is marked glc and followed by its own s_waitcnt vmcnt(0).
142define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind {
143; SI-LABEL: v_ctpop_add_chain_i16:
144; SI:       ; %bb.0:
145; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
146; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
147; SI-NEXT:    s_mov_b32 s3, 0xf000
148; SI-NEXT:    s_mov_b32 s6, 0
149; SI-NEXT:    s_mov_b32 s7, s3
150; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
151; SI-NEXT:    v_mov_b32_e32 v1, 0
152; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
153; SI-NEXT:    s_waitcnt lgkmcnt(0)
154; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
155; SI-NEXT:    s_waitcnt vmcnt(0)
156; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 glc
157; SI-NEXT:    s_waitcnt vmcnt(0)
158; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
159; SI-NEXT:    s_mov_b32 s2, -1
160; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
161; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v2, v0
162; SI-NEXT:    s_waitcnt lgkmcnt(0)
163; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
164; SI-NEXT:    s_endpgm
165;
166; VI-LABEL: v_ctpop_add_chain_i16:
167; VI:       ; %bb.0:
168; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
169; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
170; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
171; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
172; VI-NEXT:    s_waitcnt lgkmcnt(0)
173; VI-NEXT:    v_mov_b32_e32 v1, s3
174; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
175; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
176; VI-NEXT:    v_mov_b32_e32 v3, s5
177; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
178; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
179; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
180; VI-NEXT:    s_waitcnt vmcnt(0)
181; VI-NEXT:    flat_load_ushort v1, v[2:3] glc
182; VI-NEXT:    s_waitcnt vmcnt(0)
183; VI-NEXT:    s_mov_b32 s3, 0xf000
184; VI-NEXT:    s_mov_b32 s2, -1
185; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
186; VI-NEXT:    v_bcnt_u32_b32 v0, v0, v1
187; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
188; VI-NEXT:    s_endpgm
189;
190; EG-LABEL: v_ctpop_add_chain_i16:
191; EG:       ; %bb.0:
192; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
193; EG-NEXT:    TEX 0 @8
194; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
195; EG-NEXT:    TEX 0 @10
196; EG-NEXT:    ALU 16, @15, KC0[CB0:0-32], KC1[]
197; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
198; EG-NEXT:    CF_END
199; EG-NEXT:    PAD
200; EG-NEXT:    Fetch clause starting at 8:
201; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
202; EG-NEXT:    Fetch clause starting at 10:
203; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
204; EG-NEXT:    ALU clause starting at 12:
205; EG-NEXT:     LSHL * T0.W, T0.X, 1,
206; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
207; EG-NEXT:    ALU clause starting at 14:
208; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, T0.W,
209; EG-NEXT:    ALU clause starting at 15:
210; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
211; EG-NEXT:     AND_INT * T1.W, T1.X, literal.x,
212; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
213; EG-NEXT:     BCNT_INT T0.Z, PS,
214; EG-NEXT:     BCNT_INT T0.W, PV.W,
215; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
216; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
217; EG-NEXT:     ADD_INT T0.W, PV.W, PV.Z,
218; EG-NEXT:     LSHL * T1.W, PS, literal.x,
219; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
220; EG-NEXT:     LSHL T0.X, PV.W, PS,
221; EG-NEXT:     LSHL * T0.W, literal.x, PS,
222; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
223; EG-NEXT:     MOV T0.Y, 0.0,
224; EG-NEXT:     MOV * T0.Z, 0.0,
225; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
226; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Both inputs are indexed by the same workitem id.
227  %tid = call i32 @llvm.amdgcn.workitem.id.x()
228  %in0.gep = getelementptr i16, i16 addrspace(1)* %in0, i32 %tid
229  %in1.gep = getelementptr i16, i16 addrspace(1)* %in1, i32 %tid
  ; Volatile loads: both memory accesses must be kept, in this order.
230  %val0 = load volatile i16, i16 addrspace(1)* %in0.gep, align 4
231  %val1 = load volatile i16, i16 addrspace(1)* %in1.gep, align 4
  ; ctpop(%val0) + ctpop(%val1), stored as a single i16.
232  %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone
233  %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone
234  %add = add i16 %ctpop0, %ctpop1
235  store i16 %add, i16 addrspace(1)* %out, align 4
236  ret void
237}
238
; Population count of an i16 loaded from %in (indexed by the workitem id),
; added to the i16 kernel argument %sval; the sum is stored to %out.
; In the SI and VI check lines the dword fetched by s_load_dword (s8 on SI,
; s4 on VI; presumably the one holding %sval) is passed directly as the second
; source operand of v_bcnt_u32_b32.
239define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind {
240; SI-LABEL: v_ctpop_add_sgpr_i16:
241; SI:       ; %bb.0:
242; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
243; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
244; SI-NEXT:    s_mov_b32 s3, 0xf000
245; SI-NEXT:    s_mov_b32 s6, 0
246; SI-NEXT:    s_mov_b32 s7, s3
247; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
248; SI-NEXT:    v_mov_b32_e32 v1, 0
249; SI-NEXT:    s_waitcnt lgkmcnt(0)
250; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
251; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
252; SI-NEXT:    s_mov_b32 s2, -1
253; SI-NEXT:    s_waitcnt vmcnt(0)
254; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s8
255; SI-NEXT:    s_waitcnt lgkmcnt(0)
256; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
257; SI-NEXT:    s_endpgm
258;
259; VI-LABEL: v_ctpop_add_sgpr_i16:
260; VI:       ; %bb.0:
261; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
262; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
263; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
264; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
265; VI-NEXT:    s_waitcnt lgkmcnt(0)
266; VI-NEXT:    v_mov_b32_e32 v1, s3
267; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
268; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
269; VI-NEXT:    flat_load_ushort v0, v[0:1]
270; VI-NEXT:    s_mov_b32 s3, 0xf000
271; VI-NEXT:    s_mov_b32 s2, -1
272; VI-NEXT:    s_waitcnt vmcnt(0)
273; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
274; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
275; VI-NEXT:    s_endpgm
276;
277; EG-LABEL: v_ctpop_add_sgpr_i16:
278; EG:       ; %bb.0:
279; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
280; EG-NEXT:    TEX 0 @8
281; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
282; EG-NEXT:    TEX 0 @10
283; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
284; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
285; EG-NEXT:    CF_END
286; EG-NEXT:    PAD
287; EG-NEXT:    Fetch clause starting at 8:
288; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
289; EG-NEXT:    Fetch clause starting at 10:
290; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
291; EG-NEXT:    ALU clause starting at 12:
292; EG-NEXT:     LSHL * T0.W, T0.X, 1,
293; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
294; EG-NEXT:    ALU clause starting at 14:
295; EG-NEXT:     MOV * T1.X, 0.0,
296; EG-NEXT:    ALU clause starting at 15:
297; EG-NEXT:     BCNT_INT T0.W, T0.X,
298; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
299; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
300; EG-NEXT:     ADD_INT * T0.W, PV.W, T1.X,
301; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
302; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
303; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
304; EG-NEXT:     LSHL T0.X, PV.W, PS,
305; EG-NEXT:     LSHL * T0.W, literal.x, PS,
306; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
307; EG-NEXT:     MOV T0.Y, 0.0,
308; EG-NEXT:     MOV * T0.Z, 0.0,
309; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
310; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Per-workitem input element: %in[workitem id x].
311  %tid = call i32 @llvm.amdgcn.workitem.id.x()
312  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
313  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  ; ctpop(%val) + %sval, where %sval is a kernel argument rather than a loaded value.
314  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
315  %add = add i16 %ctpop, %sval
316  store i16 %add, i16 addrspace(1)* %out, align 4
317  ret void
318}
319
; Element-wise population count of a <2 x i16> loaded from %in (indexed by the
; workitem id); the result vector is stored to %out.
; The SI and VI check lines expect one dword load that is split into its low
; half (v_and_b32 with 0xffff) and high half (v_lshrrev_b32 by 16); each half is
; counted with v_bcnt_u32_b32, the high count is shifted left by 16 and the two
; counts are combined with v_or_b32 before a single dword store.
320define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind {
321; SI-LABEL: v_ctpop_v2i16:
322; SI:       ; %bb.0:
323; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
324; SI-NEXT:    s_mov_b32 s3, 0xf000
325; SI-NEXT:    s_mov_b32 s6, 0
326; SI-NEXT:    s_mov_b32 s7, s3
327; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
328; SI-NEXT:    v_mov_b32_e32 v1, 0
329; SI-NEXT:    s_waitcnt lgkmcnt(0)
330; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
331; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
332; SI-NEXT:    s_mov_b32 s2, -1
333; SI-NEXT:    s_waitcnt vmcnt(0)
334; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
335; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
336; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
337; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
338; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
339; SI-NEXT:    v_or_b32_e32 v0, v1, v0
340; SI-NEXT:    s_waitcnt lgkmcnt(0)
341; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
342; SI-NEXT:    s_endpgm
343;
344; VI-LABEL: v_ctpop_v2i16:
345; VI:       ; %bb.0:
346; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
347; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
348; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
349; VI-NEXT:    s_waitcnt lgkmcnt(0)
350; VI-NEXT:    v_mov_b32_e32 v1, s3
351; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
352; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
353; VI-NEXT:    flat_load_dword v0, v[0:1]
354; VI-NEXT:    s_mov_b32 s3, 0xf000
355; VI-NEXT:    s_mov_b32 s2, -1
356; VI-NEXT:    s_waitcnt vmcnt(0)
357; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
358; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
359; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
360; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
361; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
362; VI-NEXT:    v_or_b32_e32 v0, v0, v1
363; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
364; VI-NEXT:    s_endpgm
365;
366; EG-LABEL: v_ctpop_v2i16:
367; EG:       ; %bb.0:
368; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
369; EG-NEXT:    TEX 0 @6
370; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
371; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
372; EG-NEXT:    CF_END
373; EG-NEXT:    PAD
374; EG-NEXT:    Fetch clause starting at 6:
375; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
376; EG-NEXT:    ALU clause starting at 8:
377; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
378; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
379; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
380; EG-NEXT:    ALU clause starting at 11:
381; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
382; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
383; EG-NEXT:     BCNT_INT T0.W, PV.W,
384; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
385; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
386; EG-NEXT:     BCNT_INT T1.W, PS,
387; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
388; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
389; EG-NEXT:     OR_INT T0.X, PV.W, PS,
390; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
391; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Per-workitem input vector: %in[workitem id x].
392  %tid = call i32 @llvm.amdgcn.workitem.id.x()
393  %in.gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
394  %val = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep, align 8
  ; Count the set bits of each i16 lane and store the whole vector to %out.
395  %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone
396  store <2 x i16> %ctpop, <2 x i16> addrspace(1)* %out, align 8
397  ret void
398}
399
; Element-wise population count of a <4 x i16> loaded from %in (indexed by the
; workitem id); the result vector is stored to %out.
; The SI and VI check lines expect a dwordx2 load; each of the two dwords is
; split into low (v_and_b32 with 0xffff) and high (v_lshrrev_b32 by 16) halves,
; every half is counted with v_bcnt_u32_b32, and the halves are re-packed with
; v_lshlrev_b32 by 16 and v_or_b32 before a single dwordx2 store.
400define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind {
401; SI-LABEL: v_ctpop_v4i16:
402; SI:       ; %bb.0:
403; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
404; SI-NEXT:    s_mov_b32 s3, 0xf000
405; SI-NEXT:    s_mov_b32 s6, 0
406; SI-NEXT:    s_mov_b32 s7, s3
407; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
408; SI-NEXT:    v_mov_b32_e32 v1, 0
409; SI-NEXT:    s_waitcnt lgkmcnt(0)
410; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
411; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
412; SI-NEXT:    s_mov_b32 s2, -1
413; SI-NEXT:    s_waitcnt vmcnt(0)
414; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
415; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
416; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
417; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
418; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
419; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
420; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
421; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
422; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
423; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
424; SI-NEXT:    v_or_b32_e32 v1, v3, v1
425; SI-NEXT:    v_or_b32_e32 v0, v2, v0
426; SI-NEXT:    s_waitcnt lgkmcnt(0)
427; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
428; SI-NEXT:    s_endpgm
429;
430; VI-LABEL: v_ctpop_v4i16:
431; VI:       ; %bb.0:
432; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
433; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
434; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
435; VI-NEXT:    s_waitcnt lgkmcnt(0)
436; VI-NEXT:    v_mov_b32_e32 v1, s3
437; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
438; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
439; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
440; VI-NEXT:    s_mov_b32 s3, 0xf000
441; VI-NEXT:    s_mov_b32 s2, -1
442; VI-NEXT:    s_waitcnt vmcnt(0)
443; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
444; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
445; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
446; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
447; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
448; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
449; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
450; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
451; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
452; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
453; VI-NEXT:    v_or_b32_e32 v1, v1, v2
454; VI-NEXT:    v_or_b32_e32 v0, v0, v3
455; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
456; VI-NEXT:    s_endpgm
457;
458; EG-LABEL: v_ctpop_v4i16:
459; EG:       ; %bb.0:
460; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
461; EG-NEXT:    TEX 0 @6
462; EG-NEXT:    ALU 42, @11, KC0[CB0:0-32], KC1[]
463; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
464; EG-NEXT:    CF_END
465; EG-NEXT:    PAD
466; EG-NEXT:    Fetch clause starting at 6:
467; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
468; EG-NEXT:    ALU clause starting at 8:
469; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
470; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
471; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
472; EG-NEXT:    ALU clause starting at 11:
473; EG-NEXT:     MOV T2.X, T0.X,
474; EG-NEXT:     MOV * T3.X, T0.Y,
475; EG-NEXT:     MOV T0.X, T4.X,
476; EG-NEXT:     MOV * T0.Y, PV.X,
477; EG-NEXT:     AND_INT * T0.W, PV.Y, literal.x,
478; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
479; EG-NEXT:     BCNT_INT T0.W, PV.W,
480; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
481; EG-NEXT:    -65536(nan), 0(0.000000e+00)
482; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
483; EG-NEXT:     MOV T0.X, T3.X,
484; EG-NEXT:     MOV * T4.X, PV.W,
485; EG-NEXT:     MOV T0.Z, PS,
486; EG-NEXT:     LSHR * T0.W, T0.Y, literal.x,
487; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
488; EG-NEXT:     BCNT_INT T0.W, PV.W,
489; EG-NEXT:     AND_INT * T1.W, PV.Z, literal.x,
490; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
491; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
492; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
493; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
494; EG-NEXT:     MOV T4.X, PV.W,
495; EG-NEXT:     MOV T0.Y, T5.X,
496; EG-NEXT:     AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
497; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
498; EG-NEXT:     BCNT_INT T0.W, PV.W,
499; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
500; EG-NEXT:    -65536(nan), 0(0.000000e+00)
501; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
502; EG-NEXT:     MOV * T5.X, PV.W,
503; EG-NEXT:     MOV T0.Y, PV.X,
504; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
505; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
506; EG-NEXT:     BCNT_INT T0.W, PV.W,
507; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
508; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
509; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
510; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
511; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
512; EG-NEXT:     OR_INT * T8.Y, T1.W, PV.W,
513; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
514; EG-NEXT:     MOV T5.X, PV.Y,
515; EG-NEXT:     MOV * T8.X, T4.X,
  ; Per-workitem input vector: %in[workitem id x].
516  %tid = call i32 @llvm.amdgcn.workitem.id.x()
517  %in.gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
518  %val = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep, align 16
  ; Count the set bits of each i16 lane and store the whole vector to %out.
519  %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone
520  store <4 x i16> %ctpop, <4 x i16> addrspace(1)* %out, align 16
521  ret void
522}
523
; Element-wise population count of an <8 x i16> loaded from %in (indexed by the
; workitem id); the result vector is stored to %out.
; The SI and VI check lines expect a dwordx4 load; each of the four dwords is
; split into low (v_and_b32 with 0xffff) and high (v_lshrrev_b32 by 16) halves,
; every half is counted with v_bcnt_u32_b32, and the halves are re-packed with
; v_lshlrev_b32 by 16 and v_or_b32 before a single dwordx4 store.
524define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind {
525; SI-LABEL: v_ctpop_v8i16:
526; SI:       ; %bb.0:
527; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
528; SI-NEXT:    s_mov_b32 s3, 0xf000
529; SI-NEXT:    s_mov_b32 s6, 0
530; SI-NEXT:    s_mov_b32 s7, s3
531; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
532; SI-NEXT:    v_mov_b32_e32 v1, 0
533; SI-NEXT:    s_waitcnt lgkmcnt(0)
534; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
535; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
536; SI-NEXT:    s_mov_b32 s2, -1
537; SI-NEXT:    s_waitcnt vmcnt(0)
538; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
539; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
540; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v1
541; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
542; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v2
543; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
544; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v3
545; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
546; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
547; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
548; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
549; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
550; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
551; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
552; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
553; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
554; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
555; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
556; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
557; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
558; SI-NEXT:    v_or_b32_e32 v3, v7, v3
559; SI-NEXT:    v_or_b32_e32 v2, v6, v2
560; SI-NEXT:    v_or_b32_e32 v1, v5, v1
561; SI-NEXT:    v_or_b32_e32 v0, v4, v0
562; SI-NEXT:    s_waitcnt lgkmcnt(0)
563; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
564; SI-NEXT:    s_endpgm
565;
566; VI-LABEL: v_ctpop_v8i16:
567; VI:       ; %bb.0:
568; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
569; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
570; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
571; VI-NEXT:    s_waitcnt lgkmcnt(0)
572; VI-NEXT:    v_mov_b32_e32 v1, s3
573; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
574; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
575; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
576; VI-NEXT:    s_mov_b32 s3, 0xf000
577; VI-NEXT:    s_mov_b32 s2, -1
578; VI-NEXT:    s_waitcnt vmcnt(0)
579; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
580; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
581; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
582; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
583; VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
584; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
585; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
586; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
587; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
588; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
589; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
590; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
591; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
592; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
593; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
594; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
595; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
596; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
597; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
598; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
599; VI-NEXT:    v_or_b32_e32 v3, v3, v4
600; VI-NEXT:    v_or_b32_e32 v2, v2, v5
601; VI-NEXT:    v_or_b32_e32 v1, v1, v6
602; VI-NEXT:    v_or_b32_e32 v0, v0, v7
603; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
604; VI-NEXT:    s_endpgm
605;
606; EG-LABEL: v_ctpop_v8i16:
607; EG:       ; %bb.0:
608; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
609; EG-NEXT:    TEX 0 @6
610; EG-NEXT:    ALU 73, @12, KC0[CB0:0-32], KC1[]
611; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
612; EG-NEXT:    CF_END
613; EG-NEXT:    PAD
614; EG-NEXT:    Fetch clause starting at 6:
615; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 0, #1
616; EG-NEXT:    ALU clause starting at 8:
617; EG-NEXT:     MOV T0.Y, T4.X,
618; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
619; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
620; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
621; EG-NEXT:    ALU clause starting at 12:
622; EG-NEXT:     LSHR * T0.W, T12.X, literal.x,
623; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
624; EG-NEXT:     BCNT_INT * T0.W, PV.W,
625; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
626; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
627; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
628; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
629; EG-NEXT:     MOV * T4.X, PV.W,
630; EG-NEXT:     MOV T0.X, PV.X,
631; EG-NEXT:     AND_INT * T0.W, T12.X, literal.x,
632; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
633; EG-NEXT:     BCNT_INT T0.W, PV.W,
634; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
635; EG-NEXT:    -65536(nan), 0(0.000000e+00)
636; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
637; EG-NEXT:     MOV T4.X, PV.W,
638; EG-NEXT:     MOV * T0.X, T5.X,
639; EG-NEXT:     LSHR * T0.W, T12.Y, literal.x,
640; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
641; EG-NEXT:     BCNT_INT T0.W, PV.W,
642; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
643; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
644; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
645; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
646; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
647; EG-NEXT:     MOV * T5.X, PV.W,
648; EG-NEXT:     MOV T0.X, PV.X,
649; EG-NEXT:     AND_INT * T0.W, T12.Y, literal.x,
650; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
651; EG-NEXT:     BCNT_INT T0.W, PV.W,
652; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
653; EG-NEXT:    -65536(nan), 0(0.000000e+00)
654; EG-NEXT:     OR_INT * T0.Y, PS, PV.W,
655; EG-NEXT:     MOV T5.X, PV.Y,
656; EG-NEXT:     MOV * T0.X, T8.X,
657; EG-NEXT:     LSHR * T0.W, T12.Z, literal.x,
658; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
659; EG-NEXT:     BCNT_INT T0.W, PV.W,
660; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
661; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
662; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
663; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
664; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
665; EG-NEXT:     MOV * T8.X, PV.W,
666; EG-NEXT:     MOV T0.X, PV.X,
667; EG-NEXT:     AND_INT * T0.W, T12.Z, literal.x,
668; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
669; EG-NEXT:     BCNT_INT T0.W, PV.W,
670; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
671; EG-NEXT:    -65536(nan), 0(0.000000e+00)
672; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
673; EG-NEXT:     MOV T8.X, PV.W,
674; EG-NEXT:     MOV * T0.X, T9.X,
675; EG-NEXT:     LSHR * T0.W, T12.W, literal.x,
676; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
677; EG-NEXT:     BCNT_INT T0.W, PV.W,
678; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
679; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
680; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
681; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
682; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
683; EG-NEXT:     MOV * T9.X, PV.W,
684; EG-NEXT:     MOV T0.X, PV.X,
685; EG-NEXT:     AND_INT * T0.W, T12.W, literal.x,
686; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
687; EG-NEXT:     BCNT_INT T0.W, PV.W,
688; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
689; EG-NEXT:    -65536(nan), 0(0.000000e+00)
690; EG-NEXT:     LSHR T12.X, KC0[2].Y, literal.x,
691; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
692; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
693; EG-NEXT:     MOV T9.X, PV.W,
694; EG-NEXT:     MOV * T0.X, T4.X,
695; EG-NEXT:     MOV * T0.Z, T8.X,
  ; Per-workitem input vector: %in[workitem id x].
696  %tid = call i32 @llvm.amdgcn.workitem.id.x()
697  %in.gep = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %in, i32 %tid
698  %val = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep, align 32
  ; Count the set bits of each i16 lane and store the whole vector to %out.
699  %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone
700  store <8 x i16> %ctpop, <8 x i16> addrspace(1)* %out, align 32
701  ret void
702}
703
; Vector ctpop of a <16 x i16> value. Each workitem loads the vector at
; %in[workitem.id.x], applies llvm.ctpop.v16i16 and stores the result to %out
; (the output pointer is not indexed by the workitem id).
; The SI and VI checks expect two 128-bit loads, every dword split into its low
; half (and with 0xffff) and high half (shift right by 16), one v_bcnt_u32_b32
; with a zero addend per 16-bit element (sixteen in total), and the counts
; repacked with shift-left-by-16 plus or before two dwordx4 stores.
; The EG checks expect one BCNT_INT per 16-bit element (sixteen in total).
704define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind {
705; SI-LABEL: v_ctpop_v16i16:
706; SI:       ; %bb.0:
707; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
708; SI-NEXT:    s_mov_b32 s3, 0xf000
709; SI-NEXT:    s_mov_b32 s6, 0
710; SI-NEXT:    s_mov_b32 s7, s3
711; SI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
712; SI-NEXT:    v_mov_b32_e32 v5, 0
713; SI-NEXT:    s_waitcnt lgkmcnt(0)
714; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
715; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
716; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
717; SI-NEXT:    s_mov_b32 s2, -1
718; SI-NEXT:    s_waitcnt vmcnt(1)
719; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
720; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
721; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v1
722; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
723; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v2
724; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
725; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v3
726; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
727; SI-NEXT:    s_waitcnt vmcnt(0)
728; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v4
729; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
730; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v5
731; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
732; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v6
733; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
734; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v7
735; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
736; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
737; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
738; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
739; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
740; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
741; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
742; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
743; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
744; SI-NEXT:    v_bcnt_u32_b32_e64 v15, v15, 0
745; SI-NEXT:    v_bcnt_u32_b32_e64 v14, v14, 0
746; SI-NEXT:    v_bcnt_u32_b32_e64 v13, v13, 0
747; SI-NEXT:    v_bcnt_u32_b32_e64 v12, v12, 0
748; SI-NEXT:    v_bcnt_u32_b32_e64 v11, v11, 0
749; SI-NEXT:    v_bcnt_u32_b32_e64 v10, v10, 0
750; SI-NEXT:    v_bcnt_u32_b32_e64 v9, v9, 0
751; SI-NEXT:    v_bcnt_u32_b32_e64 v8, v8, 0
752; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
753; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
754; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
755; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
756; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
757; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
758; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
759; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
760; SI-NEXT:    v_or_b32_e32 v3, v15, v7
761; SI-NEXT:    v_or_b32_e32 v2, v14, v6
762; SI-NEXT:    v_or_b32_e32 v1, v13, v5
763; SI-NEXT:    v_or_b32_e32 v0, v12, v4
764; SI-NEXT:    v_or_b32_e32 v7, v11, v16
765; SI-NEXT:    v_or_b32_e32 v6, v10, v17
766; SI-NEXT:    v_or_b32_e32 v5, v9, v18
767; SI-NEXT:    v_or_b32_e32 v4, v8, v19
768; SI-NEXT:    s_waitcnt lgkmcnt(0)
769; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
770; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
771; SI-NEXT:    s_endpgm
772;
773; VI-LABEL: v_ctpop_v16i16:
774; VI:       ; %bb.0:
775; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
776; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
777; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
778; VI-NEXT:    s_waitcnt lgkmcnt(0)
779; VI-NEXT:    v_mov_b32_e32 v1, s3
780; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
781; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
782; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
783; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
784; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
785; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
786; VI-NEXT:    s_mov_b32 s3, 0xf000
787; VI-NEXT:    s_mov_b32 s2, -1
788; VI-NEXT:    s_waitcnt vmcnt(1)
789; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
790; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
791; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
792; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
793; VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
794; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
795; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
796; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
797; VI-NEXT:    s_waitcnt vmcnt(0)
798; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
799; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
800; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
801; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
802; VI-NEXT:    v_bcnt_u32_b32 v8, v8, 0
803; VI-NEXT:    v_bcnt_u32_b32 v9, v9, 0
804; VI-NEXT:    v_bcnt_u32_b32 v10, v10, 0
805; VI-NEXT:    v_bcnt_u32_b32 v11, v11, 0
806; VI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
807; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
808; VI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
809; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
810; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
811; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
812; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
813; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
814; VI-NEXT:    v_bcnt_u32_b32 v12, v12, 0
815; VI-NEXT:    v_bcnt_u32_b32 v13, v13, 0
816; VI-NEXT:    v_bcnt_u32_b32 v14, v14, 0
817; VI-NEXT:    v_bcnt_u32_b32 v15, v15, 0
818; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
819; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
820; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
821; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
822; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
823; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
824; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
825; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
826; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
827; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
828; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
829; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
830; VI-NEXT:    v_or_b32_e32 v3, v3, v8
831; VI-NEXT:    v_or_b32_e32 v2, v2, v9
832; VI-NEXT:    v_or_b32_e32 v1, v1, v10
833; VI-NEXT:    v_or_b32_e32 v0, v0, v11
834; VI-NEXT:    v_or_b32_e32 v7, v7, v12
835; VI-NEXT:    v_or_b32_e32 v6, v6, v13
836; VI-NEXT:    v_or_b32_e32 v5, v5, v14
837; VI-NEXT:    v_or_b32_e32 v4, v4, v15
838; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
839; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
840; VI-NEXT:    s_endpgm
841;
842; EG-LABEL: v_ctpop_v16i16:
843; EG:       ; %bb.0:
844; EG-NEXT:    ALU 3, @12, KC0[CB0:0-32], KC1[]
845; EG-NEXT:    TEX 1 @8
846; EG-NEXT:    ALU 114, @16, KC0[], KC1[]
847; EG-NEXT:    ALU 34, @131, KC0[CB0:0-32], KC1[]
848; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
849; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
850; EG-NEXT:    CF_END
851; EG-NEXT:    PAD
852; EG-NEXT:    Fetch clause starting at 8:
853; EG-NEXT:     VTX_READ_128 T20.XYZW, T0.X, 16, #1
854; EG-NEXT:     VTX_READ_128 T21.XYZW, T0.X, 0, #1
855; EG-NEXT:    ALU clause starting at 12:
856; EG-NEXT:     MOV T0.Y, T4.X,
857; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
858; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
859; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
860; EG-NEXT:    ALU clause starting at 16:
861; EG-NEXT:     LSHR * T0.W, T20.X, literal.x,
862; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
863; EG-NEXT:     BCNT_INT * T0.W, PV.W,
864; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
865; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
866; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
867; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
868; EG-NEXT:     MOV * T4.X, PV.W,
869; EG-NEXT:     MOV T0.X, PV.X,
870; EG-NEXT:     AND_INT * T0.W, T20.X, literal.x,
871; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
872; EG-NEXT:     BCNT_INT T0.W, PV.W,
873; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
874; EG-NEXT:    -65536(nan), 0(0.000000e+00)
875; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
876; EG-NEXT:     MOV T4.X, PV.W,
877; EG-NEXT:     MOV * T0.X, T5.X,
878; EG-NEXT:     LSHR * T0.W, T20.Y, literal.x,
879; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
880; EG-NEXT:     BCNT_INT T0.W, PV.W,
881; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
882; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
883; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
884; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
885; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
886; EG-NEXT:     MOV * T5.X, PV.W,
887; EG-NEXT:     MOV T0.X, PV.X,
888; EG-NEXT:     AND_INT * T0.W, T20.Y, literal.x,
889; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
890; EG-NEXT:     BCNT_INT T0.W, PV.W,
891; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
892; EG-NEXT:    -65536(nan), 0(0.000000e+00)
893; EG-NEXT:     OR_INT * T0.Y, PS, PV.W,
894; EG-NEXT:     MOV T5.X, PV.Y,
895; EG-NEXT:     MOV * T0.X, T8.X,
896; EG-NEXT:     LSHR * T0.W, T20.Z, literal.x,
897; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
898; EG-NEXT:     BCNT_INT T0.W, PV.W,
899; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
900; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
901; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
902; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
903; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
904; EG-NEXT:     MOV * T8.X, PV.W,
905; EG-NEXT:     MOV T0.X, PV.X,
906; EG-NEXT:     AND_INT * T0.W, T20.Z, literal.x,
907; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
908; EG-NEXT:     BCNT_INT T0.W, PV.W,
909; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
910; EG-NEXT:    -65536(nan), 0(0.000000e+00)
911; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
912; EG-NEXT:     MOV T8.X, PV.W,
913; EG-NEXT:     MOV * T0.X, T9.X,
914; EG-NEXT:     LSHR * T0.W, T20.W, literal.x,
915; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
916; EG-NEXT:     BCNT_INT T0.W, PV.W,
917; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
918; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
919; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
920; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
921; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
922; EG-NEXT:     MOV * T9.X, PV.W,
923; EG-NEXT:     MOV T0.X, PV.X,
924; EG-NEXT:     AND_INT * T0.W, T20.W, literal.x,
925; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
926; EG-NEXT:     BCNT_INT T0.W, PV.W,
927; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
928; EG-NEXT:    -65536(nan), 0(0.000000e+00)
929; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
930; EG-NEXT:     MOV T9.X, PV.W,
931; EG-NEXT:     MOV * T0.X, T12.X,
932; EG-NEXT:     LSHR * T1.W, T21.X, literal.x,
933; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
934; EG-NEXT:     BCNT_INT T1.W, PV.W,
935; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
936; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
937; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
938; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
939; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
940; EG-NEXT:     MOV * T12.X, PV.W,
941; EG-NEXT:     MOV T0.X, PV.X,
942; EG-NEXT:     AND_INT * T1.W, T21.X, literal.x,
943; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
944; EG-NEXT:     BCNT_INT T1.W, PV.W,
945; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
946; EG-NEXT:    -65536(nan), 0(0.000000e+00)
947; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
948; EG-NEXT:     MOV T12.X, PV.W,
949; EG-NEXT:     MOV * T0.X, T13.X,
950; EG-NEXT:     LSHR * T1.W, T21.Y, literal.x,
951; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
952; EG-NEXT:     BCNT_INT T1.W, PV.W,
953; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
954; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
955; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
956; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
957; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
958; EG-NEXT:     MOV * T13.X, PV.W,
959; EG-NEXT:     MOV T0.X, PV.X,
960; EG-NEXT:     AND_INT * T1.W, T21.Y, literal.x,
961; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
962; EG-NEXT:     BCNT_INT T1.W, PV.W,
963; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
964; EG-NEXT:    -65536(nan), 0(0.000000e+00)
965; EG-NEXT:     OR_INT * T20.Y, PS, PV.W,
966; EG-NEXT:     MOV T13.X, PV.Y,
967; EG-NEXT:     MOV * T0.X, T16.X,
968; EG-NEXT:     LSHR * T1.W, T21.Z, literal.x,
969; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
970; EG-NEXT:     BCNT_INT T1.W, PV.W,
971; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
972; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
973; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
974; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
975; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
976; EG-NEXT:    ALU clause starting at 131:
977; EG-NEXT:     MOV * T16.X, T1.W,
978; EG-NEXT:     MOV T0.X, PV.X,
979; EG-NEXT:     AND_INT * T1.W, T21.Z, literal.x,
980; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
981; EG-NEXT:     BCNT_INT T1.W, PV.W,
982; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
983; EG-NEXT:    -65536(nan), 0(0.000000e+00)
984; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
985; EG-NEXT:     MOV T16.X, PV.W,
986; EG-NEXT:     MOV * T0.X, T17.X,
987; EG-NEXT:     LSHR * T1.W, T21.W, literal.x,
988; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
989; EG-NEXT:     BCNT_INT T1.W, PV.W,
990; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
991; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
992; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
993; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
994; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
995; EG-NEXT:     MOV * T17.X, PV.W,
996; EG-NEXT:     MOV T0.X, PV.X,
997; EG-NEXT:     AND_INT T1.W, T21.W, literal.x,
998; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.y,
999; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
1000; EG-NEXT:     AND_INT T0.Z, PV.X, literal.x,
1001; EG-NEXT:     BCNT_INT T1.W, PV.W,
1002; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
1003; EG-NEXT:    -65536(nan), 16(2.242078e-44)
1004; EG-NEXT:     LSHR T22.X, PS, literal.x,
1005; EG-NEXT:     OR_INT * T20.W, PV.Z, PV.W,
1006; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1007; EG-NEXT:     MOV T17.X, PV.W,
1008; EG-NEXT:     MOV * T0.X, T4.X,
1009; EG-NEXT:     MOV * T0.Z, T8.X,
1010; EG-NEXT:     MOV T20.X, T12.X,
1011; EG-NEXT:     MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
  ; The input address depends on the workitem id, so the load and the
  ; population counts are expected in vector registers.
1012  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1013  %in.gep = getelementptr <16 x i16>, <16 x i16> addrspace(1)* %in, i32 %tid
1014  %val = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep, align 32
1015  %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone
1016  store <16 x i16> %ctpop, <16 x i16> addrspace(1)* %out, align 32
1017  ret void
1018}
1019
; ctpop(i16) + 4, with the constant as the right-hand add operand.
; The SI and VI checks expect no separate add: the constant 4 appears directly
; as the last operand of v_bcnt_u32_b32. The EG checks expect BCNT_INT followed
; by an ADD_INT with literal 4.
1020define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
1021; SI-LABEL: v_ctpop_i16_add_inline_constant:
1022; SI:       ; %bb.0:
1023; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1024; SI-NEXT:    s_mov_b32 s3, 0xf000
1025; SI-NEXT:    s_mov_b32 s6, 0
1026; SI-NEXT:    s_mov_b32 s7, s3
1027; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1028; SI-NEXT:    v_mov_b32_e32 v1, 0
1029; SI-NEXT:    s_waitcnt lgkmcnt(0)
1030; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1031; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1032; SI-NEXT:    s_mov_b32 s2, -1
1033; SI-NEXT:    s_waitcnt vmcnt(0)
1034; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 4
1035; SI-NEXT:    s_waitcnt lgkmcnt(0)
1036; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1037; SI-NEXT:    s_endpgm
1038;
1039; VI-LABEL: v_ctpop_i16_add_inline_constant:
1040; VI:       ; %bb.0:
1041; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1042; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1043; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1044; VI-NEXT:    s_waitcnt lgkmcnt(0)
1045; VI-NEXT:    v_mov_b32_e32 v1, s3
1046; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1047; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1048; VI-NEXT:    flat_load_ushort v0, v[0:1]
1049; VI-NEXT:    s_mov_b32 s3, 0xf000
1050; VI-NEXT:    s_mov_b32 s2, -1
1051; VI-NEXT:    s_waitcnt vmcnt(0)
1052; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
1053; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1054; VI-NEXT:    s_endpgm
1055;
1056; EG-LABEL: v_ctpop_i16_add_inline_constant:
1057; EG:       ; %bb.0:
1058; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1059; EG-NEXT:    TEX 0 @6
1060; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1061; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1062; EG-NEXT:    CF_END
1063; EG-NEXT:    PAD
1064; EG-NEXT:    Fetch clause starting at 6:
1065; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1066; EG-NEXT:    ALU clause starting at 8:
1067; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1068; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1069; EG-NEXT:    ALU clause starting at 10:
1070; EG-NEXT:     BCNT_INT T0.W, T0.X,
1071; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1072; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1073; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1074; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1075; EG-NEXT:    4(5.605194e-45), 3(4.203895e-45)
1076; EG-NEXT:     LSHL T0.X, PV.W, PS,
1077; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1078; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1079; EG-NEXT:     MOV T0.Y, 0.0,
1080; EG-NEXT:     MOV * T0.Z, 0.0,
1081; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1082; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Load %in[workitem.id.x], count its set bits, add 4, store to %out.
1083  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1084  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1085  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1086  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1087  %add = add i16 %ctpop, 4
1088  store i16 %add, i16 addrspace(1)* %out, align 4
1089  ret void
1090}
1091
; 4 + ctpop(i16): the add operands are commuted so that the constant is the
; left-hand operand in the IR.
; The SI and VI checks still expect the constant 4 as the last operand of
; v_bcnt_u32_b32 with no separate add. The EG checks expect BCNT_INT followed
; by an ADD_INT with literal 4.
1092define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
1093; SI-LABEL: v_ctpop_i16_add_inline_constant_inv:
1094; SI:       ; %bb.0:
1095; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1096; SI-NEXT:    s_mov_b32 s3, 0xf000
1097; SI-NEXT:    s_mov_b32 s6, 0
1098; SI-NEXT:    s_mov_b32 s7, s3
1099; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1100; SI-NEXT:    v_mov_b32_e32 v1, 0
1101; SI-NEXT:    s_waitcnt lgkmcnt(0)
1102; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1103; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1104; SI-NEXT:    s_mov_b32 s2, -1
1105; SI-NEXT:    s_waitcnt vmcnt(0)
1106; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 4
1107; SI-NEXT:    s_waitcnt lgkmcnt(0)
1108; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1109; SI-NEXT:    s_endpgm
1110;
1111; VI-LABEL: v_ctpop_i16_add_inline_constant_inv:
1112; VI:       ; %bb.0:
1113; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1114; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1115; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1116; VI-NEXT:    s_waitcnt lgkmcnt(0)
1117; VI-NEXT:    v_mov_b32_e32 v1, s3
1118; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1119; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1120; VI-NEXT:    flat_load_ushort v0, v[0:1]
1121; VI-NEXT:    s_mov_b32 s3, 0xf000
1122; VI-NEXT:    s_mov_b32 s2, -1
1123; VI-NEXT:    s_waitcnt vmcnt(0)
1124; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
1125; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1126; VI-NEXT:    s_endpgm
1127;
1128; EG-LABEL: v_ctpop_i16_add_inline_constant_inv:
1129; EG:       ; %bb.0:
1130; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1131; EG-NEXT:    TEX 0 @6
1132; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1133; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1134; EG-NEXT:    CF_END
1135; EG-NEXT:    PAD
1136; EG-NEXT:    Fetch clause starting at 6:
1137; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1138; EG-NEXT:    ALU clause starting at 8:
1139; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1140; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1141; EG-NEXT:    ALU clause starting at 10:
1142; EG-NEXT:     BCNT_INT T0.W, T0.X,
1143; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1144; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1145; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1146; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1147; EG-NEXT:    4(5.605194e-45), 3(4.203895e-45)
1148; EG-NEXT:     LSHL T0.X, PV.W, PS,
1149; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1150; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1151; EG-NEXT:     MOV T0.Y, 0.0,
1152; EG-NEXT:     MOV * T0.Z, 0.0,
1153; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1154; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Load %in[workitem.id.x], count its set bits, compute 4 + count (constant
  ; on the left), store to %out.
1155  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1156  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1157  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1158  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1159  %add = add i16 4, %ctpop
1160  store i16 %add, i16 addrspace(1)* %out, align 4
1161  ret void
1162}
1163
; ctpop(i16) + 999, a constant that the checks show is not used as an immediate
; operand of the bit-count instruction.
; The SI and VI checks expect 999 (0x3e7) to be materialized into s4 with
; s_movk_i32 and then passed as the last operand of v_bcnt_u32_b32, with no
; separate add. The EG checks expect BCNT_INT followed by an ADD_INT with
; literal 999.
1164define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
1165; SI-LABEL: v_ctpop_i16_add_literal:
1166; SI:       ; %bb.0:
1167; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1168; SI-NEXT:    s_mov_b32 s3, 0xf000
1169; SI-NEXT:    s_mov_b32 s6, 0
1170; SI-NEXT:    s_mov_b32 s7, s3
1171; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1172; SI-NEXT:    v_mov_b32_e32 v1, 0
1173; SI-NEXT:    s_waitcnt lgkmcnt(0)
1174; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1175; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1176; SI-NEXT:    s_movk_i32 s4, 0x3e7
1177; SI-NEXT:    s_mov_b32 s2, -1
1178; SI-NEXT:    s_waitcnt vmcnt(0)
1179; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s4
1180; SI-NEXT:    s_waitcnt lgkmcnt(0)
1181; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1182; SI-NEXT:    s_endpgm
1183;
1184; VI-LABEL: v_ctpop_i16_add_literal:
1185; VI:       ; %bb.0:
1186; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1187; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1188; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1189; VI-NEXT:    s_movk_i32 s4, 0x3e7
1190; VI-NEXT:    s_waitcnt lgkmcnt(0)
1191; VI-NEXT:    v_mov_b32_e32 v1, s3
1192; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1193; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1194; VI-NEXT:    flat_load_ushort v0, v[0:1]
1195; VI-NEXT:    s_mov_b32 s3, 0xf000
1196; VI-NEXT:    s_mov_b32 s2, -1
1197; VI-NEXT:    s_waitcnt vmcnt(0)
1198; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1199; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1200; VI-NEXT:    s_endpgm
1201;
1202; EG-LABEL: v_ctpop_i16_add_literal:
1203; EG:       ; %bb.0:
1204; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1205; EG-NEXT:    TEX 0 @6
1206; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1207; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1208; EG-NEXT:    CF_END
1209; EG-NEXT:    PAD
1210; EG-NEXT:    Fetch clause starting at 6:
1211; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1212; EG-NEXT:    ALU clause starting at 8:
1213; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1214; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1215; EG-NEXT:    ALU clause starting at 10:
1216; EG-NEXT:     BCNT_INT T0.W, T0.X,
1217; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1218; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1219; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1220; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1221; EG-NEXT:    999(1.399897e-42), 3(4.203895e-45)
1222; EG-NEXT:     LSHL T0.X, PV.W, PS,
1223; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1224; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1225; EG-NEXT:     MOV T0.Y, 0.0,
1226; EG-NEXT:     MOV * T0.Z, 0.0,
1227; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1228; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Load %in[workitem.id.x], count its set bits, add 999, store to %out.
1229  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1230  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1231  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1232  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1233  %add = add i16 %ctpop, 999
1234  store i16 %add, i16 addrspace(1)* %out, align 4
1235  ret void
1236}
1237
; ctpop(i16) + %const, where %const is an i16 kernel argument.
; The SI and VI checks expect the argument to be fetched with s_load_dword
; (into s8 on SI, s4 on VI) and passed as the last operand of v_bcnt_u32_b32,
; with no separate add. The EG checks expect the argument to be read with a
; second VTX_READ_16 and added to the BCNT_INT result with ADD_INT, followed
; by an AND_INT with 65535.
1238define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
1239; SI-LABEL: v_ctpop_i16_add_var:
1240; SI:       ; %bb.0:
1241; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1242; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
1243; SI-NEXT:    s_mov_b32 s3, 0xf000
1244; SI-NEXT:    s_mov_b32 s6, 0
1245; SI-NEXT:    s_mov_b32 s7, s3
1246; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1247; SI-NEXT:    v_mov_b32_e32 v1, 0
1248; SI-NEXT:    s_waitcnt lgkmcnt(0)
1249; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1250; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1251; SI-NEXT:    s_mov_b32 s2, -1
1252; SI-NEXT:    s_waitcnt vmcnt(0)
1253; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s8
1254; SI-NEXT:    s_waitcnt lgkmcnt(0)
1255; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1256; SI-NEXT:    s_endpgm
1257;
1258; VI-LABEL: v_ctpop_i16_add_var:
1259; VI:       ; %bb.0:
1260; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1261; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1262; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1263; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1264; VI-NEXT:    s_waitcnt lgkmcnt(0)
1265; VI-NEXT:    v_mov_b32_e32 v1, s3
1266; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1267; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1268; VI-NEXT:    flat_load_ushort v0, v[0:1]
1269; VI-NEXT:    s_mov_b32 s3, 0xf000
1270; VI-NEXT:    s_mov_b32 s2, -1
1271; VI-NEXT:    s_waitcnt vmcnt(0)
1272; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1273; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1274; VI-NEXT:    s_endpgm
1275;
1276; EG-LABEL: v_ctpop_i16_add_var:
1277; EG:       ; %bb.0:
1278; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1279; EG-NEXT:    TEX 0 @8
1280; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1281; EG-NEXT:    TEX 0 @10
1282; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1283; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1284; EG-NEXT:    CF_END
1285; EG-NEXT:    PAD
1286; EG-NEXT:    Fetch clause starting at 8:
1287; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1288; EG-NEXT:    Fetch clause starting at 10:
1289; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
1290; EG-NEXT:    ALU clause starting at 12:
1291; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1292; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1293; EG-NEXT:    ALU clause starting at 14:
1294; EG-NEXT:     MOV * T1.X, 0.0,
1295; EG-NEXT:    ALU clause starting at 15:
1296; EG-NEXT:     BCNT_INT T0.W, T0.X,
1297; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1298; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1299; EG-NEXT:     ADD_INT * T0.W, PV.W, T1.X,
1300; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1301; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1302; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1303; EG-NEXT:     LSHL T0.X, PV.W, PS,
1304; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1305; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1306; EG-NEXT:     MOV T0.Y, 0.0,
1307; EG-NEXT:     MOV * T0.Z, 0.0,
1308; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1309; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Load %in[workitem.id.x], count its set bits, add the kernel argument
  ; %const (right-hand operand), store to %out.
1310  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1311  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1312  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1313  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1314  %add = add i16 %ctpop, %const
1315  store i16 %add, i16 addrspace(1)* %out, align 4
1316  ret void
1317}
1318
; %const + ctpop(i16): the i16 kernel argument is the left-hand add operand.
; The SI and VI checks still expect the argument (s8 on SI, s4 on VI) as the
; last operand of v_bcnt_u32_b32 with no separate add. The EG checks expect
; BCNT_INT plus an ADD_INT whose first source is the fetched argument (T1.X)
; and whose second is the bit count.
1319define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
1320; SI-LABEL: v_ctpop_i16_add_var_inv:
1321; SI:       ; %bb.0:
1322; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1323; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
1324; SI-NEXT:    s_mov_b32 s3, 0xf000
1325; SI-NEXT:    s_mov_b32 s6, 0
1326; SI-NEXT:    s_mov_b32 s7, s3
1327; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1328; SI-NEXT:    v_mov_b32_e32 v1, 0
1329; SI-NEXT:    s_waitcnt lgkmcnt(0)
1330; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1331; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1332; SI-NEXT:    s_mov_b32 s2, -1
1333; SI-NEXT:    s_waitcnt vmcnt(0)
1334; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s8
1335; SI-NEXT:    s_waitcnt lgkmcnt(0)
1336; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1337; SI-NEXT:    s_endpgm
1338;
1339; VI-LABEL: v_ctpop_i16_add_var_inv:
1340; VI:       ; %bb.0:
1341; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1342; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1343; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1344; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1345; VI-NEXT:    s_waitcnt lgkmcnt(0)
1346; VI-NEXT:    v_mov_b32_e32 v1, s3
1347; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1348; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1349; VI-NEXT:    flat_load_ushort v0, v[0:1]
1350; VI-NEXT:    s_mov_b32 s3, 0xf000
1351; VI-NEXT:    s_mov_b32 s2, -1
1352; VI-NEXT:    s_waitcnt vmcnt(0)
1353; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1354; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1355; VI-NEXT:    s_endpgm
1356;
1357; EG-LABEL: v_ctpop_i16_add_var_inv:
1358; EG:       ; %bb.0:
1359; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1360; EG-NEXT:    TEX 0 @8
1361; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1362; EG-NEXT:    TEX 0 @10
1363; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1364; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1365; EG-NEXT:    CF_END
1366; EG-NEXT:    PAD
1367; EG-NEXT:    Fetch clause starting at 8:
1368; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1369; EG-NEXT:    Fetch clause starting at 10:
1370; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
1371; EG-NEXT:    ALU clause starting at 12:
1372; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1373; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1374; EG-NEXT:    ALU clause starting at 14:
1375; EG-NEXT:     MOV * T1.X, 0.0,
1376; EG-NEXT:    ALU clause starting at 15:
1377; EG-NEXT:     BCNT_INT T0.W, T0.X,
1378; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1379; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1380; EG-NEXT:     ADD_INT * T0.W, T1.X, PV.W,
1381; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1382; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1383; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1384; EG-NEXT:     LSHL T0.X, PV.W, PS,
1385; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1386; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1387; EG-NEXT:     MOV T0.Y, 0.0,
1388; EG-NEXT:     MOV * T0.Z, 0.0,
1389; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1390; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Load %in[workitem.id.x], count its set bits, compute %const + count
  ; (kernel argument on the left), store to %out.
1391  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1392  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1393  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1394  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1395  %add = add i16 %const, %ctpop
1396  store i16 %add, i16 addrspace(1)* %out, align 4
1397  ret void
1398}
1399
; %const + ctpop(i16), where %const is itself loaded per workitem from
; %constptr[workitem.id.x] rather than passed as a kernel argument.
; The SI and VI checks expect two ushort loads and a single v_bcnt_u32_b32
; whose operands are both vector registers (counted value first, loaded addend
; last), with no separate add. The EG checks expect two VTX_READ_16 fetches,
; BCNT_INT and an ADD_INT of the second fetched value with the bit count.
1400define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind {
1401; SI-LABEL: v_ctpop_i16_add_vvar_inv:
1402; SI:       ; %bb.0:
1403; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1404; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1405; SI-NEXT:    s_mov_b32 s3, 0xf000
1406; SI-NEXT:    s_mov_b32 s6, 0
1407; SI-NEXT:    s_mov_b32 s7, s3
1408; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1409; SI-NEXT:    v_mov_b32_e32 v1, 0
1410; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
1411; SI-NEXT:    s_waitcnt lgkmcnt(0)
1412; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
1413; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
1414; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1415; SI-NEXT:    s_mov_b32 s2, -1
1416; SI-NEXT:    s_waitcnt vmcnt(0)
1417; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v2, v0
1418; SI-NEXT:    s_waitcnt lgkmcnt(0)
1419; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1420; SI-NEXT:    s_endpgm
1421;
1422; VI-LABEL: v_ctpop_i16_add_vvar_inv:
1423; VI:       ; %bb.0:
1424; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1425; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1426; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1427; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1428; VI-NEXT:    s_waitcnt lgkmcnt(0)
1429; VI-NEXT:    v_mov_b32_e32 v1, s3
1430; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1431; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1432; VI-NEXT:    flat_load_ushort v3, v[0:1]
1433; VI-NEXT:    v_mov_b32_e32 v1, s5
1434; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1435; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1436; VI-NEXT:    flat_load_ushort v0, v[0:1]
1437; VI-NEXT:    s_mov_b32 s3, 0xf000
1438; VI-NEXT:    s_mov_b32 s2, -1
1439; VI-NEXT:    s_waitcnt vmcnt(0)
1440; VI-NEXT:    v_bcnt_u32_b32 v0, v3, v0
1441; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1442; VI-NEXT:    s_endpgm
1443;
1444; EG-LABEL: v_ctpop_i16_add_vvar_inv:
1445; EG:       ; %bb.0:
1446; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1447; EG-NEXT:    TEX 0 @8
1448; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1449; EG-NEXT:    TEX 0 @10
1450; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1451; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1452; EG-NEXT:    CF_END
1453; EG-NEXT:    PAD
1454; EG-NEXT:    Fetch clause starting at 8:
1455; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1456; EG-NEXT:    Fetch clause starting at 10:
1457; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
1458; EG-NEXT:    ALU clause starting at 12:
1459; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1460; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1461; EG-NEXT:    ALU clause starting at 14:
1462; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, T0.W,
1463; EG-NEXT:    ALU clause starting at 15:
1464; EG-NEXT:     BCNT_INT T0.W, T0.X,
1465; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1466; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1467; EG-NEXT:     ADD_INT * T0.W, T1.X, PV.W,
1468; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1469; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1470; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1471; EG-NEXT:     LSHL T0.X, PV.W, PS,
1472; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1473; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1474; EG-NEXT:     MOV T0.Y, 0.0,
1475; EG-NEXT:     MOV * T0.Z, 0.0,
1476; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1477; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Both the counted value and the addend are indexed by the same workitem id.
1478  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1479  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
1480  %val = load i16, i16 addrspace(1)* %in.gep, align 4
1481  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  ; Addend comes from memory (%constptr[tid]) and is the left-hand add operand.
1482  %gep = getelementptr i16, i16 addrspace(1)* %constptr, i32 %tid
1483  %const = load i16, i16 addrspace(1)* %gep, align 4
1484  %add = add i16 %const, %ctpop
1485  store i16 %add, i16 addrspace(1)* %out, align 4
1486  ret void
1487}
1488
1489; FIXME: We currently disallow SALU instructions in all branches,
1490; but there are some cases when they should be allowed.
; Kernel under test: an i16 ctpop that is evaluated on only one side of a
; branch. When %cond is zero the %if block counts the bits of %ctpop_arg;
; otherwise the %else block loads an i16 from element 1 of %in. The %endif
; block stores whichever value was produced to %out.
; In the SI and VI checks below, the %if block is expected to use the scalar
; s_bcnt1_i32_b32 instruction.
1491define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %ctpop_arg, i16 %cond) {
1492; SI-LABEL: ctpop_i16_in_br:
1493; SI:       ; %bb.0: ; %entry
1494; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
1495; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1496; SI-NEXT:    s_waitcnt lgkmcnt(0)
1497; SI-NEXT:    s_lshr_b32 s5, s4, 16
1498; SI-NEXT:    s_cmp_lg_u32 s5, 0
1499; SI-NEXT:    s_cbranch_scc0 .LBB14_4
1500; SI-NEXT:  ; %bb.1: ; %else
1501; SI-NEXT:    s_mov_b32 s11, 0xf000
1502; SI-NEXT:    s_mov_b32 s10, -1
1503; SI-NEXT:    s_mov_b32 s8, s2
1504; SI-NEXT:    s_mov_b32 s9, s3
1505; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1506; SI-NEXT:    s_mov_b64 s[2:3], 0
1507; SI-NEXT:    s_cbranch_execnz .LBB14_3
1508; SI-NEXT:  .LBB14_2: ; %if
1509; SI-NEXT:    s_and_b32 s2, s4, 0xffff
1510; SI-NEXT:    s_bcnt1_i32_b32 s2, s2
1511; SI-NEXT:    s_waitcnt vmcnt(0)
1512; SI-NEXT:    v_mov_b32_e32 v0, s2
1513; SI-NEXT:  .LBB14_3: ; %endif
1514; SI-NEXT:    s_mov_b32 s3, 0xf000
1515; SI-NEXT:    s_mov_b32 s2, -1
1516; SI-NEXT:    s_waitcnt vmcnt(0)
1517; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1518; SI-NEXT:    s_endpgm
1519; SI-NEXT:  .LBB14_4:
1520; SI-NEXT:    s_mov_b64 s[2:3], -1
1521; SI-NEXT:    v_mov_b32_e32 v0, 0
1522; SI-NEXT:    s_branch .LBB14_2
1523;
1524; VI-LABEL: ctpop_i16_in_br:
1525; VI:       ; %bb.0: ; %entry
1526; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
1527; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1528; VI-NEXT:    s_waitcnt lgkmcnt(0)
1529; VI-NEXT:    s_lshr_b32 s5, s4, 16
1530; VI-NEXT:    v_cmp_ne_u16_e64 s[6:7], s5, 0
1531; VI-NEXT:    s_and_b64 vcc, exec, s[6:7]
1532; VI-NEXT:    s_cbranch_vccz .LBB14_4
1533; VI-NEXT:  ; %bb.1: ; %else
1534; VI-NEXT:    s_mov_b32 s11, 0xf000
1535; VI-NEXT:    s_mov_b32 s10, -1
1536; VI-NEXT:    s_mov_b32 s8, s2
1537; VI-NEXT:    s_mov_b32 s9, s3
1538; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1539; VI-NEXT:    s_mov_b64 s[2:3], 0
1540; VI-NEXT:    s_cbranch_execnz .LBB14_3
1541; VI-NEXT:  .LBB14_2: ; %if
1542; VI-NEXT:    s_and_b32 s2, s4, 0xffff
1543; VI-NEXT:    s_bcnt1_i32_b32 s2, s2
1544; VI-NEXT:    s_waitcnt vmcnt(0)
1545; VI-NEXT:    v_mov_b32_e32 v0, s2
1546; VI-NEXT:  .LBB14_3: ; %endif
1547; VI-NEXT:    s_mov_b32 s3, 0xf000
1548; VI-NEXT:    s_mov_b32 s2, -1
1549; VI-NEXT:    s_waitcnt vmcnt(0)
1550; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1551; VI-NEXT:    s_endpgm
1552; VI-NEXT:  .LBB14_4:
1553; VI-NEXT:    s_mov_b64 s[2:3], -1
1554; VI-NEXT:    ; implicit-def: $vgpr0
1555; VI-NEXT:    s_branch .LBB14_2
1556;
1557; EG-LABEL: ctpop_i16_in_br:
1558; EG:       ; %bb.0: ; %entry
1559; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
1560; EG-NEXT:    TEX 0 @14
1561; EG-NEXT:    ALU_PUSH_BEFORE 6, @21, KC0[], KC1[]
1562; EG-NEXT:    JUMP @7 POP:1
1563; EG-NEXT:    ALU 0, @28, KC0[CB0:0-32], KC1[]
1564; EG-NEXT:    TEX 0 @16
1565; EG-NEXT:    ALU_POP_AFTER 1, @29, KC0[], KC1[]
1566; EG-NEXT:    ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
1567; EG-NEXT:    JUMP @11 POP:1
1568; EG-NEXT:    TEX 0 @18
1569; EG-NEXT:    ALU_POP_AFTER 0, @34, KC0[], KC1[]
1570; EG-NEXT:    ALU 11, @35, KC0[], KC1[]
1571; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
1572; EG-NEXT:    CF_END
1573; EG-NEXT:    Fetch clause starting at 14:
1574; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 46, #3
1575; EG-NEXT:    Fetch clause starting at 16:
1576; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 2, #1
1577; EG-NEXT:    Fetch clause starting at 18:
1578; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
1579; EG-NEXT:    ALU clause starting at 20:
1580; EG-NEXT:     MOV * T0.X, 0.0,
1581; EG-NEXT:    ALU clause starting at 21:
1582; EG-NEXT:     AND_INT * T0.W, T1.X, literal.x,
1583; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1584; EG-NEXT:     MOV T1.X, literal.x,
1585; EG-NEXT:     MOV T1.W, literal.y,
1586; EG-NEXT:     SETNE_INT * T0.W, PV.W, 0.0,
1587; EG-NEXT:    0(0.000000e+00), 1(1.401298e-45)
1588; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1589; EG-NEXT:    ALU clause starting at 28:
1590; EG-NEXT:     MOV * T1.X, KC0[2].Z,
1591; EG-NEXT:    ALU clause starting at 29:
1592; EG-NEXT:     MOV * T1.W, literal.x,
1593; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1594; EG-NEXT:    ALU clause starting at 31:
1595; EG-NEXT:     MOV T0.W, KC0[2].Y,
1596; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
1597; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1598; EG-NEXT:    ALU clause starting at 34:
1599; EG-NEXT:     BCNT_INT * T1.X, T0.X,
1600; EG-NEXT:    ALU clause starting at 35:
1601; EG-NEXT:     LSHL * T1.W, T0.W, literal.x,
1602; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1603; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1604; EG-NEXT:     AND_INT * T2.W, T1.X, literal.y,
1605; EG-NEXT:    24(3.363116e-44), 65535(9.183409e-41)
1606; EG-NEXT:     LSHL T1.X, PS, PV.W,
1607; EG-NEXT:     LSHL * T1.W, literal.x, PV.W,
1608; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1609; EG-NEXT:     MOV T1.Y, 0.0,
1610; EG-NEXT:     MOV * T1.Z, 0.0,
1611; EG-NEXT:     LSHR * T0.X, T0.W, literal.x,
1612; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1613entry:
1614  %tmp0 = icmp eq i16 %cond, 0 ; true when the %cond argument is zero
1615  br i1 %tmp0, label %if, label %else ; zero takes %if, nonzero takes %else
1616
1617if:
1618  %tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg) ; ctpop of the kernel argument, computed only on this path
1619  br label %endif
1620
1621else:
1622  %tmp3 = getelementptr i16, i16 addrspace(1)* %in, i16 1 ; address of element 1 of %in
1623  %tmp4 = load i16, i16 addrspace(1)* %tmp3 ; value used when the ctpop path is not taken
1624  br label %endif
1625
1626endif:
1627  %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else] ; pick the value from whichever predecessor ran
1628  store i16 %tmp5, i16 addrspace(1)* %out ; single store shared by both paths
1629  ret void
1630}
1631