; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

; i8 ctpop lowering: x -= (x>>1) & 0x55, 2-bit sums via 0x33 masks, nibble fold masked with 15.
define <vscale x 1 x i8> @ctpop_nxv1i8(<vscale x 1 x i8> %va) {
; CHECK-LABEL: ctpop_nxv1i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, mu
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    li a0, 85
; CHECK-NEXT:    vand.vx v9, v9, a0
; CHECK-NEXT:    vsub.vv v8, v8, v9
; CHECK-NEXT:    li a0, 51
; CHECK-NEXT:    vand.vx v9, v8, a0
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a0
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    ret
  %a = call <vscale x 1 x i8> @llvm.ctpop.nxv1i8(<vscale x 1 x i8> %va)
  ret <vscale x 1 x i8> %a
}
declare <vscale x 1 x i8> @llvm.ctpop.nxv1i8(<vscale x 1 x i8>)

; i8 ctpop lowering at mf4 (same SWAR sequence as nxv1i8).
define <vscale x 2 x i8> @ctpop_nxv2i8(<vscale x 2 x i8> %va) {
; CHECK-LABEL: ctpop_nxv2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    li a0, 85
; CHECK-NEXT:    vand.vx v9, v9, a0
; CHECK-NEXT:    vsub.vv v8, v8, v9
; CHECK-NEXT:    li a0, 51
; CHECK-NEXT:    vand.vx v9, v8, a0
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a0
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    ret
  %a = call <vscale x 2 x i8> @llvm.ctpop.nxv2i8(<vscale x 2 x i8> %va)
  ret <vscale x 2 x i8> %a
}
declare <vscale x 2 x i8> @llvm.ctpop.nxv2i8(<vscale x 2 x i8>)

; i8 ctpop lowering at mf2 (same SWAR sequence as nxv1i8).
define <vscale x 4 x i8> @ctpop_nxv4i8(<vscale x 4 x i8> %va) {
; CHECK-LABEL: ctpop_nxv4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, mu
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    li a0, 85
; CHECK-NEXT:    vand.vx v9, v9, a0
; CHECK-NEXT:    vsub.vv v8, v8, v9
; CHECK-NEXT:    li a0, 51
; CHECK-NEXT:    vand.vx v9, v8, a0
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a0
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    ret
  %a = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> %va)
  ret <vscale x 4 x i8> %a
}
declare <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8>)

; i8 ctpop lowering at m1 (same SWAR sequence as nxv1i8).
define <vscale x 8 x i8> @ctpop_nxv8i8(<vscale x 8 x i8> %va) {
; CHECK-LABEL: ctpop_nxv8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, mu
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    li a0, 85
; CHECK-NEXT:    vand.vx v9, v9, a0
; CHECK-NEXT:    vsub.vv v8, v8, v9
; CHECK-NEXT:    li a0, 51
; CHECK-NEXT:    vand.vx v9, v8, a0
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a0
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    ret
  %a = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8> %va)
  ret <vscale x 8 x i8> %a
}
declare <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8>)

; i8 ctpop lowering at m2; scratch register moves to v10 for the wider LMUL.
define <vscale x 16 x i8> @ctpop_nxv16i8(<vscale x 16 x i8> %va) {
; CHECK-LABEL: ctpop_nxv16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, mu
; CHECK-NEXT:    vsrl.vi v10, v8, 1
; CHECK-NEXT:    li a0, 85
; CHECK-NEXT:    vand.vx v10, v10, a0
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    li a0, 51
; CHECK-NEXT:    vand.vx v10, v8, a0
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a0
; CHECK-NEXT:    vadd.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    ret
  %a = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> %va)
  ret <vscale x 16 x i8> %a
}
declare <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8>)

; i8 ctpop lowering at m4; scratch register moves to v12.
define <vscale x 32 x i8> @ctpop_nxv32i8(<vscale x 32 x i8> %va) {
; CHECK-LABEL: ctpop_nxv32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, mu
; CHECK-NEXT:    vsrl.vi v12, v8, 1
; CHECK-NEXT:    li a0, 85
; CHECK-NEXT:    vand.vx v12, v12, a0
; CHECK-NEXT:    vsub.vv v8, v8, v12
; CHECK-NEXT:    li a0, 51
; CHECK-NEXT:    vand.vx v12, v8, a0
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a0
; CHECK-NEXT:    vadd.vv v8, v12, v8
; CHECK-NEXT:    vsrl.vi v12, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v12
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    ret
  %a = call <vscale x 32 x i8> @llvm.ctpop.nxv32i8(<vscale x 32 x i8> %va)
  ret <vscale x 32 x i8> %a
}
declare <vscale x 32 x i8> @llvm.ctpop.nxv32i8(<vscale x 32 x i8>)

; i8 ctpop lowering at m8; scratch register moves to v16.
define <vscale x 64 x i8> @ctpop_nxv64i8(<vscale x 64 x i8> %va) {
; CHECK-LABEL: ctpop_nxv64i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, mu
; CHECK-NEXT:    vsrl.vi v16, v8, 1
; CHECK-NEXT:    li a0, 85
; CHECK-NEXT:    vand.vx v16, v16, a0
; CHECK-NEXT:    vsub.vv v8, v8, v16
; CHECK-NEXT:    li a0, 51
; CHECK-NEXT:    vand.vx v16, v8, a0
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a0
; CHECK-NEXT:    vadd.vv v8, v16, v8
; CHECK-NEXT:    vsrl.vi v16, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    ret
  %a = call <vscale x 64 x i8> @llvm.ctpop.nxv64i8(<vscale x 64 x i8> %va)
  ret <vscale x 64 x i8> %a
}
declare <vscale x 64 x i8> @llvm.ctpop.nxv64i8(<vscale x 64 x i8>)

; i16 ctpop lowering: 0x5555/0x3333/0x0f0f masks, then multiply by 0x0101 (257) and shift right 8.
; RV32 and RV64 differ only in addi vs. addiw constant materialization.
define <vscale x 1 x i16> @ctpop_nxv1i16(<vscale x 1 x i16> %va) {
; RV32-LABEL: ctpop_nxv1i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
; RV32-NEXT:    vsrl.vi v9, v8, 1
; RV32-NEXT:    lui a0, 5
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v9, v9, a0
; RV32-NEXT:    vsub.vv v8, v8, v9
; RV32-NEXT:    lui a0, 3
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v9, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v9, v8
; RV32-NEXT:    vsrl.vi v9, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v9
; RV32-NEXT:    lui a0, 1
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    li a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 8
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv1i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    lui a0, 5
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v9, v9, a0
; RV64-NEXT:    vsub.vv v8, v8, v9
; RV64-NEXT:    lui a0, 3
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v9, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v9
; RV64-NEXT:    lui a0, 1
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    li a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 8
; RV64-NEXT:    ret
  %a = call <vscale x 1 x i16> @llvm.ctpop.nxv1i16(<vscale x 1 x i16> %va)
  ret <vscale x 1 x i16> %a
}
declare <vscale x 1 x i16> @llvm.ctpop.nxv1i16(<vscale x 1 x i16>)

; i16 ctpop lowering at mf2 (same sequence as nxv1i16).
define <vscale x 2 x i16> @ctpop_nxv2i16(<vscale x 2 x i16> %va) {
; RV32-LABEL: ctpop_nxv2i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
; RV32-NEXT:    vsrl.vi v9, v8, 1
; RV32-NEXT:    lui a0, 5
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v9, v9, a0
; RV32-NEXT:    vsub.vv v8, v8, v9
; RV32-NEXT:    lui a0, 3
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v9, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v9, v8
; RV32-NEXT:    vsrl.vi v9, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v9
; RV32-NEXT:    lui a0, 1
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    li a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 8
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv2i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    lui a0, 5
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v9, v9, a0
; RV64-NEXT:    vsub.vv v8, v8, v9
; RV64-NEXT:    lui a0, 3
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v9, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v9
; RV64-NEXT:    lui a0, 1
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    li a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 8
; RV64-NEXT:    ret
  %a = call <vscale x 2 x i16> @llvm.ctpop.nxv2i16(<vscale x 2 x i16> %va)
  ret <vscale x 2 x i16> %a
}
declare <vscale x 2 x i16> @llvm.ctpop.nxv2i16(<vscale x 2 x i16>)

; i16 ctpop lowering at m1 (same sequence as nxv1i16).
define <vscale x 4 x i16> @ctpop_nxv4i16(<vscale x 4 x i16> %va) {
; RV32-LABEL: ctpop_nxv4i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
; RV32-NEXT:    vsrl.vi v9, v8, 1
; RV32-NEXT:    lui a0, 5
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v9, v9, a0
; RV32-NEXT:    vsub.vv v8, v8, v9
; RV32-NEXT:    lui a0, 3
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v9, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v9, v8
; RV32-NEXT:    vsrl.vi v9, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v9
; RV32-NEXT:    lui a0, 1
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    li a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 8
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv4i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    lui a0, 5
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v9, v9, a0
; RV64-NEXT:    vsub.vv v8, v8, v9
; RV64-NEXT:    lui a0, 3
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v9, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v9
; RV64-NEXT:    lui a0, 1
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    li a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 8
; RV64-NEXT:    ret
  %a = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16> %va)
  ret <vscale x 4 x i16> %a
}
declare <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16>)

; i16 ctpop lowering at m2; scratch register moves to v10.
define <vscale x 8 x i16> @ctpop_nxv8i16(<vscale x 8 x i16> %va) {
; RV32-LABEL: ctpop_nxv8i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
; RV32-NEXT:    vsrl.vi v10, v8, 1
; RV32-NEXT:    lui a0, 5
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v10, v10, a0
; RV32-NEXT:    vsub.vv v8, v8, v10
; RV32-NEXT:    lui a0, 3
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v10, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v10, v8
; RV32-NEXT:    vsrl.vi v10, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v10
; RV32-NEXT:    lui a0, 1
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    li a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 8
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv8i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
; RV64-NEXT:    vsrl.vi v10, v8, 1
; RV64-NEXT:    lui a0, 5
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v10, v10, a0
; RV64-NEXT:    vsub.vv v8, v8, v10
; RV64-NEXT:    lui a0, 3
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v10, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v10, v8
; RV64-NEXT:    vsrl.vi v10, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v10
; RV64-NEXT:    lui a0, 1
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    li a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 8
; RV64-NEXT:    ret
  %a = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> %va)
  ret <vscale x 8 x i16> %a
}
declare <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16>)

; i16 ctpop lowering at m4; scratch register moves to v12.
define <vscale x 16 x i16> @ctpop_nxv16i16(<vscale x 16 x i16> %va) {
; RV32-LABEL: ctpop_nxv16i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
; RV32-NEXT:    vsrl.vi v12, v8, 1
; RV32-NEXT:    lui a0, 5
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v12, v12, a0
; RV32-NEXT:    vsub.vv v8, v8, v12
; RV32-NEXT:    lui a0, 3
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v12, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v12, v8
; RV32-NEXT:    vsrl.vi v12, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v12
; RV32-NEXT:    lui a0, 1
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    li a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 8
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv16i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
; RV64-NEXT:    vsrl.vi v12, v8, 1
; RV64-NEXT:    lui a0, 5
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v12, v12, a0
; RV64-NEXT:    vsub.vv v8, v8, v12
; RV64-NEXT:    lui a0, 3
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v12, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v12, v8
; RV64-NEXT:    vsrl.vi v12, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v12
; RV64-NEXT:    lui a0, 1
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    li a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 8
; RV64-NEXT:    ret
  %a = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> %va)
  ret <vscale x 16 x i16> %a
}
declare <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16>)

; i16 ctpop lowering at m8; scratch register moves to v16.
define <vscale x 32 x i16> @ctpop_nxv32i16(<vscale x 32 x i16> %va) {
; RV32-LABEL: ctpop_nxv32i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
; RV32-NEXT:    vsrl.vi v16, v8, 1
; RV32-NEXT:    lui a0, 5
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v16, v16, a0
; RV32-NEXT:    vsub.vv v8, v8, v16
; RV32-NEXT:    lui a0, 3
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v16, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v16, v8
; RV32-NEXT:    vsrl.vi v16, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v16
; RV32-NEXT:    lui a0, 1
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    li a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 8
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv32i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
; RV64-NEXT:    vsrl.vi v16, v8, 1
; RV64-NEXT:    lui a0, 5
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v16, v16, a0
; RV64-NEXT:    vsub.vv v8, v8, v16
; RV64-NEXT:    lui a0, 3
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v16, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v16, v8
; RV64-NEXT:    vsrl.vi v16, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v16
; RV64-NEXT:    lui a0, 1
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    li a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 8
; RV64-NEXT:    ret
  %a = call <vscale x 32 x i16> @llvm.ctpop.nxv32i16(<vscale x 32 x i16> %va)
  ret <vscale x 32 x i16> %a
}
declare <vscale x 32 x i16> @llvm.ctpop.nxv32i16(<vscale x 32 x i16>)

; i32 ctpop lowering: 0x55555555/0x33333333/0x0f0f0f0f masks, multiply by 0x01010101, shift right 24.
define <vscale x 1 x i32> @ctpop_nxv1i32(<vscale x 1 x i32> %va) {
; RV32-LABEL: ctpop_nxv1i32:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
; RV32-NEXT:    vsrl.vi v9, v8, 1
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v9, v9, a0
; RV32-NEXT:    vsub.vv v8, v8, v9
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v9, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v9, v8
; RV32-NEXT:    vsrl.vi v9, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v9
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 24
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv1i32:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    lui a0, 349525
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v9, v9, a0
; RV64-NEXT:    vsub.vv v8, v8, v9
; RV64-NEXT:    lui a0, 209715
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v9, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v9
; RV64-NEXT:    lui a0, 61681
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    lui a0, 4112
; RV64-NEXT:    addiw a0, a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 24
; RV64-NEXT:    ret
  %a = call <vscale x 1 x i32> @llvm.ctpop.nxv1i32(<vscale x 1 x i32> %va)
  ret <vscale x 1 x i32> %a
}
declare <vscale x 1 x i32> @llvm.ctpop.nxv1i32(<vscale x 1 x i32>)

; i32 ctpop lowering at m1 (same sequence as nxv1i32).
define <vscale x 2 x i32> @ctpop_nxv2i32(<vscale x 2 x i32> %va) {
; RV32-LABEL: ctpop_nxv2i32:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
; RV32-NEXT:    vsrl.vi v9, v8, 1
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v9, v9, a0
; RV32-NEXT:    vsub.vv v8, v8, v9
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v9, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v9, v8
; RV32-NEXT:    vsrl.vi v9, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v9
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 24
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv2i32:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    lui a0, 349525
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v9, v9, a0
; RV64-NEXT:    vsub.vv v8, v8, v9
; RV64-NEXT:    lui a0, 209715
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v9, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v9
; RV64-NEXT:    lui a0, 61681
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    lui a0, 4112
; RV64-NEXT:    addiw a0, a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 24
; RV64-NEXT:    ret
  %a = call <vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32> %va)
  ret <vscale x 2 x i32> %a
}
declare <vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32>)

; i32 ctpop lowering at m2; scratch register moves to v10.
define <vscale x 4 x i32> @ctpop_nxv4i32(<vscale x 4 x i32> %va) {
; RV32-LABEL: ctpop_nxv4i32:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
; RV32-NEXT:    vsrl.vi v10, v8, 1
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v10, v10, a0
; RV32-NEXT:    vsub.vv v8, v8, v10
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v10, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v10, v8
; RV32-NEXT:    vsrl.vi v10, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v10
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 24
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv4i32:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
; RV64-NEXT:    vsrl.vi v10, v8, 1
; RV64-NEXT:    lui a0, 349525
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v10, v10, a0
; RV64-NEXT:    vsub.vv v8, v8, v10
; RV64-NEXT:    lui a0, 209715
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v10, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v10, v8
; RV64-NEXT:    vsrl.vi v10, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v10
; RV64-NEXT:    lui a0, 61681
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    lui a0, 4112
; RV64-NEXT:    addiw a0, a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 24
; RV64-NEXT:    ret
  %a = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> %va)
  ret <vscale x 4 x i32> %a
}
declare <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32>)

; i32 ctpop lowering at m4; scratch register moves to v12.
define <vscale x 8 x i32> @ctpop_nxv8i32(<vscale x 8 x i32> %va) {
; RV32-LABEL: ctpop_nxv8i32:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
; RV32-NEXT:    vsrl.vi v12, v8, 1
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v12, v12, a0
; RV32-NEXT:    vsub.vv v8, v8, v12
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v12, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v12, v8
; RV32-NEXT:    vsrl.vi v12, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v12
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 24
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv8i32:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
; RV64-NEXT:    vsrl.vi v12, v8, 1
; RV64-NEXT:    lui a0, 349525
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v12, v12, a0
; RV64-NEXT:    vsub.vv v8, v8, v12
; RV64-NEXT:    lui a0, 209715
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v12, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v12, v8
; RV64-NEXT:    vsrl.vi v12, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v12
; RV64-NEXT:    lui a0, 61681
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    lui a0, 4112
; RV64-NEXT:    addiw a0, a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 24
; RV64-NEXT:    ret
  %a = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> %va)
  ret <vscale x 8 x i32> %a
}
declare <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32>)

; i32 ctpop lowering at m8; scratch register moves to v16.
define <vscale x 16 x i32> @ctpop_nxv16i32(<vscale x 16 x i32> %va) {
; RV32-LABEL: ctpop_nxv16i32:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
; RV32-NEXT:    vsrl.vi v16, v8, 1
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    vand.vx v16, v16, a0
; RV32-NEXT:    vsub.vv v8, v8, v16
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    vand.vx v16, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    vadd.vv v8, v16, v8
; RV32-NEXT:    vsrl.vi v16, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v16
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    vand.vx v8, v8, a0
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    vmul.vx v8, v8, a0
; RV32-NEXT:    vsrl.vi v8, v8, 24
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv16i32:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
; RV64-NEXT:    vsrl.vi v16, v8, 1
; RV64-NEXT:    lui a0, 349525
; RV64-NEXT:    addiw a0, a0, 1365
; RV64-NEXT:    vand.vx v16, v16, a0
; RV64-NEXT:    vsub.vv v8, v8, v16
; RV64-NEXT:    lui a0, 209715
; RV64-NEXT:    addiw a0, a0, 819
; RV64-NEXT:    vand.vx v16, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vadd.vv v8, v16, v8
; RV64-NEXT:    vsrl.vi v16, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v16
; RV64-NEXT:    lui a0, 61681
; RV64-NEXT:    addiw a0, a0, -241
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    lui a0, 4112
; RV64-NEXT:    addiw a0, a0, 257
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vsrl.vi v8, v8, 24
; RV64-NEXT:    ret
  %a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
  ret <vscale x 16 x i32> %a
}
declare <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32>)

; i64 ctpop lowering: RV32 builds 64-bit splat masks from the stack via zero-stride vlse64;
; RV64 loads them from the constant pool (.LCPI18_*); final field extracted with a shift by 56.
; NOTE(review): in the RV32 body all four constants are stored to the same 8(sp)/12(sp) slot
; before any vector load — confirm against freshly regenerated llc output.
define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
; RV32-LABEL: ctpop_nxv1i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
; RV32-NEXT:    addi a0, sp, 8
; RV32-NEXT:    vlse64.v v9, (a0), zero
; RV32-NEXT:    vlse64.v v10, (a0), zero
; RV32-NEXT:    vsrl.vi v11, v8, 1
; RV32-NEXT:    vand.vv v9, v11, v9
; RV32-NEXT:    vsub.vv v8, v8, v9
; RV32-NEXT:    vand.vv v9, v8, v10
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v10
; RV32-NEXT:    vadd.vv v8, v9, v8
; RV32-NEXT:    vlse64.v v9, (a0), zero
; RV32-NEXT:    vlse64.v v10, (a0), zero
; RV32-NEXT:    vsrl.vi v11, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v11
; RV32-NEXT:    vand.vv v8, v8, v9
; RV32-NEXT:    vmul.vv v8, v8, v10
; RV32-NEXT:    li a0, 56
; RV32-NEXT:    vsrl.vx v8, v8, a0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv1i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
; RV64-NEXT:    lui a0, %hi(.LCPI18_0)
; RV64-NEXT:    ld a0, %lo(.LCPI18_0)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI18_1)
; RV64-NEXT:    ld a1, %lo(.LCPI18_1)(a1)
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    vand.vx v9, v9, a0
; RV64-NEXT:    vsub.vv v8, v8, v9
; RV64-NEXT:    vand.vx v9, v8, a1
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a1
; RV64-NEXT:    vadd.vv v8, v9, v8
; RV64-NEXT:    lui a0, %hi(.LCPI18_2)
; RV64-NEXT:    ld a0, %lo(.LCPI18_2)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI18_3)
; RV64-NEXT:    ld a1, %lo(.LCPI18_3)(a1)
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v9
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vmul.vx v8, v8, a1
; RV64-NEXT:    li a0, 56
; RV64-NEXT:    vsrl.vx v8, v8, a0
; RV64-NEXT:    ret
  %a = call <vscale x 1 x i64> @llvm.ctpop.nxv1i64(<vscale x 1 x i64> %va)
  ret <vscale x 1 x i64> %a
}
declare <vscale x 1 x i64> @llvm.ctpop.nxv1i64(<vscale x 1 x i64>)

; i64 ctpop lowering at m2 (same structure as nxv1i64; constant pool symbols .LCPI19_*).
; NOTE(review): as in nxv1i64, the RV32 body stores all four constants to the same
; 8(sp)/12(sp) slot before any vector load — confirm against regenerated output.
define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
; RV32-LABEL: ctpop_nxv2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
; RV32-NEXT:    addi a0, sp, 8
; RV32-NEXT:    vlse64.v v10, (a0), zero
; RV32-NEXT:    vlse64.v v12, (a0), zero
; RV32-NEXT:    vsrl.vi v14, v8, 1
; RV32-NEXT:    vand.vv v10, v14, v10
; RV32-NEXT:    vsub.vv v8, v8, v10
; RV32-NEXT:    vand.vv v10, v8, v12
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v12
; RV32-NEXT:    vadd.vv v8, v10, v8
; RV32-NEXT:    vlse64.v v10, (a0), zero
; RV32-NEXT:    vlse64.v v12, (a0), zero
; RV32-NEXT:    vsrl.vi v14, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v14
; RV32-NEXT:    vand.vv v8, v8, v10
; RV32-NEXT:    vmul.vv v8, v8, v12
; RV32-NEXT:    li a0, 56
; RV32-NEXT:    vsrl.vx v8, v8, a0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
; RV64-NEXT:    lui a0, %hi(.LCPI19_0)
; RV64-NEXT:    ld a0, %lo(.LCPI19_0)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI19_1)
; RV64-NEXT:    ld a1, %lo(.LCPI19_1)(a1)
; RV64-NEXT:    vsrl.vi v10, v8, 1
; RV64-NEXT:    vand.vx v10, v10, a0
; RV64-NEXT:    vsub.vv v8, v8, v10
; RV64-NEXT:    vand.vx v10, v8, a1
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a1
; RV64-NEXT:    vadd.vv v8, v10, v8
; RV64-NEXT:    lui a0, %hi(.LCPI19_2)
; RV64-NEXT:    ld a0, %lo(.LCPI19_2)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI19_3)
; RV64-NEXT:    ld a1, %lo(.LCPI19_3)(a1)
; RV64-NEXT:    vsrl.vi v10, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v10
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vmul.vx v8, v8, a1
; RV64-NEXT:    li a0, 56
; RV64-NEXT:    vsrl.vx v8, v8, a0
; RV64-NEXT:    ret
  %a = call <vscale x 2 x i64> @llvm.ctpop.nxv2i64(<vscale x 2 x i64> %va)
  ret <vscale x 2 x i64> %a
}
declare <vscale x 2 x i64> @llvm.ctpop.nxv2i64(<vscale x 2 x i64>)

; Tests lowering of llvm.ctpop for <vscale x 4 x i64> (e64, LMUL=4); same
; parallel bit-count algorithm as the smaller widths, with register groups
; stepping by 4 (v8/v12/v16/v20).
; RV32 splats each 64-bit bit-twiddling constant from a stack slot via
; zero-stride vlse64.v; RV64 loads the constants from the constant pool.
; NOTE(review) -- as in the other RV32 i64 cases, all four constants are
; stored to the same 8(sp)/12(sp) slot before the first vlse64.v, so every
; strided load would observe only the last value written. Looks like a
; captured codegen bug or stale autogenerated checks; TODO confirm by
; regenerating with utils/update_llc_test_checks.py against current llc.
define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
; RV32-LABEL: ctpop_nxv4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
; RV32-NEXT:    addi a0, sp, 8
; RV32-NEXT:    vlse64.v v12, (a0), zero
; RV32-NEXT:    vlse64.v v16, (a0), zero
; RV32-NEXT:    vsrl.vi v20, v8, 1
; RV32-NEXT:    vand.vv v12, v20, v12
; RV32-NEXT:    vsub.vv v8, v8, v12
; RV32-NEXT:    vand.vv v12, v8, v16
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v16
; RV32-NEXT:    vadd.vv v8, v12, v8
; RV32-NEXT:    vlse64.v v12, (a0), zero
; RV32-NEXT:    vlse64.v v16, (a0), zero
; RV32-NEXT:    vsrl.vi v20, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v20
; RV32-NEXT:    vand.vv v8, v8, v12
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    li a0, 56
; RV32-NEXT:    vsrl.vx v8, v8, a0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
; RV64-NEXT:    lui a0, %hi(.LCPI20_0)
; RV64-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI20_1)
; RV64-NEXT:    ld a1, %lo(.LCPI20_1)(a1)
; RV64-NEXT:    vsrl.vi v12, v8, 1
; RV64-NEXT:    vand.vx v12, v12, a0
; RV64-NEXT:    vsub.vv v8, v8, v12
; RV64-NEXT:    vand.vx v12, v8, a1
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a1
; RV64-NEXT:    vadd.vv v8, v12, v8
; RV64-NEXT:    lui a0, %hi(.LCPI20_2)
; RV64-NEXT:    ld a0, %lo(.LCPI20_2)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI20_3)
; RV64-NEXT:    ld a1, %lo(.LCPI20_3)(a1)
; RV64-NEXT:    vsrl.vi v12, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v12
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vmul.vx v8, v8, a1
; RV64-NEXT:    li a0, 56
; RV64-NEXT:    vsrl.vx v8, v8, a0
; RV64-NEXT:    ret
  %a = call <vscale x 4 x i64> @llvm.ctpop.nxv4i64(<vscale x 4 x i64> %va)
  ret <vscale x 4 x i64> %a
}
969declare <vscale x 4 x i64> @llvm.ctpop.nxv4i64(<vscale x 4 x i64>)
970
; Tests lowering of llvm.ctpop for <vscale x 8 x i64> (e64, LMUL=8), the
; widest grouping: register groups are v8/v16/v24, and v0 -- normally the mask
; register -- is free to serve as scratch here because every op is unmasked.
; RV32 splats each 64-bit bit-twiddling constant from a stack slot via
; zero-stride vlse64.v; RV64 loads the constants from the constant pool.
; NOTE(review) -- as in the other RV32 i64 cases, all four constants are
; stored to the same 8(sp)/12(sp) slot before the first vlse64.v, so every
; strided load would observe only the last value written. Looks like a
; captured codegen bug or stale autogenerated checks; TODO confirm by
; regenerating with utils/update_llc_test_checks.py against current llc.
define <vscale x 8 x i64> @ctpop_nxv8i64(<vscale x 8 x i64> %va) {
; RV32-LABEL: ctpop_nxv8i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a0, 349525
; RV32-NEXT:    addi a0, a0, 1365
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 209715
; RV32-NEXT:    addi a0, a0, 819
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 61681
; RV32-NEXT:    addi a0, a0, -241
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    lui a0, 4112
; RV32-NEXT:    addi a0, a0, 257
; RV32-NEXT:    sw a0, 12(sp)
; RV32-NEXT:    sw a0, 8(sp)
; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
; RV32-NEXT:    addi a0, sp, 8
; RV32-NEXT:    vlse64.v v16, (a0), zero
; RV32-NEXT:    vlse64.v v24, (a0), zero
; RV32-NEXT:    vsrl.vi v0, v8, 1
; RV32-NEXT:    vand.vv v16, v0, v16
; RV32-NEXT:    vsub.vv v8, v8, v16
; RV32-NEXT:    vand.vv v16, v8, v24
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v24
; RV32-NEXT:    vadd.vv v8, v16, v8
; RV32-NEXT:    vlse64.v v16, (a0), zero
; RV32-NEXT:    vlse64.v v24, (a0), zero
; RV32-NEXT:    vsrl.vi v0, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v0
; RV32-NEXT:    vand.vv v8, v8, v16
; RV32-NEXT:    vmul.vv v8, v8, v24
; RV32-NEXT:    li a0, 56
; RV32-NEXT:    vsrl.vx v8, v8, a0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_nxv8i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
; RV64-NEXT:    lui a0, %hi(.LCPI21_0)
; RV64-NEXT:    ld a0, %lo(.LCPI21_0)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI21_1)
; RV64-NEXT:    ld a1, %lo(.LCPI21_1)(a1)
; RV64-NEXT:    vsrl.vi v16, v8, 1
; RV64-NEXT:    vand.vx v16, v16, a0
; RV64-NEXT:    vsub.vv v8, v8, v16
; RV64-NEXT:    vand.vx v16, v8, a1
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a1
; RV64-NEXT:    vadd.vv v8, v16, v8
; RV64-NEXT:    lui a0, %hi(.LCPI21_2)
; RV64-NEXT:    ld a0, %lo(.LCPI21_2)(a0)
; RV64-NEXT:    lui a1, %hi(.LCPI21_3)
; RV64-NEXT:    ld a1, %lo(.LCPI21_3)(a1)
; RV64-NEXT:    vsrl.vi v16, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v16
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vmul.vx v8, v8, a1
; RV64-NEXT:    li a0, 56
; RV64-NEXT:    vsrl.vx v8, v8, a0
; RV64-NEXT:    ret
  %a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
  ret <vscale x 8 x i64> %a
}
1042declare <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64>)
1043