; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2   | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE,X86-SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop  | FileCheck %s --check-prefixes=X64-AVX,X64-XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512DQ

;
; PowOf2 (uniform)
;
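; A uniform power-of-two multiplier (x * 2^k) should lower to a single
; immediate left shift (psllq/pslld/psllw). There is no byte shift, so the
; v16i8 case below presumably becomes a wider psllw plus a mask that clears
; the bits shifted in from adjacent bytes.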

define <2 x i64> @mul_v2i64_8(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_8:
; SSE:       # %bb.0:
; SSE-NEXT:    psllq $3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $3, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 8, i64 8>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_8(<4 x i32> %a0) nounwind {
; SSE-LABEL: mul_v4i32_8:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v4i32_8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpslld $3, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_8(<8 x i16> %a0) nounwind {
; SSE-LABEL: mul_v8i16_8:
; SSE:       # %bb.0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_32(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psllw $5, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    psllw $5, %xmm0
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_32:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $5, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_32:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $5, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
  ret <16 x i8> %1
}

;
; PowOf2 (non-uniform)
;
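; Per-element power-of-two multipliers cannot use one immediate shift.
; Targets with variable shifts (XOP vpshl*, AVX2/AVX512 vpsllv*) should still
; manage a single instruction; plain SSE is expected to shift twice and blend
; (v2i64) or fall back to a pmullw/pmulld constant-pool multiply.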

define <2 x i64> @mul_v2i64_32_8(<2 x i64> %a0) nounwind {
; SSE2-LABEL: mul_v2i64_32_8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllq $5, %xmm1
; SSE2-NEXT:    psllq $3, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: mul_v2i64_32_8:
; SSE4:       # %bb.0:
; SSE4-NEXT:    movdqa %xmm0, %xmm1
; SSE4-NEXT:    psllq $3, %xmm1
; SSE4-NEXT:    psllq $5, %xmm0
; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE4-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v2i64_32_8:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_32_8:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_32_8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 32, i64 8>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
  ret <4 x i32> %1
}

define <4 x i32> @mul_v4i32_1_2_4_8_optsize(<4 x i32> %a0) nounwind optsize {
; SSE2-LABEL: mul_v4i32_1_2_4_8_optsize:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_1_2_4_8_16_32_64_128(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounwind {
; SSE2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
; SSE2-NEXT:    pmullw %xmm2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
; SSE4-NEXT:    pmullw %xmm2, %xmm0
; SSE4-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE4-NEXT:    pand %xmm3, %xmm0
; SSE4-NEXT:    pmullw %xmm2, %xmm1
; SSE4-NEXT:    pand %xmm3, %xmm1
; SSE4-NEXT:    packuswb %xmm0, %xmm1
; SSE4-NEXT:    movdqa %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; X64-AVX512DQ-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT:    vzeroupper
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8>
  ret <16 x i8> %1
}

;
; PowOf2 + 1 (uniform)
;
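; x * (2^k + 1) decomposes as (x << k) + x. The SSE2 and slow-pmulld runs
; below appear to take that shift+add form, while targets where pmulld/pmullw
; is cheap keep the plain constant multiply instead.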

define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $4, %xmm1
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_17:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $4, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 17>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
; SSE2-LABEL: mul_v4i32_17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $4, %xmm1
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_17:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-FAST-LABEL: mul_v4i32_17:
; X64-SSE4-FAST:       # %bb.0:
; X64-SSE4-FAST-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-FAST-NEXT:    retq
;
; X64-SSE4-SLOW-LABEL: mul_v4i32_17:
; X64-SSE4-SLOW:       # %bb.0:
; X64-SSE4-SLOW-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE4-SLOW-NEXT:    pslld $4, %xmm1
; X64-SSE4-SLOW-NEXT:    paddd %xmm1, %xmm0
; X64-SSE4-SLOW-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17]
; X64-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_17:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_17:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_v8i16_17:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_17:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $4, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    paddb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_17:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $4, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    paddb %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <16 x i8> %1
}

define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psllq $4, %xmm2
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllq $4, %xmm2
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllq $4, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllq $4, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i64_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllq $4, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllq $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i64> %a0, <i64 17, i64 17, i64 17, i64 17>
  ret <4 x i64> %1
}

define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
; SSE2-LABEL: mul_v8i32_17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $4, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pslld $4, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v8i32_17:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
; X86-SSE4-NEXT:    pmulld %xmm2, %xmm0
; X86-SSE4-NEXT:    pmulld %xmm2, %xmm1
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-FAST-LABEL: mul_v8i32_17:
; X64-SSE4-FAST:       # %bb.0:
; X64-SSE4-FAST-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
; X64-SSE4-FAST-NEXT:    pmulld %xmm2, %xmm0
; X64-SSE4-FAST-NEXT:    pmulld %xmm2, %xmm1
; X64-SSE4-FAST-NEXT:    retq
;
; X64-SSE4-SLOW-LABEL: mul_v8i32_17:
; X64-SSE4-SLOW:       # %bb.0:
; X64-SSE4-SLOW-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE4-SLOW-NEXT:    pslld $4, %xmm2
; X64-SSE4-SLOW-NEXT:    paddd %xmm2, %xmm0
; X64-SSE4-SLOW-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE4-SLOW-NEXT:    pslld $4, %xmm2
; X64-SSE4-SLOW-NEXT:    paddd %xmm2, %xmm1
; X64-SSE4-SLOW-NEXT:    retq
;
; X64-XOP-LABEL: mul_v8i32_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpslld $4, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpslld $4, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v8i32_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17]
; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  ret <8 x i32> %1
}

define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllw $4, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllw $4, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i16_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
  ret <16 x i16> %1
}

define <32 x i8> @mul_v32i8_17(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_17:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psllw $4, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE-NEXT:    pand %xmm3, %xmm2
; SSE-NEXT:    paddb %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllw $4, %xmm2
; SSE-NEXT:    pand %xmm3, %xmm2
; SSE-NEXT:    paddb %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_17:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v32i8_17:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_17:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $4, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX512DQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <32 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
  ret <32 x i8> %1
}

;
; -(PowOf2 + 1) (uniform)
;
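; x * -(2^k + 1) is expected to lower as the (x << k) + x decomposition
; followed by a negate (subtract from zero), again except where a constant
; pmulld/pmullw multiply is considered cheap enough on its own.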

define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg1025:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $10, %xmm1
; SSE-NEXT:    paddq %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_neg1025:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $10, %xmm0, %xmm1
; X64-AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 -1025, i64 -1025>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
; SSE2-LABEL: mul_v4i32_neg33:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $5, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_neg33:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-FAST-LABEL: mul_v4i32_neg33:
; X64-SSE4-FAST:       # %bb.0:
; X64-SSE4-FAST-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-FAST-NEXT:    retq
;
; X64-SSE4-SLOW-LABEL: mul_v4i32_neg33:
; X64-SSE4-SLOW:       # %bb.0:
; X64-SSE4-SLOW-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE4-SLOW-NEXT:    pslld $5, %xmm1
; X64-SSE4-SLOW-NEXT:    paddd %xmm0, %xmm1
; X64-SSE4-SLOW-NEXT:    pxor %xmm0, %xmm0
; X64-SSE4-SLOW-NEXT:    psubd %xmm1, %xmm0
; X64-SSE4-SLOW-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_neg33:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_neg33:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967263,4294967263,4294967263,4294967263]
; X64-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_neg33:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_neg9(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_neg9:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_neg9:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_v8i16_neg9:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_neg5(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg5:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $2, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    paddb %xmm0, %xmm1
; X86-SSE-NEXT:    pxor %xmm0, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_neg5:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $2, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    paddb %xmm0, %xmm1
; X64-SSE-NEXT:    pxor %xmm0, %xmm0
; X64-SSE-NEXT:    psubb %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_neg5:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_neg5:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $2, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg5:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $2, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <16 x i8> %1
}

define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
; SSE-LABEL: mul_v4i64_neg1025:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psllq $10, %xmm3
; SSE-NEXT:    paddq %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubq %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllq $10, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm3
; SSE-NEXT:    psubq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i64_neg1025:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllq $10, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllq $10, %xmm0, %xmm3
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i64_neg1025:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllq $10, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_neg1025:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllq $10, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i64> %a0, <i64 -1025, i64 -1025, i64 -1025, i64 -1025>
  ret <4 x i64> %1
}

define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
; SSE2-LABEL: mul_v8i32_neg33:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pslld $5, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psubd %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pslld $5, %xmm3
; SSE2-NEXT:    paddd %xmm1, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v8i32_neg33:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
; X86-SSE4-NEXT:    pmulld %xmm2, %xmm0
; X86-SSE4-NEXT:    pmulld %xmm2, %xmm1
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-FAST-LABEL: mul_v8i32_neg33:
; X64-SSE4-FAST:       # %bb.0:
; X64-SSE4-FAST-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
; X64-SSE4-FAST-NEXT:    pmulld %xmm2, %xmm0
; X64-SSE4-FAST-NEXT:    pmulld %xmm2, %xmm1
; X64-SSE4-FAST-NEXT:    retq
;
; X64-SSE4-SLOW-LABEL: mul_v8i32_neg33:
; X64-SSE4-SLOW:       # %bb.0:
; X64-SSE4-SLOW-NEXT:    movdqa %xmm0, %xmm3
; X64-SSE4-SLOW-NEXT:    pslld $5, %xmm3
; X64-SSE4-SLOW-NEXT:    paddd %xmm0, %xmm3
; X64-SSE4-SLOW-NEXT:    pxor %xmm2, %xmm2
; X64-SSE4-SLOW-NEXT:    pxor %xmm0, %xmm0
; X64-SSE4-SLOW-NEXT:    psubd %xmm3, %xmm0
; X64-SSE4-SLOW-NEXT:    movdqa %xmm1, %xmm3
; X64-SSE4-SLOW-NEXT:    pslld $5, %xmm3
; X64-SSE4-SLOW-NEXT:    paddd %xmm1, %xmm3
; X64-SSE4-SLOW-NEXT:    psubd %xmm3, %xmm2
; X64-SSE4-SLOW-NEXT:    movdqa %xmm2, %xmm1
; X64-SSE4-SLOW-NEXT:    retq
;
; X64-XOP-LABEL: mul_v8i32_neg33:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpslld $5, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpslld $5, %xmm0, %xmm3
; X64-XOP-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v8i32_neg33:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263]
; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v8i32_neg33:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <8 x i32> %a0, <i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33, i32 -33>
  ret <8 x i32> %1
}

define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_neg9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_neg9:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vpsllw $3, %xmm1, %xmm2
; X64-XOP-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; X64-XOP-NEXT:    vpsllw $3, %xmm0, %xmm3
; X64-XOP-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i16_neg9:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_neg9:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
  ret <16 x i16> %1
}

define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind {
; SSE-LABEL: mul_v32i8_neg5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psllw $2, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    paddb %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubb %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllw $2, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    paddb %xmm1, %xmm3
; SSE-NEXT:    psubb %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v32i8_neg5:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm3
; X64-XOP-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; X64-XOP-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm2
; X64-XOP-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v32i8_neg5:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $2, %ymm0, %ymm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v32i8_neg5:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $2, %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX512DQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <32 x i8> %a0, <i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5, i8 -5>
  ret <32 x i8> %1
}

;
; PowOf2 + 1 (non-uniform)
;
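; Mixed 2^k + 1 multipliers generally defeat the shift+add trick, so these
; should fall back to a real vector multiply: pmullw/pmulld where available,
; the 32x32-bit pmuludq expansion for v2i64, and native vpmullq on AVX512DQ.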

define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_17_65:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [17,0,65,0]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm0
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_17_65:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [17,65]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm0
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_17_65:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,65]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_17_65:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,65]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_17_65:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 17, i64 65>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-AVX-LABEL: mul_v4i32_5_17_33_65:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 2, i16 3, i16 9, i16 17, i16 33, i16 65, i16 129, i16 257>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind {
; X86-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT:    pand %xmm2, %xmm1
; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    pand %xmm2, %xmm0
; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE4-NEXT:    pand %xmm2, %xmm0
; X86-SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE4-NEXT:    pand %xmm2, %xmm1
; X86-SSE4-NEXT:    packuswb %xmm0, %xmm1
; X86-SSE4-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE2-NEXT:    pand %xmm2, %xmm1
; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT:    pand %xmm2, %xmm0
; X64-SSE2-NEXT:    packuswb %xmm1, %xmm0
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE4-NEXT:    pand %xmm2, %xmm0
; X64-SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE4-NEXT:    pand %xmm2, %xmm1
; X64-SSE4-NEXT:    packuswb %xmm0, %xmm1
; X64-SSE4-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-XOP-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-XOP-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-XOP-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X64-AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT:    vzeroupper
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3>
  ret <16 x i8> %1
}

;
; PowOf2 - 1 (uniform)
;
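; x * (2^k - 1) can be rewritten as (x << k) - x; the SSE and slow-pmulld
; runs below use that shift+subtract form, while v8i16 keeps the single
; pmullw, which is already cheap.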

define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $3, %xmm1
; SSE-NEXT:    psubq %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $3, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 7, i64 7>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
; SSE2-LABEL: mul_v4i32_7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    psubd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_7:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-FAST-LABEL: mul_v4i32_7:
; X64-SSE4-FAST:       # %bb.0:
; X64-SSE4-FAST-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-FAST-NEXT:    retq
;
; X64-SSE4-SLOW-LABEL: mul_v4i32_7:
; X64-SSE4-SLOW:       # %bb.0:
; X64-SSE4-SLOW-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE4-SLOW-NEXT:    pslld $3, %xmm1
; X64-SSE4-SLOW-NEXT:    psubd %xmm0, %xmm1
; X64-SSE4-SLOW-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE4-SLOW-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_7:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_7:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
; X64-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_7:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_7:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_7:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_v8i16_7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_31:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $5, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    psubb %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_31:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $5, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    psubb %xmm0, %xmm1
; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_31:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_31:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $5, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_31:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $5, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
  ret <16 x i8> %1
}

;
; -(PowOf2 - 1) (uniform)
;
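; x * -(2^k - 1) equals x - (x << k), so no separate negate should be needed:
; the subtract operands are simply swapped relative to the 2^k - 1 case above.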

define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind {
; SSE-LABEL: mul_v2i64_neg7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $3, %xmm1
; SSE-NEXT:    psubq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v2i64_neg7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpsllq $3, %xmm0, %xmm1
; X64-AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 -7, i64 -7>
  ret <2 x i64> %1
}

define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
; SSE2-LABEL: mul_v4i32_neg63:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $6, %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; X86-SSE4-LABEL: mul_v4i32_neg63:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE4-FAST-LABEL: mul_v4i32_neg63:
; X64-SSE4-FAST:       # %bb.0:
; X64-SSE4-FAST-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-FAST-NEXT:    retq
;
; X64-SSE4-SLOW-LABEL: mul_v4i32_neg63:
; X64-SSE4-SLOW:       # %bb.0:
; X64-SSE4-SLOW-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE4-SLOW-NEXT:    pslld $6, %xmm1
; X64-SSE4-SLOW-NEXT:    psubd %xmm1, %xmm0
; X64-SSE4-SLOW-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i32_neg63:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i32_neg63:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967233,4294967233,4294967233,4294967233]
; X64-AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i32_neg63:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 -63, i32 -63, i32 -63, i32 -63>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_neg31(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_neg31:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_neg31:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_v8i16_neg31:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31>
  ret <8 x i16> %1
}

define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: mul_v16i8_neg15:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    psllw $4, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v16i8_neg15:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE-NEXT:    psllw $4, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    psubb %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v16i8_neg15:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-XOP-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_neg15:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_neg15:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpsllw $4, %xmm0, %xmm1
; X64-AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15, i8 -15>
  ret <16 x i8> %1
}

;
; PowOf2 - 1 (non-uniform)
;
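; As with the non-uniform 2^k + 1 constants, mixed 2^k - 1 multipliers are
; expected to go through a real multiply (the pmuludq expansion for v2i64,
; vpmullq on AVX512DQ).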
1398
define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_15_63:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,63,0]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm0
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_15_63:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,63]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm0
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_15_63:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,63]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_15_63:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,63]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_63:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 15, i64 63>
  ret <2 x i64> %1
}

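; For negative constants hi32(b) is all ones, so the full three-product
; expansion is needed:
;   res = lo32(a)*lo32(b) + ((lo32(a)*hi32(b) + hi32(a)*lo32(b)) << 32)
; hence the extra pmuludq below (on 32-bit targets the all-ones hi32(b) is
; materialized with pcmpeqd).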
define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_15_63:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm2
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295]
; X86-SSE-NEXT:    pmuludq %xmm3, %xmm2
; X86-SSE-NEXT:    paddq %xmm1, %xmm2
; X86-SSE-NEXT:    psllq $32, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm3, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_neg_15_63:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
; X64-SSE-NEXT:    psrlq $32, %xmm3
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    paddq %xmm3, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_neg_15_63:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_15_63:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 -15, i64 -63>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_17_65:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm2
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295]
; X86-SSE-NEXT:    pmuludq %xmm3, %xmm2
; X86-SSE-NEXT:    paddq %xmm1, %xmm2
; X86-SSE-NEXT:    psllq $32, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm3, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_neg_17_65:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
; X64-SSE-NEXT:    psrlq $32, %xmm3
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    paddq %xmm3, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_neg_17_65:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_17_65:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 -17, i64 -65>
  ret <2 x i64> %1
}

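; Multiplying by <0, 1> needs no multiply at all: lane 0 is zeroed and lane 1
; passes through, so this folds to a blend or shuffle with zero.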
define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
; X86-SSE2-LABEL: mul_v2i64_0_1:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    xorpd %xmm1, %xmm1
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    retl
;
; SSE4-LABEL: mul_v2i64_0_1:
; SSE4:       # %bb.0:
; SSE4-NEXT:    xorps %xmm1, %xmm1
; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE4-NEXT:    ret{{[l|q]}}
;
; X64-SSE2-LABEL: mul_v2i64_0_1:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    xorps %xmm1, %xmm1
; X64-SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; X64-SSE2-NEXT:    movaps %xmm1, %xmm0
; X64-SSE2-NEXT:    retq
;
; X64-AVX-LABEL: mul_v2i64_0_1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; X64-AVX-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 0, i64 1>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_neg_0_1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE-NEXT:    psrlq $32, %xmm3
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    paddq %xmm3, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_neg_0_1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
; X64-SSE-NEXT:    psrlq $32, %xmm3
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    paddq %xmm3, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_neg_0_1:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_neg_0_1:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 0, i64 -1>
  ret <2 x i64> %1
}

define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE-LABEL: mul_v2i64_15_neg_63:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE-NEXT:    psrlq $32, %xmm3
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    paddq %xmm3, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_15_neg_63:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
; X64-SSE-NEXT:    psrlq $32, %xmm3
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X64-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    paddq %xmm3, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_15_neg_63:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-XOP-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_15_neg_63:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; X64-AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <2 x i64> %a0, <i64 15, i64 -63>
  ret <2 x i64> %1
}

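; SSE2 has no 32-bit element multiply (pmulld is SSE4.1), so v4i32 multiplies
; are split into even/odd lanes with pshufd, multiplied with pmuludq, and the
; low halves of the results repacked.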
define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-AVX-LABEL: mul_v4i32_0_15_31_7:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7>
  ret <4 x i32> %1
}

define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind {
; X86-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %1 = mul <8 x i16> %a0, <i16 0, i16 1, i16 7, i16 15, i16 31, i16 63, i16 127, i16 255>
  ret <8 x i16> %1
}

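; There is no vector byte multiply, so v16i8 multiplies widen each half to
; i16 (punpck/pmovzx), use pmullw, mask the products back to 8 bits, and
; repack with packuswb (or a truncating shuffle on XOP/AVX512).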
define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind {
; SSE2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; SSE2-NEXT:    pmullw %xmm2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; SSE4-NEXT:    pmullw %xmm2, %xmm0
; SSE4-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE4-NEXT:    pand %xmm3, %xmm0
; SSE4-NEXT:    pmullw %xmm2, %xmm1
; SSE4-NEXT:    pand %xmm3, %xmm1
; SSE4-NEXT:    packuswb %xmm0, %xmm1
; SSE4-NEXT:    movdqa %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
; X64-XOP-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; X64-XOP-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-XOP-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; X64-XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X64-AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; X64-AVX512DQ-NEXT:    vzeroupper
; X64-AVX512DQ-NEXT:    retq
  %1 = mul <16 x i8> %a0, <i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127, i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127>
  ret <16 x i8> %1
}

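; 68 = 64 + 4 and 132 = 128 + 4 (and, in the test after it, 60 = 64 - 4 and
; 124 = 128 - 4): each constant is a power of two plus or minus another power
; of two, yet for v2i64 the lowering below still falls back to the generic
; pmuludq expansion rather than a pair of shifts combined with an add/sub.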
define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
; X86-SSE-LABEL: mul_v2i64_68_132:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [68,0,132,0]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm0
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_68_132:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [68,132]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm0
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_68_132:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [68,132]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_68_132:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [68,132]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_68_132:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %mul = mul <2 x i64> %x, <i64 68, i64 132>
  ret <2 x i64> %mul
}

define <2 x i64> @mul_v2i64_60_124(<2 x i64> %x) nounwind {
; X86-SSE-LABEL: mul_v2i64_60_124:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [60,0,124,0]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm0
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: mul_v2i64_60_124:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [60,124]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm0
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-XOP-LABEL: mul_v2i64_60_124:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [60,124]
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v2i64_60_124:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [60,124]
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v2i64_60_124:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX512DQ-NEXT:    retq
  %mul = mul <2 x i64> %x, <i64 60, i64 124>
  ret <2 x i64> %mul
}

; We unfortunately can't see the zext that lives in the other basic block, so we
; don't know that we only need one pmuludq to compute the full 64 bits. This
; sort of issue is more likely to occur when there is a loop and one of the
; multiply inputs is loop invariant.
; FIXME: We should be able to insert an AssertZExt for this.
define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) {
; X86-SSE2-LABEL: mul_v2i64_zext_cross_bb:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1]
; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v2i64_zext_cross_bb:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v2i64_zext_cross_bb:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-SSE2-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v2i64_zext_cross_bb:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-AVX-LABEL: mul_v2i64_zext_cross_bb:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %a = load <2 x i32>, ptr %in
  %b = zext <2 x i32> %a to <2 x i64>
  br label %foo

foo:
  %c = load <2 x i32>, ptr %y
  %d = zext <2 x i32> %c to <2 x i64>
  %e = mul <2 x i64> %b, %d
  ret <2 x i64> %e
}

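; Same cross-basic-block zext scenario as above, widened to v4i64; with AVX2
; or AVX512 a single 256-bit vpmuludq suffices.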
define <4 x i64> @mul_v4i64_zext_cross_bb(ptr %in, ptr %y) {
; X86-SSE2-LABEL: mul_v4i64_zext_cross_bb:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqa (%ecx), %xmm0
; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT:    movdqa (%eax), %xmm2
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm0
; X86-SSE2-NEXT:    retl
;
; X86-SSE4-LABEL: mul_v4i64_zext_cross_bb:
; X86-SSE4:       # %bb.0:
; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT:    pmuludq %xmm2, %xmm1
; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X86-SSE4-NEXT:    pmuludq %xmm2, %xmm0
; X86-SSE4-NEXT:    retl
;
; X64-SSE2-LABEL: mul_v4i64_zext_cross_bb:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqa (%rdi), %xmm0
; X64-SSE2-NEXT:    pxor %xmm2, %xmm2
; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X64-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-SSE2-NEXT:    movdqa (%rsi), %xmm2
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
; X64-SSE2-NEXT:    pmuludq %xmm3, %xmm1
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
; X64-SSE2-NEXT:    pmuludq %xmm2, %xmm0
; X64-SSE2-NEXT:    retq
;
; X64-SSE4-LABEL: mul_v4i64_zext_cross_bb:
; X64-SSE4:       # %bb.0:
; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT:    pmuludq %xmm2, %xmm1
; X64-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-SSE4-NEXT:    pmuludq %xmm2, %xmm0
; X64-SSE4-NEXT:    retq
;
; X64-XOP-LABEL: mul_v4i64_zext_cross_bb:
; X64-XOP:       # %bb.0:
; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; X64-XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X64-XOP-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; X64-XOP-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-XOP-NEXT:    retq
;
; X64-AVX2-LABEL: mul_v4i64_zext_cross_bb:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512DQ-LABEL: mul_v4i64_zext_cross_bb:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512DQ-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %a = load <4 x i32>, ptr %in
  %b = zext <4 x i32> %a to <4 x i64>
  br label %foo

foo:
  %c = load <4 x i32>, ptr %y
  %d = zext <4 x i32> %c to <4 x i64>
  %e = mul <4 x i64> %b, %d
  ret <4 x i64> %e
}
