; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

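; x86 has no byte-multiply instruction, so v16i8 multiplies are widened to
; v8i16 halves: SSE2 unpacks and sign-extends with punpck*bw+psraw, SSE4.1
; uses pmovsxbw, and the pmullw products are masked to bytes and repacked.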
define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
; SSE2-LABEL: mul_v16i8c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v16i8c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT:    pmullw %xmm2, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm3, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v16i8c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mul_v16i8c:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mul_v16i8c:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT:    retq
entry:
  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <16 x i8> %A
}

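; v8i16 is the natural pmullw type, so a splat-constant multiply folds the
; constant-pool load directly into pmullw.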
define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind  {
; SSE-LABEL: mul_v8i16c:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v8i16c:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <8 x i16> %A
}

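; SSE2 lacks a 32-bit element multiply and emulates it with two pmuludq
; (even/odd lanes) plus shuffles; SSE4.1 and AVX use pmulld directly.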
define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind  {
; SSE2-LABEL: mul_v4i32c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v4i32c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mul_v4i32c:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

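; v2i64 multiplies decompose into 32-bit partial products via pmuludq. Since
; the splat constant 117 has a zero high half, only two partial products
; remain: lo*117 + ((hi*117) << 32).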
define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind  {
; SSE-LABEL: mul_v2i64c:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v2i64c:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [117,117]
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

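; Variable v16i8 multiply: the same widen-to-i16 strategy, now sign-extending
; both operands before pmullw.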
define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind  {
; SSE2-LABEL: mul_v16i8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v16i8:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm3
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    pand %xmm3, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v16i8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mul_v16i8:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mul_v16i8:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT:    retq
entry:
  %A = mul <16 x i8> %i, %j
  ret <16 x i8> %A
}

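; Variable v8i16 multiply is a single pmullw on every subtarget.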
define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind  {
; SSE-LABEL: mul_v8i16:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v8i16:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %A = mul <8 x i16> %i, %j
  ret <8 x i16> %A
}

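; Variable v4i32 multiply: pmuludq emulation on SSE2, a single pmulld from
; SSE4.1 onward.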
define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind  {
; SSE2-LABEL: mul_v4i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v4i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: mul_v4i32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

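; Variable v2i64 multiply needs the full expansion with three partial
; products: lo*lo + ((lo*hi + hi*lo) << 32); the hi*hi term is shifted out.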
define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind  {
; SSE-LABEL: mul_v2i64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm0, %xmm3
; SSE-NEXT:    paddq %xmm2, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v2i64:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

declare void @foo()

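; The call to @foo forces both operands to be spilled around it; SSE4.1 and
; AVX fold one reload straight into the pmulld memory operand.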
define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind  {
; SSE2-LABEL: mul_v4i32spill:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    callq foo
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v4i32spill:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    subq $40, %rsp
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE41-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT:    callq foo
; SSE41-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT:    pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT:    addq $40, %rsp
; SSE41-NEXT:    retq
;
; AVX-LABEL: mul_v4i32spill:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    callq foo
; AVX-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

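; Same spill test for v2i64: both operands are reloaded into registers before
; the pmuludq expansion, which uses each operand more than once and so cannot
; fold a memory operand.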
define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind  {
; SSE-LABEL: mul_v2i64spill:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    subq $40, %rsp
; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    callq foo
; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT:    pmuludq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm1
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm0, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm3, %xmm0
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    addq $40, %rsp
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v2i64spill:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    subq $40, %rsp
; AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    callq foo
; AVX-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX-NEXT:    vpsrlq $32, %xmm3, %xmm0
; AVX-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlq $32, %xmm2, %xmm1
; AVX-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX-NEXT:    vpmuludq %xmm2, %xmm3, %xmm1
; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT:    addq $40, %rsp
; AVX-NEXT:    retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

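; 256-bit byte multiply by constant: SSE types legalize to two 128-bit
; halves, AVX2 widens each half to v16i16 in a ymm, and AVX512BW handles the
; whole vector as one v32i16 pmullw followed by a vpmovwb truncate.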
define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind  {
; SSE2-LABEL: mul_v32i8c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    packuswb %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v32i8c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT:    pmullw %xmm4, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm4, %xmm0
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm3
; SSE41-NEXT:    pmullw %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm5, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm4, %xmm0
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mul_v32i8c:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mul_v32i8c:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
entry:
  %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <32 x i8> %A
}

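; v16i16 splits into two pmullw on SSE and stays a single ymm pmullw on AVX.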
define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind  {
; SSE-LABEL: mul_v16i16c:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v16i16c:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
entry:
  %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <16 x i16> %A
}

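; v8i32 splat-constant multiply: the pmuludq shuffle dance per half on SSE2,
; two pmulld on SSE4.1, and a broadcast plus one ymm vpmulld on AVX.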
define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind  {
; SSE2-LABEL: mul_v8i32c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v8i32c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: mul_v8i32c:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
entry:
  %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
  ret <8 x i32> %A
}

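; v4i64 by 117: the two-partial-product expansion runs per 128-bit half on
; SSE and once over the full ymm on AVX.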
define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind  {
; SSE-LABEL: mul_v4i64c:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v4i64c:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT:    retq
entry:
  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
  ret <4 x i64> %A
}

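; Variable 256-bit byte multiply: the widen/multiply/pack sequence repeated
; per half, with AVX512BW again collapsing it to one v32i16 pmullw.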
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind  {
; SSE2-LABEL: mul_v32i8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    pmullw %xmm4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    pmullw %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    packuswb %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v32i8:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm5
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
; SSE41-NEXT:    pmullw %xmm5, %xmm4
; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm5, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pmullw %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm4
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm0
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
; SSE41-NEXT:    pmullw %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pmullw %xmm0, %xmm1
; SSE41-NEXT:    pand %xmm5, %xmm1
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v32i8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mul_v32i8:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm3
; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mul_v32i8:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
entry:
  %A = mul <32 x i8> %i, %j
  ret <32 x i8> %A
}

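; Variable v16i16: two pmullw on SSE, one ymm vpmullw on AVX.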
define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind  {
; SSE-LABEL: mul_v16i16:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v16i16:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
entry:
  %A = mul <16 x i16> %i, %j
  ret <16 x i16> %A
}

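; Variable v8i32: pmuludq emulation per half on SSE2, a pmulld pair on
; SSE4.1, one ymm vpmulld on AVX.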
define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind  {
; SSE2-LABEL: mul_v8i32:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v8i32:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: mul_v8i32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
entry:
  %A = mul <8 x i32> %i, %j
  ret <8 x i32> %A
}

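; Variable v4i64: the three-partial-product expansion per half on SSE, once
; over the whole ymm on AVX.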
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind  {
; SSE-LABEL: mul_v4i64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm4
; SSE-NEXT:    movdqa %xmm2, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm0, %xmm5
; SSE-NEXT:    paddq %xmm4, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm5, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm1, %xmm4
; SSE-NEXT:    paddq %xmm2, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mul_v4i64:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm3
; AVX-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; AVX-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
entry:
  %A = mul <4 x i64> %i, %j
  ret <4 x i64> %A
}

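; 512-bit byte multiply by constant: four 128-bit halves on SSE, two ymm
; rounds on AVX2/AVX512F, and two zmm pmullw/vpmovwb rounds on AVX512BW.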
define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
; SSE2-LABEL: mul_v64i8c:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm6
; SSE2-NEXT:    pmullw %xmm4, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm4, %xmm0
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    packuswb %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm6
; SSE2-NEXT:    pmullw %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm6
; SSE2-NEXT:    pmullw %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    packuswb %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm6
; SSE2-NEXT:    pmullw %xmm4, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v64i8c:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT:    pmullw %xmm6, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm7, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pmullw %xmm6, %xmm1
; SSE41-NEXT:    pand %xmm7, %xmm1
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    pmovsxbw %xmm4, %xmm1
; SSE41-NEXT:    pmullw %xmm6, %xmm1
; SSE41-NEXT:    pand %xmm7, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm4, %xmm4
; SSE41-NEXT:    pmullw %xmm6, %xmm4
; SSE41-NEXT:    pand %xmm7, %xmm4
; SSE41-NEXT:    packuswb %xmm4, %xmm1
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm4
; SSE41-NEXT:    pmullw %xmm6, %xmm4
; SSE41-NEXT:    pand %xmm7, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pmullw %xmm6, %xmm2
; SSE41-NEXT:    pand %xmm7, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm4
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm5
; SSE41-NEXT:    pmullw %xmm6, %xmm5
; SSE41-NEXT:    pand %xmm7, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pmullw %xmm6, %xmm2
; SSE41-NEXT:    pand %xmm7, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm5
; SSE41-NEXT:    movdqa %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm5, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v64i8c:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mul_v64i8c:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT:    vmovdqu16 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
entry:
  %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <64 x i8> %A
}

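; Variable 512-bit byte multiply: the same structure with both operands
; sign-extended at each step.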
define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind  {
; SSE2-LABEL: mul_v64i8:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movdqa %xmm4, %xmm8
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm8
; SSE2-NEXT:    movdqa %xmm0, %xmm9
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm9
; SSE2-NEXT:    pmullw %xmm8, %xmm9
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm8, %xmm9
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pmullw %xmm4, %xmm0
; SSE2-NEXT:    pand %xmm8, %xmm0
; SSE2-NEXT:    packuswb %xmm9, %xmm0
; SSE2-NEXT:    movdqa %xmm5, %xmm9
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm9
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    pmullw %xmm9, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pmullw %xmm5, %xmm1
; SSE2-NEXT:    pand %xmm8, %xmm1
; SSE2-NEXT:    packuswb %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm6, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    pmullw %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm6, %xmm2
; SSE2-NEXT:    pand %xmm8, %xmm2
; SSE2-NEXT:    packuswb %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm7, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm4
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    psraw $8, %xmm5
; SSE2-NEXT:    pmullw %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm7
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw %xmm7, %xmm3
; SSE2-NEXT:    pand %xmm8, %xmm3
; SSE2-NEXT:    packuswb %xmm5, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v64i8:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    movdqa %xmm1, %xmm8
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovsxbw %xmm4, %xmm9
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm0
; SSE41-NEXT:    pmullw %xmm9, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm9, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm4, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
; SSE41-NEXT:    pmullw %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm9, %xmm1
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    pmovsxbw %xmm5, %xmm4
; SSE41-NEXT:    pmovsxbw %xmm8, %xmm1
; SSE41-NEXT:    pmullw %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm9, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm4, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm5, %xmm5
; SSE41-NEXT:    pmullw %xmm4, %xmm5
; SSE41-NEXT:    pand %xmm9, %xmm5
; SSE41-NEXT:    packuswb %xmm5, %xmm1
; SSE41-NEXT:    pmovsxbw %xmm6, %xmm5
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm4
; SSE41-NEXT:    pmullw %xmm5, %xmm4
; SSE41-NEXT:    pand %xmm9, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm5, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pmullw %xmm5, %xmm2
; SSE41-NEXT:    pand %xmm9, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm4
; SSE41-NEXT:    pmovsxbw %xmm7, %xmm2
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm5
; SSE41-NEXT:    pmullw %xmm2, %xmm5
; SSE41-NEXT:    pand %xmm9, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm2, %xmm3
; SSE41-NEXT:    pand %xmm9, %xmm3
; SSE41-NEXT:    packuswb %xmm3, %xmm5
; SSE41-NEXT:    movdqa %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm5, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v64i8:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
; AVX2-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F:       # BB#0: # %entry
; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm4
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm5
; AVX512F-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm2
; AVX512F-NEXT:    vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpmovsxbw %xmm3, %ymm2
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm4
; AVX512F-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm3
; AVX512F-NEXT:    vpmovsxbw %xmm3, %ymm3
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mul_v64i8:
; AVX512BW:       # BB#0: # %entry
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm3
; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
entry:
  %A = mul <64 x i8> %i, %j
  ret <64 x i8> %A
}

; PR30845
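; With both operands zero-extended from i32, each i64 product is a single
; pmuludq, and the odd-lane shuffle extracts the high 32 bits of each result.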
define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v4i64_zero_upper:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE41-NEXT:    pmuludq %xmm0, %xmm1
; SSE41-NEXT:    pmuludq %xmm4, %xmm2
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE41-NEXT:    movaps %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v4i64_zero_upper:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mul_v4i64_zero_upper:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT:    retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %val2a = zext <4 x i32> %val2 to <4 x i64>
  %res64 = mul <4 x i64> %val1a, %val2a
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

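; Only the left operand is known zero in its upper half, so one cross partial
; product (val1.lo * val2.hi) survives alongside the pmuludq.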
define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper_left:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    psllq $32, %xmm2
; SSE2-NEXT:    paddq %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    psrlq $32, %xmm1
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    psllq $32, %xmm1
; SSE2-NEXT:    paddq %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v4i64_zero_upper_left:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm0, %xmm2
; SSE41-NEXT:    psllq $32, %xmm2
; SSE41-NEXT:    paddq %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pmuludq %xmm1, %xmm0
; SSE41-NEXT:    psrlq $32, %xmm1
; SSE41-NEXT:    pmuludq %xmm4, %xmm1
; SSE41-NEXT:    psllq $32, %xmm1
; SSE41-NEXT:    paddq %xmm1, %xmm0
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v4i64_zero_upper_left:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mul_v4i64_zero_upper_left:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT:    retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %res64 = mul <4 x i64> %val1a, %val2
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

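; Here val2's low 32 bits are masked off, so the product reduces to the
; single shifted partial product (val1.lo * val2.hi) << 32.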
define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_lower:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    psllq $32, %xmm2
; SSE2-NEXT:    psrlq $32, %xmm1
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    psllq $32, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
; SSE2-NEXT:    movaps %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v4i64_zero_lower:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    pmuludq %xmm0, %xmm2
; SSE41-NEXT:    psllq $32, %xmm2
; SSE41-NEXT:    psrlq $32, %xmm1
; SSE41-NEXT:    pmuludq %xmm1, %xmm3
; SSE41-NEXT:    psllq $32, %xmm3
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
; SSE41-NEXT:    movaps %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v4i64_zero_lower:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mul_v4i64_zero_lower:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT:    retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
  %res64 = mul <4 x i64> %val1a, %val2a
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

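; Both operands are zero extended from i32, so each 64-bit product is a single
; pmuludq of the original 32-bit lanes; the shuffle extracts the high 32 bits
; of every product.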
define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_zero_upper:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    pxor %xmm6, %xmm6
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; SSE2-NEXT:    movdqa %xmm2, %xmm8
; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE2-NEXT:    movdqa %xmm3, %xmm7
; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pmuludq %xmm7, %xmm5
; SSE2-NEXT:    pmuludq %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm8, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
; SSE2-NEXT:    movaps %xmm4, %xmm0
; SSE2-NEXT:    movaps %xmm5, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    pxor %xmm6, %xmm6
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE41-NEXT:    pmuludq %xmm1, %xmm3
; SSE41-NEXT:    pmuludq %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm7, %xmm5
; SSE41-NEXT:    pmuludq %xmm8, %xmm4
; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
; SSE41-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
; SSE41-NEXT:    movaps %xmm4, %xmm0
; SSE41-NEXT:    movaps %xmm5, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v8i64_zero_upper:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm1
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mul_v8i64_zero_upper:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT:    retq
entry:
  %val1a = zext <8 x i32> %val1 to <8 x i64>
  %val2a = zext <8 x i32> %val2 to <8 x i64>
  %res64 = mul <8 x i64> %val1a, %val2a
  %rescast = bitcast <8 x i64> %res64 to <16 x i32>
  %res = shufflevector <16 x i32> %rescast, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <8 x i32> %res
}

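; Full <8 x i64> multiply of sign-extended i16 and i32 operands. Each product
; fits in 32 signed bits, so SSE4.1 and later can use pmuldq on the
; sign-extended halves, while SSE2 must expand the 64-bit multiply with
; pmuludq/shift/add sequences.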
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE2-NEXT:    movdqa %xmm8, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    psrad $16, %xmm8
; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
; SSE2-NEXT:    movdqa %xmm9, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    psrad $16, %xmm9
; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
; SSE2-NEXT:    movdqa %xmm7, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    psrad $16, %xmm7
; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrad $31, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psrad $31, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    psrad $31, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    psrlq $32, %xmm5
; SSE2-NEXT:    pmuludq %xmm0, %xmm5
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrlq $32, %xmm6
; SSE2-NEXT:    pmuludq %xmm4, %xmm6
; SSE2-NEXT:    paddq %xmm5, %xmm6
; SSE2-NEXT:    psllq $32, %xmm6
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    paddq %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrlq $32, %xmm4
; SSE2-NEXT:    pmuludq %xmm7, %xmm4
; SSE2-NEXT:    movdqa %xmm7, %xmm5
; SSE2-NEXT:    psrlq $32, %xmm5
; SSE2-NEXT:    pmuludq %xmm2, %xmm5
; SSE2-NEXT:    paddq %xmm4, %xmm5
; SSE2-NEXT:    psllq $32, %xmm5
; SSE2-NEXT:    pmuludq %xmm7, %xmm2
; SSE2-NEXT:    paddq %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psrlq $32, %xmm4
; SSE2-NEXT:    pmuludq %xmm9, %xmm4
; SSE2-NEXT:    movdqa %xmm9, %xmm5
; SSE2-NEXT:    psrlq $32, %xmm5
; SSE2-NEXT:    pmuludq %xmm1, %xmm5
; SSE2-NEXT:    paddq %xmm4, %xmm5
; SSE2-NEXT:    psllq $32, %xmm5
; SSE2-NEXT:    pmuludq %xmm9, %xmm1
; SSE2-NEXT:    paddq %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psrlq $32, %xmm4
; SSE2-NEXT:    pmuludq %xmm8, %xmm4
; SSE2-NEXT:    movdqa %xmm8, %xmm5
; SSE2-NEXT:    psrlq $32, %xmm5
; SSE2-NEXT:    pmuludq %xmm3, %xmm5
; SSE2-NEXT:    paddq %xmm4, %xmm5
; SSE2-NEXT:    psllq $32, %xmm5
; SSE2-NEXT:    pmuludq %xmm8, %xmm3
; SSE2-NEXT:    paddq %xmm5, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: mul_v8i64_sext:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE41-NEXT:    pmovsxwq %xmm3, %xmm8
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmovsxwq %xmm3, %xmm6
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE41-NEXT:    pmovsxwq %xmm3, %xmm7
; SSE41-NEXT:    pmovsxwq %xmm0, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
; SSE41-NEXT:    pmovsxdq %xmm2, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
; SSE41-NEXT:    pmovsxdq %xmm1, %xmm0
; SSE41-NEXT:    pmuldq %xmm5, %xmm0
; SSE41-NEXT:    pmuldq %xmm7, %xmm4
; SSE41-NEXT:    pmuldq %xmm6, %xmm2
; SSE41-NEXT:    pmuldq %xmm8, %xmm3
; SSE41-NEXT:    movdqa %xmm4, %xmm1
; SSE41-NEXT:    retq
;
; AVX2-LABEL: mul_v8i64_sext:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpmuldq %ymm3, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: mul_v8i64_sext:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
; AVX512-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %1 = sext <8 x i16> %val1 to <8 x i64>
  %2 = sext <8 x i32> %val2 to <8 x i64>
  %3 = mul <8 x i64> %1, %2
  ret <8 x i64> %3
}
