1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
8
9;
10; add
11;
12
13define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
14; SSE-LABEL: trunc_add_v4i64_v4i32:
15; SSE:       # BB#0:
16; SSE-NEXT:    paddq %xmm3, %xmm1
17; SSE-NEXT:    paddq %xmm2, %xmm0
18; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
19; SSE-NEXT:    retq
20;
21; AVX1-LABEL: trunc_add_v4i64_v4i32:
22; AVX1:       # BB#0:
23; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
24; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
25; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
26; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
27; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
28; AVX1-NEXT:    vzeroupper
29; AVX1-NEXT:    retq
30;
31; AVX2-LABEL: trunc_add_v4i64_v4i32:
32; AVX2:       # BB#0:
33; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
34; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
35; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
36; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
37; AVX2-NEXT:    vzeroupper
38; AVX2-NEXT:    retq
39;
40; AVX512-LABEL: trunc_add_v4i64_v4i32:
41; AVX512:       # BB#0:
42; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
43; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
44; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
45; AVX512-NEXT:    retq
46  %1 = add <4 x i64> %a0, %a1
47  %2 = trunc <4 x i64> %1 to <4 x i32>
48  ret <4 x i32> %2
49}
50
51define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
52; SSE-LABEL: trunc_add_v8i64_v8i16:
53; SSE:       # BB#0:
54; SSE-NEXT:    paddq %xmm4, %xmm0
55; SSE-NEXT:    paddq %xmm5, %xmm1
56; SSE-NEXT:    paddq %xmm6, %xmm2
57; SSE-NEXT:    paddq %xmm7, %xmm3
58; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
59; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
60; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
61; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
62; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
63; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
64; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
65; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
66; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
67; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
68; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
69; SSE-NEXT:    movapd %xmm2, %xmm0
70; SSE-NEXT:    retq
71;
72; AVX1-LABEL: trunc_add_v8i64_v8i16:
73; AVX1:       # BB#0:
74; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
75; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
76; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
77; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
78; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
79; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
80; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
81; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
82; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
83; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
84; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
85; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
86; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
87; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
88; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
89; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
90; AVX1-NEXT:    vzeroupper
91; AVX1-NEXT:    retq
92;
93; AVX2-LABEL: trunc_add_v8i64_v8i16:
94; AVX2:       # BB#0:
95; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
96; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
97; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
98; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
99; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
100; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
101; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
102; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
103; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
104; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
105; AVX2-NEXT:    vzeroupper
106; AVX2-NEXT:    retq
107;
108; AVX512-LABEL: trunc_add_v8i64_v8i16:
109; AVX512:       # BB#0:
110; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
111; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
112; AVX512-NEXT:    retq
113  %1 = add <8 x i64> %a0, %a1
114  %2 = trunc <8 x i64> %1 to <8 x i16>
115  ret <8 x i16> %2
116}
117
118define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
119; SSE-LABEL: trunc_add_v8i32_v8i16:
120; SSE:       # BB#0:
121; SSE-NEXT:    paddd %xmm2, %xmm0
122; SSE-NEXT:    paddd %xmm3, %xmm1
123; SSE-NEXT:    pslld $16, %xmm1
124; SSE-NEXT:    psrad $16, %xmm1
125; SSE-NEXT:    pslld $16, %xmm0
126; SSE-NEXT:    psrad $16, %xmm0
127; SSE-NEXT:    packssdw %xmm1, %xmm0
128; SSE-NEXT:    retq
129;
130; AVX1-LABEL: trunc_add_v8i32_v8i16:
131; AVX1:       # BB#0:
132; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
133; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
134; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
135; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
136; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
137; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
138; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
139; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
140; AVX1-NEXT:    vzeroupper
141; AVX1-NEXT:    retq
142;
143; AVX2-LABEL: trunc_add_v8i32_v8i16:
144; AVX2:       # BB#0:
145; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
146; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
147; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
148; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
149; AVX2-NEXT:    vzeroupper
150; AVX2-NEXT:    retq
151;
152; AVX512-LABEL: trunc_add_v8i32_v8i16:
153; AVX512:       # BB#0:
154; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
155; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
156; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
157; AVX512-NEXT:    retq
158  %1 = add <8 x i32> %a0, %a1
159  %2 = trunc <8 x i32> %1 to <8 x i16>
160  ret <8 x i16> %2
161}
162
163define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
164; SSE-LABEL: trunc_add_v16i64_v16i8:
165; SSE:       # BB#0:
166; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
167; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
168; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
169; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
170; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
171; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
172; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
173; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
174; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
175; SSE-NEXT:    pand %xmm8, %xmm7
176; SSE-NEXT:    pand %xmm8, %xmm6
177; SSE-NEXT:    packuswb %xmm7, %xmm6
178; SSE-NEXT:    pand %xmm8, %xmm5
179; SSE-NEXT:    pand %xmm8, %xmm4
180; SSE-NEXT:    packuswb %xmm5, %xmm4
181; SSE-NEXT:    packuswb %xmm6, %xmm4
182; SSE-NEXT:    pand %xmm8, %xmm3
183; SSE-NEXT:    pand %xmm8, %xmm2
184; SSE-NEXT:    packuswb %xmm3, %xmm2
185; SSE-NEXT:    pand %xmm8, %xmm1
186; SSE-NEXT:    pand %xmm8, %xmm0
187; SSE-NEXT:    packuswb %xmm1, %xmm0
188; SSE-NEXT:    packuswb %xmm2, %xmm0
189; SSE-NEXT:    packuswb %xmm4, %xmm0
190; SSE-NEXT:    retq
191;
192; AVX1-LABEL: trunc_add_v16i64_v16i8:
193; AVX1:       # BB#0:
194; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
195; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
196; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
197; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
198; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
199; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
200; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
201; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
202; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
203; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
204; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
205; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
206; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
207; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
208; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
209; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
210; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
211; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
212; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
213; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
214; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
215; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
216; AVX1-NEXT:    vpackuswb %xmm2, %xmm5, %xmm2
217; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
218; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
219; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
220; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
221; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
222; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
223; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
224; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
225; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
226; AVX1-NEXT:    vzeroupper
227; AVX1-NEXT:    retq
228;
229; AVX2-LABEL: trunc_add_v16i64_v16i8:
230; AVX2:       # BB#0:
231; AVX2-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
232; AVX2-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
233; AVX2-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
234; AVX2-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
235; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
236; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
237; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
238; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
239; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
240; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
241; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
242; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
243; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
244; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
245; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
246; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
247; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
248; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
249; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
250; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
251; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
252; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
253; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
254; AVX2-NEXT:    vzeroupper
255; AVX2-NEXT:    retq
256;
257; AVX512F-LABEL: trunc_add_v16i64_v16i8:
258; AVX512F:       # BB#0:
259; AVX512F-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
260; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
261; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
262; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
263; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
264; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
265; AVX512F-NEXT:    retq
266;
267; AVX512BW-LABEL: trunc_add_v16i64_v16i8:
268; AVX512BW:       # BB#0:
269; AVX512BW-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
270; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
271; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
272; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
273; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
274; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
275; AVX512BW-NEXT:    retq
276;
277; AVX512DQ-LABEL: trunc_add_v16i64_v16i8:
278; AVX512DQ:       # BB#0:
279; AVX512DQ-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
280; AVX512DQ-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
281; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
282; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
283; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
284; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
285; AVX512DQ-NEXT:    retq
286  %1 = add <16 x i64> %a0, %a1
287  %2 = trunc <16 x i64> %1 to <16 x i8>
288  ret <16 x i8> %2
289}
290
291define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
292; SSE-LABEL: trunc_add_v16i32_v16i8:
293; SSE:       # BB#0:
294; SSE-NEXT:    paddd %xmm4, %xmm0
295; SSE-NEXT:    paddd %xmm5, %xmm1
296; SSE-NEXT:    paddd %xmm6, %xmm2
297; SSE-NEXT:    paddd %xmm7, %xmm3
298; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
299; SSE-NEXT:    pand %xmm4, %xmm3
300; SSE-NEXT:    pand %xmm4, %xmm2
301; SSE-NEXT:    packuswb %xmm3, %xmm2
302; SSE-NEXT:    pand %xmm4, %xmm1
303; SSE-NEXT:    pand %xmm4, %xmm0
304; SSE-NEXT:    packuswb %xmm1, %xmm0
305; SSE-NEXT:    packuswb %xmm2, %xmm0
306; SSE-NEXT:    retq
307;
308; AVX1-LABEL: trunc_add_v16i32_v16i8:
309; AVX1:       # BB#0:
310; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
311; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
312; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
313; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
314; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
315; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
316; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
317; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
318; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
319; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
320; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
321; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
322; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
323; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
324; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
325; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
326; AVX1-NEXT:    vzeroupper
327; AVX1-NEXT:    retq
328;
329; AVX2-LABEL: trunc_add_v16i32_v16i8:
330; AVX2:       # BB#0:
331; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
332; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
333; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
334; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
335; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
336; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
337; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
338; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
339; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
340; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
341; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
342; AVX2-NEXT:    vzeroupper
343; AVX2-NEXT:    retq
344;
345; AVX512-LABEL: trunc_add_v16i32_v16i8:
346; AVX512:       # BB#0:
347; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
348; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
349; AVX512-NEXT:    retq
350  %1 = add <16 x i32> %a0, %a1
351  %2 = trunc <16 x i32> %1 to <16 x i8>
352  ret <16 x i8> %2
353}
354
355define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
356; SSE-LABEL: trunc_add_v16i16_v16i8:
357; SSE:       # BB#0:
358; SSE-NEXT:    paddw %xmm2, %xmm0
359; SSE-NEXT:    paddw %xmm3, %xmm1
360; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
361; SSE-NEXT:    pand %xmm2, %xmm1
362; SSE-NEXT:    pand %xmm2, %xmm0
363; SSE-NEXT:    packuswb %xmm1, %xmm0
364; SSE-NEXT:    retq
365;
366; AVX1-LABEL: trunc_add_v16i16_v16i8:
367; AVX1:       # BB#0:
368; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
369; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
370; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
371; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
372; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
373; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
374; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
375; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
376; AVX1-NEXT:    vzeroupper
377; AVX1-NEXT:    retq
378;
379; AVX2-LABEL: trunc_add_v16i16_v16i8:
380; AVX2:       # BB#0:
381; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
382; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
383; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
384; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
385; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
386; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
387; AVX2-NEXT:    vzeroupper
388; AVX2-NEXT:    retq
389;
390; AVX512F-LABEL: trunc_add_v16i16_v16i8:
391; AVX512F:       # BB#0:
392; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
393; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
394; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
395; AVX512F-NEXT:    retq
396;
397; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
398; AVX512BW:       # BB#0:
399; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
400; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
401; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
402; AVX512BW-NEXT:    retq
403;
404; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
405; AVX512DQ:       # BB#0:
406; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
407; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
408; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
409; AVX512DQ-NEXT:    retq
410  %1 = add <16 x i16> %a0, %a1
411  %2 = trunc <16 x i16> %1 to <16 x i8>
412  ret <16 x i8> %2
413}
414
415;
416; add to constant
417;
418
419define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
420; SSE-LABEL: trunc_add_const_v4i64_v4i32:
421; SSE:       # BB#0:
422; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
423; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
424; SSE-NEXT:    retq
425;
426; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
427; AVX1:       # BB#0:
428; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
429; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
430; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
431; AVX1-NEXT:    vzeroupper
432; AVX1-NEXT:    retq
433;
434; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
435; AVX2:       # BB#0:
436; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
437; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
438; AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
439; AVX2-NEXT:    vzeroupper
440; AVX2-NEXT:    retq
441;
442; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
443; AVX512:       # BB#0:
444; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
445; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
446; AVX512-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
447; AVX512-NEXT:    retq
448  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
449  %2 = trunc <4 x i64> %1 to <4 x i32>
450  ret <4 x i32> %2
451}
452
453define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
454; SSE-LABEL: trunc_add_const_v8i64_v8i16:
455; SSE:       # BB#0:
456; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
457; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
458; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
459; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
460; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
461; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
462; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
463; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
464; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
465; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
466; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
467; SSE-NEXT:    paddw {{.*}}(%rip), %xmm2
468; SSE-NEXT:    movdqa %xmm2, %xmm0
469; SSE-NEXT:    retq
470;
471; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
472; AVX1:       # BB#0:
473; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
474; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
475; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
476; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
477; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
478; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
479; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
480; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
481; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
482; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
483; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
484; AVX1-NEXT:    vzeroupper
485; AVX1-NEXT:    retq
486;
487; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
488; AVX2:       # BB#0:
489; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
490; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
491; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
492; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
493; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
494; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
495; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
496; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
497; AVX2-NEXT:    vzeroupper
498; AVX2-NEXT:    retq
499;
500; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
501; AVX512:       # BB#0:
502; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
503; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
504; AVX512-NEXT:    retq
505  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
506  %2 = trunc <8 x i64> %1 to <8 x i16>
507  ret <8 x i16> %2
508}
509
510define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
511; SSE-LABEL: trunc_add_const_v8i32_v8i16:
512; SSE:       # BB#0:
513; SSE-NEXT:    pslld $16, %xmm1
514; SSE-NEXT:    psrad $16, %xmm1
515; SSE-NEXT:    pslld $16, %xmm0
516; SSE-NEXT:    psrad $16, %xmm0
517; SSE-NEXT:    packssdw %xmm1, %xmm0
518; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
519; SSE-NEXT:    retq
520;
521; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
522; AVX1:       # BB#0:
523; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
524; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
525; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
526; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
527; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
528; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
529; AVX1-NEXT:    vzeroupper
530; AVX1-NEXT:    retq
531;
532; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
533; AVX2:       # BB#0:
534; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
535; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
536; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
537; AVX2-NEXT:    vzeroupper
538; AVX2-NEXT:    retq
539;
540; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
541; AVX512:       # BB#0:
542; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
543; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
544; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
545; AVX512-NEXT:    retq
546  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
547  %2 = trunc <8 x i32> %1 to <8 x i16>
548  ret <8 x i16> %2
549}
550
551define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
552; SSE-LABEL: trunc_add_const_v16i64_v16i8:
553; SSE:       # BB#0:
554; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
555; SSE-NEXT:    pand %xmm8, %xmm7
556; SSE-NEXT:    pand %xmm8, %xmm6
557; SSE-NEXT:    packuswb %xmm7, %xmm6
558; SSE-NEXT:    pand %xmm8, %xmm5
559; SSE-NEXT:    pand %xmm8, %xmm4
560; SSE-NEXT:    packuswb %xmm5, %xmm4
561; SSE-NEXT:    packuswb %xmm6, %xmm4
562; SSE-NEXT:    pand %xmm8, %xmm3
563; SSE-NEXT:    pand %xmm8, %xmm2
564; SSE-NEXT:    packuswb %xmm3, %xmm2
565; SSE-NEXT:    pand %xmm8, %xmm1
566; SSE-NEXT:    pand %xmm8, %xmm0
567; SSE-NEXT:    packuswb %xmm1, %xmm0
568; SSE-NEXT:    packuswb %xmm2, %xmm0
569; SSE-NEXT:    packuswb %xmm4, %xmm0
570; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
571; SSE-NEXT:    retq
572;
573; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
574; AVX1:       # BB#0:
575; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
576; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
577; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
578; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
579; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
580; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
581; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
582; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
583; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
584; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
585; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
586; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
587; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
588; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
589; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
590; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
591; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
592; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
593; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
594; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
595; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
596; AVX1-NEXT:    vzeroupper
597; AVX1-NEXT:    retq
598;
599; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
600; AVX2:       # BB#0:
601; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
602; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
603; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
604; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
605; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
606; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
607; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
608; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
609; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
610; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
611; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
612; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
613; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
614; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
615; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
616; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
617; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
618; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
619; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
620; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
621; AVX2-NEXT:    vzeroupper
622; AVX2-NEXT:    retq
623;
624; AVX512F-LABEL: trunc_add_const_v16i64_v16i8:
625; AVX512F:       # BB#0:
626; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
627; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
628; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
629; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
630; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
631; AVX512F-NEXT:    retq
632;
633; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
634; AVX512BW:       # BB#0:
635; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
636; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
637; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
638; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
639; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
640; AVX512BW-NEXT:    retq
641;
642; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
643; AVX512DQ:       # BB#0:
644; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
645; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
646; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
647; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
648; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
649; AVX512DQ-NEXT:    retq
650  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
651  %2 = trunc <16 x i64> %1 to <16 x i8>
652  ret <16 x i8> %2
653}
654
655define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
656; SSE-LABEL: trunc_add_const_v16i32_v16i8:
657; SSE:       # BB#0:
658; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
659; SSE-NEXT:    pand %xmm4, %xmm3
660; SSE-NEXT:    pand %xmm4, %xmm2
661; SSE-NEXT:    packuswb %xmm3, %xmm2
662; SSE-NEXT:    pand %xmm4, %xmm1
663; SSE-NEXT:    pand %xmm4, %xmm0
664; SSE-NEXT:    packuswb %xmm1, %xmm0
665; SSE-NEXT:    packuswb %xmm2, %xmm0
666; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
667; SSE-NEXT:    retq
668;
669; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
670; AVX1:       # BB#0:
671; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
672; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
673; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
674; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
675; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
676; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
677; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
678; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
679; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
680; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
681; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
682; AVX1-NEXT:    vzeroupper
683; AVX1-NEXT:    retq
684;
685; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
686; AVX2:       # BB#0:
687; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
688; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
689; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
690; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
691; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
692; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
693; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
694; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
695; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
696; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
697; AVX2-NEXT:    vzeroupper
698; AVX2-NEXT:    retq
699;
700; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
701; AVX512:       # BB#0:
702; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
703; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
704; AVX512-NEXT:    retq
705  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
706  %2 = trunc <16 x i32> %1 to <16 x i8>
707  ret <16 x i8> %2
708}
709
710define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
711; SSE-LABEL: trunc_add_const_v16i16_v16i8:
712; SSE:       # BB#0:
713; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
714; SSE-NEXT:    pand %xmm2, %xmm1
715; SSE-NEXT:    pand %xmm2, %xmm0
716; SSE-NEXT:    packuswb %xmm1, %xmm0
717; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
718; SSE-NEXT:    retq
719;
720; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
721; AVX1:       # BB#0:
722; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
723; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
724; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
725; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
726; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
727; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
728; AVX1-NEXT:    vzeroupper
729; AVX1-NEXT:    retq
730;
731; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
732; AVX2:       # BB#0:
733; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
734; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
735; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
736; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
737; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
738; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
739; AVX2-NEXT:    vzeroupper
740; AVX2-NEXT:    retq
741;
742; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
743; AVX512F:       # BB#0:
744; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
745; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
746; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
747; AVX512F-NEXT:    retq
748;
749; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
750; AVX512BW:       # BB#0:
751; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
752; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
753; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
754; AVX512BW-NEXT:    retq
755;
756; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
757; AVX512DQ:       # BB#0:
758; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
759; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
760; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
761; AVX512DQ-NEXT:    retq
762  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
763  %2 = trunc <16 x i16> %1 to <16 x i8>
764  ret <16 x i8> %2
765}
766
767;
768; sub
769;
770
771define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
772; SSE-LABEL: trunc_sub_v4i64_v4i32:
773; SSE:       # BB#0:
774; SSE-NEXT:    psubq %xmm3, %xmm1
775; SSE-NEXT:    psubq %xmm2, %xmm0
776; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
777; SSE-NEXT:    retq
778;
779; AVX1-LABEL: trunc_sub_v4i64_v4i32:
780; AVX1:       # BB#0:
781; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
782; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
783; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
784; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
785; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
786; AVX1-NEXT:    vzeroupper
787; AVX1-NEXT:    retq
788;
789; AVX2-LABEL: trunc_sub_v4i64_v4i32:
790; AVX2:       # BB#0:
791; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
792; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
793; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
794; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
795; AVX2-NEXT:    vzeroupper
796; AVX2-NEXT:    retq
797;
798; AVX512-LABEL: trunc_sub_v4i64_v4i32:
799; AVX512:       # BB#0:
800; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
801; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
802; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
803; AVX512-NEXT:    retq
804  %1 = sub <4 x i64> %a0, %a1
805  %2 = trunc <4 x i64> %1 to <4 x i32>
806  ret <4 x i32> %2
807}
808
809define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
810; SSE-LABEL: trunc_sub_v8i64_v8i16:
811; SSE:       # BB#0:
812; SSE-NEXT:    psubq %xmm4, %xmm0
813; SSE-NEXT:    psubq %xmm5, %xmm1
814; SSE-NEXT:    psubq %xmm6, %xmm2
815; SSE-NEXT:    psubq %xmm7, %xmm3
816; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
817; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
818; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
819; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
820; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
821; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
822; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
823; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
824; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
825; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
826; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
827; SSE-NEXT:    movapd %xmm2, %xmm0
828; SSE-NEXT:    retq
829;
830; AVX1-LABEL: trunc_sub_v8i64_v8i16:
831; AVX1:       # BB#0:
832; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
833; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
834; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
835; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
836; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
837; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
838; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
839; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
840; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
841; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
842; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
843; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
844; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
845; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
846; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
847; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
848; AVX1-NEXT:    vzeroupper
849; AVX1-NEXT:    retq
850;
851; AVX2-LABEL: trunc_sub_v8i64_v8i16:
852; AVX2:       # BB#0:
853; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
854; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
855; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
856; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
857; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
858; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
859; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
860; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
861; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
862; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
863; AVX2-NEXT:    vzeroupper
864; AVX2-NEXT:    retq
865;
866; AVX512-LABEL: trunc_sub_v8i64_v8i16:
867; AVX512:       # BB#0:
868; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
869; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
870; AVX512-NEXT:    retq
871  %1 = sub <8 x i64> %a0, %a1
872  %2 = trunc <8 x i64> %1 to <8 x i16>
873  ret <8 x i16> %2
874}
875
876define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
877; SSE-LABEL: trunc_sub_v8i32_v8i16:
878; SSE:       # BB#0:
879; SSE-NEXT:    psubd %xmm2, %xmm0
880; SSE-NEXT:    psubd %xmm3, %xmm1
881; SSE-NEXT:    pslld $16, %xmm1
882; SSE-NEXT:    psrad $16, %xmm1
883; SSE-NEXT:    pslld $16, %xmm0
884; SSE-NEXT:    psrad $16, %xmm0
885; SSE-NEXT:    packssdw %xmm1, %xmm0
886; SSE-NEXT:    retq
887;
888; AVX1-LABEL: trunc_sub_v8i32_v8i16:
889; AVX1:       # BB#0:
890; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
891; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
892; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
893; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
894; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
895; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
896; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
897; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
898; AVX1-NEXT:    vzeroupper
899; AVX1-NEXT:    retq
900;
901; AVX2-LABEL: trunc_sub_v8i32_v8i16:
902; AVX2:       # BB#0:
903; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
904; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
905; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
906; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
907; AVX2-NEXT:    vzeroupper
908; AVX2-NEXT:    retq
909;
910; AVX512-LABEL: trunc_sub_v8i32_v8i16:
911; AVX512:       # BB#0:
912; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
913; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
914; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
915; AVX512-NEXT:    retq
916  %1 = sub <8 x i32> %a0, %a1
917  %2 = trunc <8 x i32> %1 to <8 x i16>
918  ret <8 x i16> %2
919}
920
921define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
922; SSE-LABEL: trunc_sub_v16i64_v16i8:
923; SSE:       # BB#0:
924; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
925; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
926; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
927; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
928; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
929; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
930; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
931; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
932; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
933; SSE-NEXT:    pand %xmm8, %xmm7
934; SSE-NEXT:    pand %xmm8, %xmm6
935; SSE-NEXT:    packuswb %xmm7, %xmm6
936; SSE-NEXT:    pand %xmm8, %xmm5
937; SSE-NEXT:    pand %xmm8, %xmm4
938; SSE-NEXT:    packuswb %xmm5, %xmm4
939; SSE-NEXT:    packuswb %xmm6, %xmm4
940; SSE-NEXT:    pand %xmm8, %xmm3
941; SSE-NEXT:    pand %xmm8, %xmm2
942; SSE-NEXT:    packuswb %xmm3, %xmm2
943; SSE-NEXT:    pand %xmm8, %xmm1
944; SSE-NEXT:    pand %xmm8, %xmm0
945; SSE-NEXT:    packuswb %xmm1, %xmm0
946; SSE-NEXT:    packuswb %xmm2, %xmm0
947; SSE-NEXT:    packuswb %xmm4, %xmm0
948; SSE-NEXT:    retq
949;
950; AVX1-LABEL: trunc_sub_v16i64_v16i8:
951; AVX1:       # BB#0:
952; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
953; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
954; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
955; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
956; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
957; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
958; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
959; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
960; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
961; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
962; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
963; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
964; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
965; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
966; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
967; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
968; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
969; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
970; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
971; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
972; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
973; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
974; AVX1-NEXT:    vpackuswb %xmm2, %xmm5, %xmm2
975; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
976; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
977; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
978; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
979; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
980; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
981; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
982; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
983; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
984; AVX1-NEXT:    vzeroupper
985; AVX1-NEXT:    retq
986;
987; AVX2-LABEL: trunc_sub_v16i64_v16i8:
988; AVX2:       # BB#0:
989; AVX2-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
990; AVX2-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
991; AVX2-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
992; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
993; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
994; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
995; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
996; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
997; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
998; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
999; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1000; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1001; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1002; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
1003; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1004; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1005; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1006; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1007; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1008; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1009; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1010; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
1011; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1012; AVX2-NEXT:    vzeroupper
1013; AVX2-NEXT:    retq
1014;
1015; AVX512F-LABEL: trunc_sub_v16i64_v16i8:
1016; AVX512F:       # BB#0:
1017; AVX512F-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
1018; AVX512F-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
1019; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1020; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
1021; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1022; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1023; AVX512F-NEXT:    retq
1024;
1025; AVX512BW-LABEL: trunc_sub_v16i64_v16i8:
1026; AVX512BW:       # BB#0:
1027; AVX512BW-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
1028; AVX512BW-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
1029; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1030; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
1031; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1032; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
1033; AVX512BW-NEXT:    retq
1034;
1035; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8:
1036; AVX512DQ:       # BB#0:
1037; AVX512DQ-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
1038; AVX512DQ-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
1039; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
1040; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
1041; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
1042; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1043; AVX512DQ-NEXT:    retq
1044  %1 = sub <16 x i64> %a0, %a1
1045  %2 = trunc <16 x i64> %1 to <16 x i8>
1046  ret <16 x i8> %2
1047}
1048
1049define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1050; SSE-LABEL: trunc_sub_v16i32_v16i8:
1051; SSE:       # BB#0:
1052; SSE-NEXT:    psubd %xmm4, %xmm0
1053; SSE-NEXT:    psubd %xmm5, %xmm1
1054; SSE-NEXT:    psubd %xmm6, %xmm2
1055; SSE-NEXT:    psubd %xmm7, %xmm3
1056; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1057; SSE-NEXT:    pand %xmm4, %xmm3
1058; SSE-NEXT:    pand %xmm4, %xmm2
1059; SSE-NEXT:    packuswb %xmm3, %xmm2
1060; SSE-NEXT:    pand %xmm4, %xmm1
1061; SSE-NEXT:    pand %xmm4, %xmm0
1062; SSE-NEXT:    packuswb %xmm1, %xmm0
1063; SSE-NEXT:    packuswb %xmm2, %xmm0
1064; SSE-NEXT:    retq
1065;
1066; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1067; AVX1:       # BB#0:
1068; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
1069; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1070; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1071; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
1072; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
1073; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1074; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1075; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
1076; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1077; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1078; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1079; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
1080; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1081; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1082; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
1083; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1084; AVX1-NEXT:    vzeroupper
1085; AVX1-NEXT:    retq
1086;
1087; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1088; AVX2:       # BB#0:
1089; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
1090; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
1091; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1092; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1093; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1094; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1095; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1096; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1097; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1098; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1099; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1100; AVX2-NEXT:    vzeroupper
1101; AVX2-NEXT:    retq
1102;
1103; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1104; AVX512:       # BB#0:
1105; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
1106; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1107; AVX512-NEXT:    retq
1108  %1 = sub <16 x i32> %a0, %a1
1109  %2 = trunc <16 x i32> %1 to <16 x i8>
1110  ret <16 x i8> %2
1111}
1112
1113define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1114; SSE-LABEL: trunc_sub_v16i16_v16i8:
1115; SSE:       # BB#0:
1116; SSE-NEXT:    psubw %xmm2, %xmm0
1117; SSE-NEXT:    psubw %xmm3, %xmm1
1118; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1119; SSE-NEXT:    pand %xmm2, %xmm1
1120; SSE-NEXT:    pand %xmm2, %xmm0
1121; SSE-NEXT:    packuswb %xmm1, %xmm0
1122; SSE-NEXT:    retq
1123;
1124; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1125; AVX1:       # BB#0:
1126; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
1127; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1128; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1129; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
1130; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1131; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1132; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
1133; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1134; AVX1-NEXT:    vzeroupper
1135; AVX1-NEXT:    retq
1136;
1137; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1138; AVX2:       # BB#0:
1139; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1140; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1141; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1142; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1143; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1144; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1145; AVX2-NEXT:    vzeroupper
1146; AVX2-NEXT:    retq
1147;
1148; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1149; AVX512F:       # BB#0:
1150; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1151; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
1152; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1153; AVX512F-NEXT:    retq
1154;
1155; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1156; AVX512BW:       # BB#0:
1157; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1158; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1159; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1160; AVX512BW-NEXT:    retq
1161;
1162; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1163; AVX512DQ:       # BB#0:
1164; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1165; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
1166; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1167; AVX512DQ-NEXT:    retq
1168  %1 = sub <16 x i16> %a0, %a1
1169  %2 = trunc <16 x i16> %1 to <16 x i8>
1170  ret <16 x i8> %2
1171}
1172
1173;
1174; sub to constant
1175;
1176
1177define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1178; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1179; SSE:       # BB#0:
1180; SSE-NEXT:    movl $1, %eax
1181; SSE-NEXT:    movd %rax, %xmm2
1182; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1183; SSE-NEXT:    psubq %xmm2, %xmm0
1184; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
1185; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1186; SSE-NEXT:    retq
1187;
1188; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1189; AVX1:       # BB#0:
1190; AVX1-NEXT:    movl $1, %eax
1191; AVX1-NEXT:    vmovq %rax, %xmm1
1192; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1193; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
1194; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1195; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
1196; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
1197; AVX1-NEXT:    vzeroupper
1198; AVX1-NEXT:    retq
1199;
1200; AVX2-LABEL: trunc_sub_const_v4i64_v4i32:
1201; AVX2:       # BB#0:
1202; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
1203; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
1204; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1205; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1206; AVX2-NEXT:    vzeroupper
1207; AVX2-NEXT:    retq
1208;
1209; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1210; AVX512:       # BB#0:
1211; AVX512-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
1212; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1213; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1214; AVX512-NEXT:    retq
1215  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1216  %2 = trunc <4 x i64> %1 to <4 x i32>
1217  ret <4 x i32> %2
1218}
1219
1220define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1221; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1222; SSE:       # BB#0:
1223; SSE-NEXT:    movl $1, %eax
1224; SSE-NEXT:    movd %rax, %xmm4
1225; SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
1226; SSE-NEXT:    psubq %xmm4, %xmm0
1227; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
1228; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
1229; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
1230; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1231; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
1232; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1233; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
1234; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1235; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1236; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1237; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    movl $1, %eax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT:    retq
  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movl $1, %eax
; SSE-NEXT:    movd %rax, %xmm8
; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT:    psubq %xmm8, %xmm0
; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
; SSE-NEXT:    psubq {{.*}}(%rip), %xmm4
; SSE-NEXT:    psubq {{.*}}(%rip), %xmm5
; SSE-NEXT:    psubq {{.*}}(%rip), %xmm6
; SSE-NEXT:    psubq {{.*}}(%rip), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    movl $1, %eax
; AVX1-NEXT:    vmovq %rax, %xmm4
; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
; AVX1-NEXT:    vpackuswb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512DQ:       # BB#0:
; AVX512DQ-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512DQ-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    retq
  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
; SSE-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSE-NEXT:    psubd {{.*}}(%rip), %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpsubd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
; SSE-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512DQ:       # BB#0:
; AVX512DQ-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    retq
  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; mul
;

define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v4i64_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm3, %xmm4
; SSE-NEXT:    movdqa %xmm3, %xmm5
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm1, %xmm5
; SSE-NEXT:    paddq %xmm4, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm0, %xmm4
; SSE-NEXT:    paddq %xmm3, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_v4i64_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_v4i64_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
; AVX512DQ:       # BB#0:
; AVX512DQ-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT:    retq
  %1 = mul <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i64_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    pmullw %xmm6, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_v8i64_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
; AVX512DQ:       # BB#0:
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512DQ-NEXT:    retq
  %1 = mul <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i32_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_v8i32_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_v8i32_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_mul_v8i32_v8i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT:    retq
  %1 = mul <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i64_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    movdqa %xmm0, %xmm9
; SSE-NEXT:    psrlq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    movdqa %xmm8, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm0, %xmm10
; SSE-NEXT:    paddq %xmm9, %xmm10
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm8, %xmm0
; SSE-NEXT:    paddq %xmm10, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm9, %xmm8
; SSE-NEXT:    movdqa %xmm9, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm1, %xmm10
; SSE-NEXT:    paddq %xmm8, %xmm10
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm9, %xmm1
; SSE-NEXT:    paddq %xmm10, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm9
; SSE-NEXT:    psrlq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    movdqa %xmm8, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm2, %xmm10
; SSE-NEXT:    paddq %xmm9, %xmm10
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm8, %xmm2
; SSE-NEXT:    paddq %xmm10, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm9, %xmm8
; SSE-NEXT:    movdqa %xmm9, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm3, %xmm10
; SSE-NEXT:    paddq %xmm8, %xmm10
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm9, %xmm3
; SSE-NEXT:    paddq %xmm10, %xmm3
; SSE-NEXT:    movdqa %xmm4, %xmm9
; SSE-NEXT:    psrlq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    movdqa %xmm8, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm4, %xmm10
; SSE-NEXT:    paddq %xmm9, %xmm10
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm8, %xmm4
; SSE-NEXT:    paddq %xmm10, %xmm4
; SSE-NEXT:    movdqa %xmm5, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm9, %xmm8
; SSE-NEXT:    movdqa %xmm9, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm5, %xmm10
; SSE-NEXT:    paddq %xmm8, %xmm10
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm9, %xmm5
; SSE-NEXT:    paddq %xmm10, %xmm5
; SSE-NEXT:    movdqa %xmm6, %xmm9
; SSE-NEXT:    psrlq $32, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    movdqa %xmm8, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm6, %xmm10
; SSE-NEXT:    paddq %xmm9, %xmm10
; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm8, %xmm6
; SSE-NEXT:    paddq %xmm10, %xmm6
; SSE-NEXT:    movdqa %xmm7, %xmm8
; SSE-NEXT:    psrlq $32, %xmm8
; SSE-NEXT:    pmuludq %xmm9, %xmm8
; SSE-NEXT:    movdqa %xmm9, %xmm10
; SSE-NEXT:    psrlq $32, %xmm10
; SSE-NEXT:    pmuludq %xmm7, %xmm10
; SSE-NEXT:    paddq %xmm8, %xmm10
; SSE-NEXT:    pmuludq %xmm9, %xmm7
; SSE-NEXT:    psllq $32, %xmm10
; SSE-NEXT:    paddq %xmm10, %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_v16i64_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm8
; AVX1-NEXT:    vpmuludq %xmm4, %xmm8, %xmm8
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm9
; AVX1-NEXT:    vpmuludq %xmm9, %xmm0, %xmm9
; AVX1-NEXT:    vpaddq %xmm8, %xmm9, %xmm8
; AVX1-NEXT:    vpsllq $32, %xmm8, %xmm8
; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm9
; AVX1-NEXT:    vpaddq %xmm8, %xmm9, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm9
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
; AVX1-NEXT:    vpmuludq %xmm9, %xmm4, %xmm10
; AVX1-NEXT:    vpsrlq $32, %xmm9, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpaddq %xmm10, %xmm4, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm9, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm9
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpmuludq %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm0
; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm10
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm5
; AVX1-NEXT:    vpmuludq %xmm0, %xmm5, %xmm5
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm0
; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm6, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm4
; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT:    vpmuludq %xmm0, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm7, %xmm4
; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm4
; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm6
; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm6
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm7
; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm7
; AVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm10, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm9, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_v16i64_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT:    vpmulld %xmm7, %xmm3, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpmovqd %zmm3, %ymm3
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpmovqd %zmm3, %ymm3
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovqd %zmm2, %ymm2
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
; AVX512DQ:       # BB#0:
; AVX512DQ-NEXT:    vpmullq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    retq
  %1 = mul <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i32_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm8, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm6, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm7, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm4, %xmm5
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_v16i32_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_v16i32_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_mul_v16i32_v16i8:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = mul <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i16_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_v16i16_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_v16i16_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
; AVX512DQ:       # BB#0:
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    retq
  %1 = mul <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; mul to constant
;

define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,3]
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    movl $1, %eax
; SSE-NEXT:    movd %rax, %xmm2
; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pmuludq %xmm2, %xmm3
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX512:       # BB#0:
; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movl $1, %eax
; SSE-NEXT:    movd %rax, %xmm8
; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    pmuludq %xmm8, %xmm0
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    paddq %xmm9, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [2,3]
; SSE-NEXT:    movdqa %xmm1, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm1
; SSE-NEXT:    pmuludq %xmm8, %xmm1
; SSE-NEXT:    psllq $32, %xmm1
; SSE-NEXT:    paddq %xmm9, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [4,5]
; SSE-NEXT:    movdqa %xmm2, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm2
; SSE-NEXT:    pmuludq %xmm8, %xmm2
; SSE-NEXT:    psllq $32, %xmm2
; SSE-NEXT:    paddq %xmm9, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [6,7]
; SSE-NEXT:    movdqa %xmm3, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm3
; SSE-NEXT:    pmuludq %xmm8, %xmm3
; SSE-NEXT:    psllq $32, %xmm3
; SSE-NEXT:    paddq %xmm9, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [8,9]
; SSE-NEXT:    movdqa %xmm4, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm4
; SSE-NEXT:    pmuludq %xmm8, %xmm4
; SSE-NEXT:    psllq $32, %xmm4
; SSE-NEXT:    paddq %xmm9, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [10,11]
; SSE-NEXT:    movdqa %xmm5, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm5
; SSE-NEXT:    pmuludq %xmm8, %xmm5
; SSE-NEXT:    psllq $32, %xmm5
; SSE-NEXT:    paddq %xmm9, %xmm5
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [12,13]
; SSE-NEXT:    movdqa %xmm6, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm6
; SSE-NEXT:    pmuludq %xmm8, %xmm6
; SSE-NEXT:    psllq $32, %xmm6
; SSE-NEXT:    paddq %xmm9, %xmm6
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [14,15]
; SSE-NEXT:    movdqa %xmm7, %xmm9
; SSE-NEXT:    pmuludq %xmm8, %xmm9
; SSE-NEXT:    psrlq $32, %xmm7
; SSE-NEXT:    pmuludq %xmm8, %xmm7
; SSE-NEXT:    psllq $32, %xmm7
; SSE-NEXT:    paddq %xmm9, %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    movl $1, %eax
; AVX1-NEXT:    vmovq %rax, %xmm4
; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [2,3]
; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm6
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm9
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5]
; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm6
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm7
; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm7
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [12,13]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm0
; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [14,15]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm3
; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm6, %xmm9, %xmm2
; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512DQ:       # BB#0:
; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    retq
  %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}
2613
2614define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2615; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2616; SSE:       # BB#0:
2617; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,2,3]
2618; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2619; SSE-NEXT:    pmuludq %xmm4, %xmm0
2620; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2621; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2622; SSE-NEXT:    pmuludq %xmm5, %xmm4
2623; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2624; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2625; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [4,5,6,7]
2626; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2627; SSE-NEXT:    pmuludq %xmm4, %xmm1
2628; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2629; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2630; SSE-NEXT:    pmuludq %xmm5, %xmm4
2631; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2632; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2633; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,9,10,11]
2634; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2635; SSE-NEXT:    pmuludq %xmm4, %xmm2
2636; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2637; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2638; SSE-NEXT:    pmuludq %xmm5, %xmm4
2639; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2640; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2641; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [12,13,14,15]
2642; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2643; SSE-NEXT:    pmuludq %xmm4, %xmm3
2644; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2645; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2646; SSE-NEXT:    pmuludq %xmm5, %xmm4
2647; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2648; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2649; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2650; SSE-NEXT:    pand %xmm4, %xmm3
2651; SSE-NEXT:    pand %xmm4, %xmm2
2652; SSE-NEXT:    packuswb %xmm3, %xmm2
2653; SSE-NEXT:    pand %xmm4, %xmm1
2654; SSE-NEXT:    pand %xmm4, %xmm0
2655; SSE-NEXT:    packuswb %xmm1, %xmm0
2656; SSE-NEXT:    packuswb %xmm2, %xmm0
2657; SSE-NEXT:    retq
2658;
2659; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2660; AVX1:       # BB#0:
2661; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
2662; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2663; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2664; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm3
2665; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2666; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
2667; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2668; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2669; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2670; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
2671; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2672; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2673; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
2674; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2675; AVX1-NEXT:    vzeroupper
2676; AVX1-NEXT:    retq
2677;
2678; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2679; AVX2:       # BB#0:
2680; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2681; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2682; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2683; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
2684; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2685; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2686; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2687; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2688; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2689; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2690; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2691; AVX2-NEXT:    vzeroupper
2692; AVX2-NEXT:    retq
2693;
2694; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2695; AVX512:       # BB#0:
2696; AVX512-NEXT:    vpmulld {{.*}}(%rip), %zmm0, %zmm0
2697; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2698; AVX512-NEXT:    retq
2699  %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2700  %2 = trunc <16 x i32> %1 to <16 x i8>
2701  ret <16 x i8> %2
2702}
2703
2704define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2705; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2706; SSE:       # BB#0:
2707; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
2708; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
2709; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2710; SSE-NEXT:    pand %xmm2, %xmm1
2711; SSE-NEXT:    pand %xmm2, %xmm0
2712; SSE-NEXT:    packuswb %xmm1, %xmm0
2713; SSE-NEXT:    retq
2714;
2715; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2716; AVX1:       # BB#0:
2717; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
2718; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2719; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2720; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2721; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2722; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2723; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2724; AVX1-NEXT:    vzeroupper
2725; AVX1-NEXT:    retq
2726;
2727; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2728; AVX2:       # BB#0:
2729; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2730; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2731; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2732; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2733; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2734; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2735; AVX2-NEXT:    vzeroupper
2736; AVX2-NEXT:    retq
2737;
2738; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2739; AVX512F:       # BB#0:
2740; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2741; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
2742; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2743; AVX512F-NEXT:    retq
2744;
2745; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2746; AVX512BW:       # BB#0:
2747; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2748; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2749; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2750; AVX512BW-NEXT:    retq
2751;
2752; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2753; AVX512DQ:       # BB#0:
2754; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2755; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
2756; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2757; AVX512DQ-NEXT:    retq
2758  %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2759  %2 = trunc <16 x i16> %1 to <16 x i8>
2760  ret <16 x i8> %2
2761}
2762
2763;
2764; and
2765;
2766
2767define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2768; SSE-LABEL: trunc_and_v4i64_v4i32:
2769; SSE:       # BB#0:
2770; SSE-NEXT:    andps %xmm3, %xmm1
2771; SSE-NEXT:    andps %xmm2, %xmm0
2772; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2773; SSE-NEXT:    retq
2774;
2775; AVX1-LABEL: trunc_and_v4i64_v4i32:
2776; AVX1:       # BB#0:
2777; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2778; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2779; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2780; AVX1-NEXT:    vzeroupper
2781; AVX1-NEXT:    retq
2782;
2783; AVX2-LABEL: trunc_and_v4i64_v4i32:
2784; AVX2:       # BB#0:
2785; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2786; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2787; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2788; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2789; AVX2-NEXT:    vzeroupper
2790; AVX2-NEXT:    retq
2791;
2792; AVX512-LABEL: trunc_and_v4i64_v4i32:
2793; AVX512:       # BB#0:
2794; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2795; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2796; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2797; AVX512-NEXT:    retq
2798  %1 = and <4 x i64> %a0, %a1
2799  %2 = trunc <4 x i64> %1 to <4 x i32>
2800  ret <4 x i32> %2
2801}
2802
2803define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2804; SSE-LABEL: trunc_and_v8i64_v8i16:
2805; SSE:       # BB#0:
2806; SSE-NEXT:    pand %xmm4, %xmm0
2807; SSE-NEXT:    pand %xmm5, %xmm1
2808; SSE-NEXT:    pand %xmm6, %xmm2
2809; SSE-NEXT:    pand %xmm7, %xmm3
2810; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2811; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
2812; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2813; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
2814; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2815; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2816; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2817; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2818; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2819; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2820; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2821; SSE-NEXT:    movapd %xmm2, %xmm0
2822; SSE-NEXT:    retq
2823;
2824; AVX1-LABEL: trunc_and_v8i64_v8i16:
2825; AVX1:       # BB#0:
2826; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2827; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2828; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2829; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2830; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
2831; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
2832; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
2833; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2834; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
2835; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
2836; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2837; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2838; AVX1-NEXT:    vzeroupper
2839; AVX1-NEXT:    retq
2840;
2841; AVX2-LABEL: trunc_and_v8i64_v8i16:
2842; AVX2:       # BB#0:
2843; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2844; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2845; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2846; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2847; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2848; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2849; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2850; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
2851; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2852; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2853; AVX2-NEXT:    vzeroupper
2854; AVX2-NEXT:    retq
2855;
2856; AVX512-LABEL: trunc_and_v8i64_v8i16:
2857; AVX512:       # BB#0:
2858; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2859; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2860; AVX512-NEXT:    retq
2861  %1 = and <8 x i64> %a0, %a1
2862  %2 = trunc <8 x i64> %1 to <8 x i16>
2863  ret <8 x i16> %2
2864}
2865
2866define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2867; SSE-LABEL: trunc_and_v8i32_v8i16:
2868; SSE:       # BB#0:
2869; SSE-NEXT:    pand %xmm2, %xmm0
2870; SSE-NEXT:    pand %xmm3, %xmm1
2871; SSE-NEXT:    pslld $16, %xmm1
2872; SSE-NEXT:    psrad $16, %xmm1
2873; SSE-NEXT:    pslld $16, %xmm0
2874; SSE-NEXT:    psrad $16, %xmm0
2875; SSE-NEXT:    packssdw %xmm1, %xmm0
2876; SSE-NEXT:    retq
2877;
2878; AVX1-LABEL: trunc_and_v8i32_v8i16:
2879; AVX1:       # BB#0:
2880; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2881; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2882; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2883; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2884; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2885; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2886; AVX1-NEXT:    vzeroupper
2887; AVX1-NEXT:    retq
2888;
2889; AVX2-LABEL: trunc_and_v8i32_v8i16:
2890; AVX2:       # BB#0:
2891; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2892; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
2893; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2894; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2895; AVX2-NEXT:    vzeroupper
2896; AVX2-NEXT:    retq
2897;
2898; AVX512-LABEL: trunc_and_v8i32_v8i16:
2899; AVX512:       # BB#0:
2900; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2901; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2902; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2903; AVX512-NEXT:    retq
2904  %1 = and <8 x i32> %a0, %a1
2905  %2 = trunc <8 x i32> %1 to <8 x i16>
2906  ret <8 x i16> %2
2907}
2908
2909define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2910; SSE-LABEL: trunc_and_v16i64_v16i8:
2911; SSE:       # BB#0:
2912; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
2913; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
2914; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
2915; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
2916; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
2917; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
2918; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
2919; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
2920; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2921; SSE-NEXT:    pand %xmm8, %xmm7
2922; SSE-NEXT:    pand %xmm8, %xmm6
2923; SSE-NEXT:    packuswb %xmm7, %xmm6
2924; SSE-NEXT:    pand %xmm8, %xmm5
2925; SSE-NEXT:    pand %xmm8, %xmm4
2926; SSE-NEXT:    packuswb %xmm5, %xmm4
2927; SSE-NEXT:    packuswb %xmm6, %xmm4
2928; SSE-NEXT:    pand %xmm8, %xmm3
2929; SSE-NEXT:    pand %xmm8, %xmm2
2930; SSE-NEXT:    packuswb %xmm3, %xmm2
2931; SSE-NEXT:    pand %xmm8, %xmm1
2932; SSE-NEXT:    pand %xmm8, %xmm0
2933; SSE-NEXT:    packuswb %xmm1, %xmm0
2934; SSE-NEXT:    packuswb %xmm2, %xmm0
2935; SSE-NEXT:    packuswb %xmm4, %xmm0
2936; SSE-NEXT:    retq
2937;
2938; AVX1-LABEL: trunc_and_v16i64_v16i8:
2939; AVX1:       # BB#0:
2940; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
2941; AVX1-NEXT:    vandps %ymm5, %ymm1, %ymm1
2942; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
2943; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
2944; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
2945; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2946; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
2947; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
2948; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
2949; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
2950; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
2951; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
2952; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
2953; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
2954; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2955; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
2956; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
2957; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
2958; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2959; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
2960; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
2961; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2962; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2963; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2964; AVX1-NEXT:    vzeroupper
2965; AVX1-NEXT:    retq
2966;
2967; AVX2-LABEL: trunc_and_v16i64_v16i8:
2968; AVX2:       # BB#0:
2969; AVX2-NEXT:    vpand %ymm5, %ymm1, %ymm1
2970; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
2971; AVX2-NEXT:    vpand %ymm7, %ymm3, %ymm3
2972; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
2973; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
2974; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2975; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
2976; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2977; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2978; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2979; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2980; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2981; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2982; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
2983; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2984; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2985; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
2986; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2987; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2988; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
2989; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2990; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
2991; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2992; AVX2-NEXT:    vzeroupper
2993; AVX2-NEXT:    retq
2994;
2995; AVX512F-LABEL: trunc_and_v16i64_v16i8:
2996; AVX512F:       # BB#0:
2997; AVX512F-NEXT:    vpandq %zmm3, %zmm1, %zmm1
2998; AVX512F-NEXT:    vpandq %zmm2, %zmm0, %zmm0
2999; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
3000; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
3001; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3002; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3003; AVX512F-NEXT:    retq
3004;
3005; AVX512BW-LABEL: trunc_and_v16i64_v16i8:
3006; AVX512BW:       # BB#0:
3007; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
3008; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
3009; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
3010; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
3011; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3012; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
3013; AVX512BW-NEXT:    retq
3014;
3015; AVX512DQ-LABEL: trunc_and_v16i64_v16i8:
3016; AVX512DQ:       # BB#0:
3017; AVX512DQ-NEXT:    vpandq %zmm3, %zmm1, %zmm1
3018; AVX512DQ-NEXT:    vpandq %zmm2, %zmm0, %zmm0
3019; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
3020; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
3021; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
3022; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3023; AVX512DQ-NEXT:    retq
3024  %1 = and <16 x i64> %a0, %a1
3025  %2 = trunc <16 x i64> %1 to <16 x i8>
3026  ret <16 x i8> %2
3027}
3028
3029define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3030; SSE-LABEL: trunc_and_v16i32_v16i8:
3031; SSE:       # BB#0:
3032; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3033; SSE-NEXT:    pand %xmm8, %xmm7
3034; SSE-NEXT:    pand %xmm3, %xmm7
3035; SSE-NEXT:    pand %xmm8, %xmm6
3036; SSE-NEXT:    pand %xmm2, %xmm6
3037; SSE-NEXT:    packuswb %xmm7, %xmm6
3038; SSE-NEXT:    pand %xmm8, %xmm5
3039; SSE-NEXT:    pand %xmm1, %xmm5
3040; SSE-NEXT:    pand %xmm8, %xmm4
3041; SSE-NEXT:    pand %xmm4, %xmm0
3042; SSE-NEXT:    packuswb %xmm5, %xmm0
3043; SSE-NEXT:    packuswb %xmm6, %xmm0
3044; SSE-NEXT:    retq
3045;
3046; AVX1-LABEL: trunc_and_v16i32_v16i8:
3047; AVX1:       # BB#0:
3048; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3049; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
3050; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3051; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3052; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3053; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
3054; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3055; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3056; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3057; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
3058; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3059; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3060; AVX1-NEXT:    vzeroupper
3061; AVX1-NEXT:    retq
3062;
3063; AVX2-LABEL: trunc_and_v16i32_v16i8:
3064; AVX2:       # BB#0:
3065; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3066; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3067; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3068; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3069; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3070; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3071; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3072; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3073; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3074; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3075; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3076; AVX2-NEXT:    vzeroupper
3077; AVX2-NEXT:    retq
3078;
3079; AVX512-LABEL: trunc_and_v16i32_v16i8:
3080; AVX512:       # BB#0:
3081; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
3082; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3083; AVX512-NEXT:    retq
3084  %1 = and <16 x i32> %a0, %a1
3085  %2 = trunc <16 x i32> %1 to <16 x i8>
3086  ret <16 x i8> %2
3087}
3088
3089define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3090; SSE-LABEL: trunc_and_v16i16_v16i8:
3091; SSE:       # BB#0:
3092; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3093; SSE-NEXT:    pand %xmm4, %xmm3
3094; SSE-NEXT:    pand %xmm1, %xmm3
3095; SSE-NEXT:    pand %xmm4, %xmm2
3096; SSE-NEXT:    pand %xmm2, %xmm0
3097; SSE-NEXT:    packuswb %xmm3, %xmm0
3098; SSE-NEXT:    retq
3099;
3100; AVX1-LABEL: trunc_and_v16i16_v16i8:
3101; AVX1:       # BB#0:
3102; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
3103; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3104; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3105; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3106; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3107; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3108; AVX1-NEXT:    vzeroupper
3109; AVX1-NEXT:    retq
3110;
3111; AVX2-LABEL: trunc_and_v16i16_v16i8:
3112; AVX2:       # BB#0:
3113; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
3114; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3115; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3116; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3117; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3118; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3119; AVX2-NEXT:    vzeroupper
3120; AVX2-NEXT:    retq
3121;
3122; AVX512F-LABEL: trunc_and_v16i16_v16i8:
3123; AVX512F:       # BB#0:
3124; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
3125; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
3126; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3127; AVX512F-NEXT:    retq
3128;
3129; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
3130; AVX512BW:       # BB#0:
3131; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
3132; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3133; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3134; AVX512BW-NEXT:    retq
3135;
3136; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
3137; AVX512DQ:       # BB#0:
3138; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
3139; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
3140; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3141; AVX512DQ-NEXT:    retq
3142  %1 = and <16 x i16> %a0, %a1
3143  %2 = trunc <16 x i16> %1 to <16 x i8>
3144  ret <16 x i8> %2
3145}
3146
3147;
3148; and to constant
3149;
3150
3151define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3152; SSE-LABEL: trunc_and_const_v4i64_v4i32:
3153; SSE:       # BB#0:
3154; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3155; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
3156; SSE-NEXT:    retq
3157;
3158; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
3159; AVX1:       # BB#0:
3160; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3161; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3162; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
3163; AVX1-NEXT:    vzeroupper
3164; AVX1-NEXT:    retq
3165;
3166; AVX2-LABEL: trunc_and_const_v4i64_v4i32:
3167; AVX2:       # BB#0:
3168; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3169; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3170; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3171; AVX2-NEXT:    vzeroupper
3172; AVX2-NEXT:    retq
3173;
3174; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
3175; AVX512:       # BB#0:
3176; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3177; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3178; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3179; AVX512-NEXT:    retq
3180  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3181  %2 = trunc <4 x i64> %1 to <4 x i32>
3182  ret <4 x i32> %2
3183}
3184
3185define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3186; SSE-LABEL: trunc_and_const_v8i64_v8i16:
3187; SSE:       # BB#0:
3188; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
3189; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
3190; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3191; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3192; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3193; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3194; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3195; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3196; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3197; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3198; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
3199; SSE-NEXT:    andpd {{.*}}(%rip), %xmm2
3200; SSE-NEXT:    movapd %xmm2, %xmm0
3201; SSE-NEXT:    retq
3202;
3203; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
3204; AVX1:       # BB#0:
3205; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3206; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3207; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3208; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3209; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
3210; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3211; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3212; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3213; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3214; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3215; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3216; AVX1-NEXT:    vzeroupper
3217; AVX1-NEXT:    retq
3218;
3219; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
3220; AVX2:       # BB#0:
3221; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3222; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3223; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3224; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3225; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3226; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3227; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3228; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3229; AVX2-NEXT:    vzeroupper
3230; AVX2-NEXT:    retq
3231;
3232; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3233; AVX512:       # BB#0:
3234; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3235; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3236; AVX512-NEXT:    retq
3237  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3238  %2 = trunc <8 x i64> %1 to <8 x i16>
3239  ret <8 x i16> %2
3240}
3241
3242define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3243; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3244; SSE:       # BB#0:
3245; SSE-NEXT:    pslld $16, %xmm1
3246; SSE-NEXT:    psrad $16, %xmm1
3247; SSE-NEXT:    pslld $16, %xmm0
3248; SSE-NEXT:    psrad $16, %xmm0
3249; SSE-NEXT:    packssdw %xmm1, %xmm0
3250; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3251; SSE-NEXT:    retq
3252;
3253; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3254; AVX1:       # BB#0:
3255; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3256; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3257; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3258; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3259; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3260; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3261; AVX1-NEXT:    vzeroupper
3262; AVX1-NEXT:    retq
3263;
3264; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3265; AVX2:       # BB#0:
3266; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3267; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3268; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3269; AVX2-NEXT:    vzeroupper
3270; AVX2-NEXT:    retq
3271;
3272; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3273; AVX512:       # BB#0:
3274; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3275; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3276; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3277; AVX512-NEXT:    retq
3278  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3279  %2 = trunc <8 x i32> %1 to <8 x i16>
3280  ret <8 x i16> %2
3281}
3282
3283define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3284; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3285; SSE:       # BB#0:
3286; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3287; SSE-NEXT:    pand %xmm8, %xmm7
3288; SSE-NEXT:    pand %xmm8, %xmm6
3289; SSE-NEXT:    packuswb %xmm7, %xmm6
3290; SSE-NEXT:    pand %xmm8, %xmm5
3291; SSE-NEXT:    pand %xmm8, %xmm4
3292; SSE-NEXT:    packuswb %xmm5, %xmm4
3293; SSE-NEXT:    packuswb %xmm6, %xmm4
3294; SSE-NEXT:    pand %xmm8, %xmm3
3295; SSE-NEXT:    pand %xmm8, %xmm2
3296; SSE-NEXT:    packuswb %xmm3, %xmm2
3297; SSE-NEXT:    pand %xmm8, %xmm1
3298; SSE-NEXT:    pand %xmm8, %xmm0
3299; SSE-NEXT:    packuswb %xmm1, %xmm0
3300; SSE-NEXT:    packuswb %xmm2, %xmm0
3301; SSE-NEXT:    packuswb %xmm4, %xmm0
3302; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3303; SSE-NEXT:    retq
3304;
3305; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3306; AVX1:       # BB#0:
3307; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3308; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3309; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3310; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3311; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
3312; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
3313; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3314; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
3315; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
3316; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
3317; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3318; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3319; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
3320; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
3321; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3322; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3323; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
3324; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
3325; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3326; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3327; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3328; AVX1-NEXT:    vzeroupper
3329; AVX1-NEXT:    retq
3330;
3331; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3332; AVX2:       # BB#0:
3333; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3334; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3335; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
3336; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
3337; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3338; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3339; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3340; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3341; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3342; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
3343; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3344; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3345; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3346; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3347; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3348; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3349; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3350; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
3351; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3352; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3353; AVX2-NEXT:    vzeroupper
3354; AVX2-NEXT:    retq
3355;
3356; AVX512F-LABEL: trunc_and_const_v16i64_v16i8:
3357; AVX512F:       # BB#0:
3358; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
3359; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
3360; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3361; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3362; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3363; AVX512F-NEXT:    retq
3364;
3365; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
3366; AVX512BW:       # BB#0:
3367; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
3368; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
3369; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3370; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
3371; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3372; AVX512BW-NEXT:    retq
3373;
3374; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
3375; AVX512DQ:       # BB#0:
3376; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
3377; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
3378; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
3379; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3380; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3381; AVX512DQ-NEXT:    retq
3382  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3383  %2 = trunc <16 x i64> %1 to <16 x i8>
3384  ret <16 x i8> %2
3385}
3386
3387define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3388; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3389; SSE:       # BB#0:
3390; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3391; SSE-NEXT:    pand %xmm4, %xmm3
3392; SSE-NEXT:    pand %xmm4, %xmm2
3393; SSE-NEXT:    packuswb %xmm3, %xmm2
3394; SSE-NEXT:    pand %xmm4, %xmm1
3395; SSE-NEXT:    pand %xmm4, %xmm0
3396; SSE-NEXT:    packuswb %xmm1, %xmm0
3397; SSE-NEXT:    packuswb %xmm2, %xmm0
3398; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3399; SSE-NEXT:    retq
3400;
3401; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3402; AVX1:       # BB#0:
3403; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3404; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3405; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3406; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
3407; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3408; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3409; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3410; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
3411; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3412; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3413; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3414; AVX1-NEXT:    vzeroupper
3415; AVX1-NEXT:    retq
3416;
3417; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3418; AVX2:       # BB#0:
3419; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3420; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3421; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3422; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3423; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3424; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3425; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3426; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3427; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3428; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3429; AVX2-NEXT:    vzeroupper
3430; AVX2-NEXT:    retq
3431;
3432; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3433; AVX512:       # BB#0:
3434; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3435; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3436; AVX512-NEXT:    retq
3437  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3438  %2 = trunc <16 x i32> %1 to <16 x i8>
3439  ret <16 x i8> %2
3440}
3441
3442define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3443; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3444; SSE:       # BB#0:
3445; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
3446; SSE-NEXT:    pand %xmm2, %xmm1
3447; SSE-NEXT:    pand %xmm2, %xmm0
3448; SSE-NEXT:    packuswb %xmm1, %xmm0
3449; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3450; SSE-NEXT:    retq
3451;
3452; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3453; AVX1:       # BB#0:
3454; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3455; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3456; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3457; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3458; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3459; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3460; AVX1-NEXT:    vzeroupper
3461; AVX1-NEXT:    retq
3462;
3463; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3464; AVX2:       # BB#0:
3465; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3466; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3467; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3468; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3469; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3470; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3471; AVX2-NEXT:    vzeroupper
3472; AVX2-NEXT:    retq
3473;
3474; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3475; AVX512F:       # BB#0:
3476; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
3477; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3478; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3479; AVX512F-NEXT:    retq
3480;
3481; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3482; AVX512BW:       # BB#0:
3483; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3484; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3485; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3486; AVX512BW-NEXT:    retq
3487;
3488; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3489; AVX512DQ:       # BB#0:
3490; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
3491; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3492; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3493; AVX512DQ-NEXT:    retq
3494  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3495  %2 = trunc <16 x i16> %1 to <16 x i8>
3496  ret <16 x i8> %2
3497}
3498
3499;
3500; xor
3501;
3502
3503define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3504; SSE-LABEL: trunc_xor_v4i64_v4i32:
3505; SSE:       # BB#0:
3506; SSE-NEXT:    xorps %xmm3, %xmm1
3507; SSE-NEXT:    xorps %xmm2, %xmm0
3508; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3509; SSE-NEXT:    retq
3510;
3511; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3512; AVX1:       # BB#0:
3513; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3514; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3515; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3516; AVX1-NEXT:    vzeroupper
3517; AVX1-NEXT:    retq
3518;
3519; AVX2-LABEL: trunc_xor_v4i64_v4i32:
3520; AVX2:       # BB#0:
3521; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3522; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3523; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3524; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3525; AVX2-NEXT:    vzeroupper
3526; AVX2-NEXT:    retq
3527;
3528; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3529; AVX512:       # BB#0:
3530; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3531; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3532; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3533; AVX512-NEXT:    retq
3534  %1 = xor <4 x i64> %a0, %a1
3535  %2 = trunc <4 x i64> %1 to <4 x i32>
3536  ret <4 x i32> %2
3537}
3538
3539define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3540; SSE-LABEL: trunc_xor_v8i64_v8i16:
3541; SSE:       # BB#0:
3542; SSE-NEXT:    pxor %xmm4, %xmm0
3543; SSE-NEXT:    pxor %xmm5, %xmm1
3544; SSE-NEXT:    pxor %xmm6, %xmm2
3545; SSE-NEXT:    pxor %xmm7, %xmm3
3546; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
3547; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
3548; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3549; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3550; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3551; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3552; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3553; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3554; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3555; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3556; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
3557; SSE-NEXT:    movapd %xmm2, %xmm0
3558; SSE-NEXT:    retq
3559;
3560; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3561; AVX1:       # BB#0:
3562; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3563; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3564; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3565; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3566; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3567; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3568; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
3569; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3570; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3571; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3572; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3573; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3574; AVX1-NEXT:    vzeroupper
3575; AVX1-NEXT:    retq
3576;
3577; AVX2-LABEL: trunc_xor_v8i64_v8i16:
3578; AVX2:       # BB#0:
3579; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3580; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3581; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3582; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3583; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3584; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3585; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3586; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3587; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3588; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3589; AVX2-NEXT:    vzeroupper
3590; AVX2-NEXT:    retq
3591;
3592; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3593; AVX512:       # BB#0:
3594; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
3595; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3596; AVX512-NEXT:    retq
3597  %1 = xor <8 x i64> %a0, %a1
3598  %2 = trunc <8 x i64> %1 to <8 x i16>
3599  ret <8 x i16> %2
3600}
3601
3602define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3603; SSE-LABEL: trunc_xor_v8i32_v8i16:
3604; SSE:       # BB#0:
3605; SSE-NEXT:    pxor %xmm2, %xmm0
3606; SSE-NEXT:    pxor %xmm3, %xmm1
3607; SSE-NEXT:    pslld $16, %xmm1
3608; SSE-NEXT:    psrad $16, %xmm1
3609; SSE-NEXT:    pslld $16, %xmm0
3610; SSE-NEXT:    psrad $16, %xmm0
3611; SSE-NEXT:    packssdw %xmm1, %xmm0
3612; SSE-NEXT:    retq
3613;
3614; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3615; AVX1:       # BB#0:
3616; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3617; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3618; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3619; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3620; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3621; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3622; AVX1-NEXT:    vzeroupper
3623; AVX1-NEXT:    retq
3624;
3625; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3626; AVX2:       # BB#0:
3627; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3628; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3629; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3630; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3631; AVX2-NEXT:    vzeroupper
3632; AVX2-NEXT:    retq
3633;
3634; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3635; AVX512:       # BB#0:
3636; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3637; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3638; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3639; AVX512-NEXT:    retq
3640  %1 = xor <8 x i32> %a0, %a1
3641  %2 = trunc <8 x i32> %1 to <8 x i16>
3642  ret <8 x i16> %2
3643}
3644
3645define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3646; SSE-LABEL: trunc_xor_v16i64_v16i8:
3647; SSE:       # BB#0:
3648; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
3649; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
3650; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
3651; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
3652; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
3653; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
3654; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
3655; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
3656; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3657; SSE-NEXT:    pand %xmm8, %xmm7
3658; SSE-NEXT:    pand %xmm8, %xmm6
3659; SSE-NEXT:    packuswb %xmm7, %xmm6
3660; SSE-NEXT:    pand %xmm8, %xmm5
3661; SSE-NEXT:    pand %xmm8, %xmm4
3662; SSE-NEXT:    packuswb %xmm5, %xmm4
3663; SSE-NEXT:    packuswb %xmm6, %xmm4
3664; SSE-NEXT:    pand %xmm8, %xmm3
3665; SSE-NEXT:    pand %xmm8, %xmm2
3666; SSE-NEXT:    packuswb %xmm3, %xmm2
3667; SSE-NEXT:    pand %xmm8, %xmm1
3668; SSE-NEXT:    pand %xmm8, %xmm0
3669; SSE-NEXT:    packuswb %xmm1, %xmm0
3670; SSE-NEXT:    packuswb %xmm2, %xmm0
3671; SSE-NEXT:    packuswb %xmm4, %xmm0
3672; SSE-NEXT:    retq
3673;
3674; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3675; AVX1:       # BB#0:
3676; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
3677; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
3678; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
3679; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
3680; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3681; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3682; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3683; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3684; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
3685; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
3686; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3687; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
3688; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
3689; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
3690; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3691; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3692; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
3693; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
3694; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3695; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3696; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
3697; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
3698; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3699; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3700; AVX1-NEXT:    vzeroupper
3701; AVX1-NEXT:    retq
3702;
3703; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3704; AVX2:       # BB#0:
3705; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm1
3706; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
3707; AVX2-NEXT:    vpxor %ymm7, %ymm3, %ymm3
3708; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
3709; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
3710; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3711; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
3712; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
3713; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3714; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3715; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3716; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3717; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3718; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
3719; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3720; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3721; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3722; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3723; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3724; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3725; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3726; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
3727; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3728; AVX2-NEXT:    vzeroupper
3729; AVX2-NEXT:    retq
3730;
3731; AVX512F-LABEL: trunc_xor_v16i64_v16i8:
3732; AVX512F:       # BB#0:
3733; AVX512F-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3734; AVX512F-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3735; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
3736; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
3737; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3738; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3739; AVX512F-NEXT:    retq
3740;
3741; AVX512BW-LABEL: trunc_xor_v16i64_v16i8:
3742; AVX512BW:       # BB#0:
3743; AVX512BW-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3744; AVX512BW-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3745; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
3746; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
3747; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3748; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
3749; AVX512BW-NEXT:    retq
3750;
3751; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8:
3752; AVX512DQ:       # BB#0:
3753; AVX512DQ-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3754; AVX512DQ-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3755; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
3756; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
3757; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
3758; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3759; AVX512DQ-NEXT:    retq
3760  %1 = xor <16 x i64> %a0, %a1
3761  %2 = trunc <16 x i64> %1 to <16 x i8>
3762  ret <16 x i8> %2
3763}
3764
3765define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3766; SSE-LABEL: trunc_xor_v16i32_v16i8:
3767; SSE:       # BB#0:
3768; SSE-NEXT:    pxor %xmm4, %xmm0
3769; SSE-NEXT:    pxor %xmm5, %xmm1
3770; SSE-NEXT:    pxor %xmm6, %xmm2
3771; SSE-NEXT:    pxor %xmm7, %xmm3
3772; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3773; SSE-NEXT:    pand %xmm4, %xmm3
3774; SSE-NEXT:    pand %xmm4, %xmm2
3775; SSE-NEXT:    packuswb %xmm3, %xmm2
3776; SSE-NEXT:    pand %xmm4, %xmm1
3777; SSE-NEXT:    pand %xmm4, %xmm0
3778; SSE-NEXT:    packuswb %xmm1, %xmm0
3779; SSE-NEXT:    packuswb %xmm2, %xmm0
3780; SSE-NEXT:    retq
3781;
3782; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3783; AVX1:       # BB#0:
3784; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3785; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3786; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3787; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3788; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3789; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
3790; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3791; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3792; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3793; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
3794; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3795; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3796; AVX1-NEXT:    vzeroupper
3797; AVX1-NEXT:    retq
3798;
3799; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3800; AVX2:       # BB#0:
3801; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3802; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3803; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3804; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3805; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3806; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3807; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3808; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3809; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3810; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3811; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3812; AVX2-NEXT:    vzeroupper
3813; AVX2-NEXT:    retq
3814;
3815; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3816; AVX512:       # BB#0:
3817; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
3818; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3819; AVX512-NEXT:    retq
3820  %1 = xor <16 x i32> %a0, %a1
3821  %2 = trunc <16 x i32> %1 to <16 x i8>
3822  ret <16 x i8> %2
3823}
3824
3825define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3826; SSE-LABEL: trunc_xor_v16i16_v16i8:
3827; SSE:       # BB#0:
3828; SSE-NEXT:    pxor %xmm2, %xmm0
3829; SSE-NEXT:    pxor %xmm3, %xmm1
3830; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
3831; SSE-NEXT:    pand %xmm2, %xmm1
3832; SSE-NEXT:    pand %xmm2, %xmm0
3833; SSE-NEXT:    packuswb %xmm1, %xmm0
3834; SSE-NEXT:    retq
3835;
3836; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3837; AVX1:       # BB#0:
3838; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3839; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3840; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3841; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3842; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3843; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3844; AVX1-NEXT:    vzeroupper
3845; AVX1-NEXT:    retq
3846;
3847; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3848; AVX2:       # BB#0:
3849; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3850; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3851; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3852; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3853; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3854; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3855; AVX2-NEXT:    vzeroupper
3856; AVX2-NEXT:    retq
3857;
3858; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3859; AVX512F:       # BB#0:
3860; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3861; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
3862; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3863; AVX512F-NEXT:    retq
3864;
3865; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3866; AVX512BW:       # BB#0:
3867; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3868; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3869; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3870; AVX512BW-NEXT:    retq
3871;
3872; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3873; AVX512DQ:       # BB#0:
3874; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3875; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
3876; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3877; AVX512DQ-NEXT:    retq
3878  %1 = xor <16 x i16> %a0, %a1
3879  %2 = trunc <16 x i16> %1 to <16 x i8>
3880  ret <16 x i8> %2
3881}
3882
3883;
3884; xor to constant
3885;
3886
3887define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3888; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3889; SSE:       # BB#0:
3890; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3891; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
3892; SSE-NEXT:    retq
3893;
3894; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3895; AVX1:       # BB#0:
3896; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3897; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3898; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
3899; AVX1-NEXT:    vzeroupper
3900; AVX1-NEXT:    retq
3901;
3902; AVX2-LABEL: trunc_xor_const_v4i64_v4i32:
3903; AVX2:       # BB#0:
3904; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3905; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3906; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
3907; AVX2-NEXT:    vzeroupper
3908; AVX2-NEXT:    retq
3909;
3910; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3911; AVX512:       # BB#0:
3912; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
3913; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3914; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
3915; AVX512-NEXT:    retq
3916  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3917  %2 = trunc <4 x i64> %1 to <4 x i32>
3918  ret <4 x i32> %2
3919}
3920
3921define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3922; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3923; SSE:       # BB#0:
3924; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
3925; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
3926; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3927; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3928; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3929; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3930; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3931; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3932; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3933; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3934; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
3935; SSE-NEXT:    xorpd {{.*}}(%rip), %xmm2
3936; SSE-NEXT:    movapd %xmm2, %xmm0
3937; SSE-NEXT:    retq
3938;
3939; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3940; AVX1:       # BB#0:
3941; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3942; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3943; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3944; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3945; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
3946; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3947; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3948; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3949; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3950; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3951; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
3952; AVX1-NEXT:    vzeroupper
3953; AVX1-NEXT:    retq
3954;
3955; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
3956; AVX2:       # BB#0:
3957; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
3958; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3959; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
3960; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3961; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3962; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3963; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3964; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
3965; AVX2-NEXT:    vzeroupper
3966; AVX2-NEXT:    retq
3967;
3968; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
3969; AVX512:       # BB#0:
3970; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3971; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
3972; AVX512-NEXT:    retq
3973  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3974  %2 = trunc <8 x i64> %1 to <8 x i16>
3975  ret <8 x i16> %2
3976}
3977
3978define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3979; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
3980; SSE:       # BB#0:
3981; SSE-NEXT:    pslld $16, %xmm1
3982; SSE-NEXT:    psrad $16, %xmm1
3983; SSE-NEXT:    pslld $16, %xmm0
3984; SSE-NEXT:    psrad $16, %xmm0
3985; SSE-NEXT:    packssdw %xmm1, %xmm0
3986; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
3987; SSE-NEXT:    retq
3988;
3989; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
3990; AVX1:       # BB#0:
3991; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3992; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3993; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3994; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3995; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3996; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
3997; AVX1-NEXT:    vzeroupper
3998; AVX1-NEXT:    retq
3999;
4000; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
4001; AVX2:       # BB#0:
4002; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4003; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4004; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4005; AVX2-NEXT:    vzeroupper
4006; AVX2-NEXT:    retq
4007;
4008; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
4009; AVX512:       # BB#0:
4010; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
4011; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4012; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4013; AVX512-NEXT:    retq
4014  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4015  %2 = trunc <8 x i32> %1 to <8 x i16>
4016  ret <8 x i16> %2
4017}
4018
4019define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4020; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
4021; SSE:       # BB#0:
4022; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4023; SSE-NEXT:    pand %xmm8, %xmm7
4024; SSE-NEXT:    pand %xmm8, %xmm6
4025; SSE-NEXT:    packuswb %xmm7, %xmm6
4026; SSE-NEXT:    pand %xmm8, %xmm5
4027; SSE-NEXT:    pand %xmm8, %xmm4
4028; SSE-NEXT:    packuswb %xmm5, %xmm4
4029; SSE-NEXT:    packuswb %xmm6, %xmm4
4030; SSE-NEXT:    pand %xmm8, %xmm3
4031; SSE-NEXT:    pand %xmm8, %xmm2
4032; SSE-NEXT:    packuswb %xmm3, %xmm2
4033; SSE-NEXT:    pand %xmm8, %xmm1
4034; SSE-NEXT:    pand %xmm8, %xmm0
4035; SSE-NEXT:    packuswb %xmm1, %xmm0
4036; SSE-NEXT:    packuswb %xmm2, %xmm0
4037; SSE-NEXT:    packuswb %xmm4, %xmm0
4038; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4039; SSE-NEXT:    retq
4040;
4041; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
4042; AVX1:       # BB#0:
4043; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
4044; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4045; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4046; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4047; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
4048; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
4049; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4050; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
4051; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
4052; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
4053; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4054; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4055; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
4056; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
4057; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4058; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4059; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
4060; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
4061; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4062; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4063; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4064; AVX1-NEXT:    vzeroupper
4065; AVX1-NEXT:    retq
4066;
4067; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
4068; AVX2:       # BB#0:
4069; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4070; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4071; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
4072; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
4073; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4074; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4075; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4076; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4077; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4078; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4079; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4080; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4081; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4082; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4083; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4084; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4085; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4086; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
4087; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4088; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4089; AVX2-NEXT:    vzeroupper
4090; AVX2-NEXT:    retq
4091;
4092; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8:
4093; AVX512F:       # BB#0:
4094; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
4095; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
4096; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4097; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4098; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4099; AVX512F-NEXT:    retq
4100;
4101; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
4102; AVX512BW:       # BB#0:
4103; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
4104; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
4105; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4106; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
4107; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4108; AVX512BW-NEXT:    retq
4109;
4110; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
4111; AVX512DQ:       # BB#0:
4112; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
4113; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
4114; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
4115; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4116; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4117; AVX512DQ-NEXT:    retq
4118  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4119  %2 = trunc <16 x i64> %1 to <16 x i8>
4120  ret <16 x i8> %2
4121}
4122
4123define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4124; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
4125; SSE:       # BB#0:
4126; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4127; SSE-NEXT:    pand %xmm4, %xmm3
4128; SSE-NEXT:    pand %xmm4, %xmm2
4129; SSE-NEXT:    packuswb %xmm3, %xmm2
4130; SSE-NEXT:    pand %xmm4, %xmm1
4131; SSE-NEXT:    pand %xmm4, %xmm0
4132; SSE-NEXT:    packuswb %xmm1, %xmm0
4133; SSE-NEXT:    packuswb %xmm2, %xmm0
4134; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4135; SSE-NEXT:    retq
4136;
4137; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
4138; AVX1:       # BB#0:
4139; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4140; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4141; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4142; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
4143; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
4144; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4145; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4146; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
4147; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4148; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4149; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4150; AVX1-NEXT:    vzeroupper
4151; AVX1-NEXT:    retq
4152;
4153; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
4154; AVX2:       # BB#0:
4155; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4156; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4157; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4158; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4159; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
4160; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4161; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4162; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
4163; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4164; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4165; AVX2-NEXT:    vzeroupper
4166; AVX2-NEXT:    retq
4167;
4168; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
4169; AVX512:       # BB#0:
4170; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4171; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4172; AVX512-NEXT:    retq
4173  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4174  %2 = trunc <16 x i32> %1 to <16 x i8>
4175  ret <16 x i8> %2
4176}
4177
4178define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4179; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
4180; SSE:       # BB#0:
4181; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4182; SSE-NEXT:    pand %xmm2, %xmm1
4183; SSE-NEXT:    pand %xmm2, %xmm0
4184; SSE-NEXT:    packuswb %xmm1, %xmm0
4185; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4186; SSE-NEXT:    retq
4187;
4188; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
4189; AVX1:       # BB#0:
4190; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4191; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4192; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4193; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4194; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4195; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4196; AVX1-NEXT:    vzeroupper
4197; AVX1-NEXT:    retq
4198;
4199; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
4200; AVX2:       # BB#0:
4201; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4202; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4203; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4204; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4205; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4206; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4207; AVX2-NEXT:    vzeroupper
4208; AVX2-NEXT:    retq
4209;
4210; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
4211; AVX512F:       # BB#0:
4212; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
4213; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4214; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4215; AVX512F-NEXT:    retq
4216;
4217; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
4218; AVX512BW:       # BB#0:
4219; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
4220; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4221; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4222; AVX512BW-NEXT:    retq
4223;
4224; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
4225; AVX512DQ:       # BB#0:
4226; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
4227; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4228; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4229; AVX512DQ-NEXT:    retq
4230  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4231  %2 = trunc <16 x i16> %1 to <16 x i8>
4232  ret <16 x i8> %2
4233}
4234
4235;
4236; or
4237;
4238
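; The or+trunc tests mirror the xor variants above: the bitwise or is done at the source
; width and the result is then narrowed with the usual shuffle/pack (SSE/AVX) or vpmov*
; (AVX512) sequences.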
4239define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
4240; SSE-LABEL: trunc_or_v4i64_v4i32:
4241; SSE:       # BB#0:
4242; SSE-NEXT:    orps %xmm3, %xmm1
4243; SSE-NEXT:    orps %xmm2, %xmm0
4244; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4245; SSE-NEXT:    retq
4246;
4247; AVX1-LABEL: trunc_or_v4i64_v4i32:
4248; AVX1:       # BB#0:
4249; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4250; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4251; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4252; AVX1-NEXT:    vzeroupper
4253; AVX1-NEXT:    retq
4254;
4255; AVX2-LABEL: trunc_or_v4i64_v4i32:
4256; AVX2:       # BB#0:
4257; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4258; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4259; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4260; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4261; AVX2-NEXT:    vzeroupper
4262; AVX2-NEXT:    retq
4263;
4264; AVX512-LABEL: trunc_or_v4i64_v4i32:
4265; AVX512:       # BB#0:
4266; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
4267; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4268; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4269; AVX512-NEXT:    retq
4270  %1 = or <4 x i64> %a0, %a1
4271  %2 = trunc <4 x i64> %1 to <4 x i32>
4272  ret <4 x i32> %2
4273}
4274
4275define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
4276; SSE-LABEL: trunc_or_v8i64_v8i16:
4277; SSE:       # BB#0:
4278; SSE-NEXT:    por %xmm4, %xmm0
4279; SSE-NEXT:    por %xmm5, %xmm1
4280; SSE-NEXT:    por %xmm6, %xmm2
4281; SSE-NEXT:    por %xmm7, %xmm3
4282; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
4283; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
4284; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4285; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
4286; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4287; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4288; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4289; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4290; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4291; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4292; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
4293; SSE-NEXT:    movapd %xmm2, %xmm0
4294; SSE-NEXT:    retq
4295;
4296; AVX1-LABEL: trunc_or_v8i64_v8i16:
4297; AVX1:       # BB#0:
4298; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4299; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4300; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4301; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
4302; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4303; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4304; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
4305; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4306; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4307; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4308; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4309; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4310; AVX1-NEXT:    vzeroupper
4311; AVX1-NEXT:    retq
4312;
4313; AVX2-LABEL: trunc_or_v8i64_v8i16:
4314; AVX2:       # BB#0:
4315; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4316; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4317; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4318; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4319; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4320; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4321; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4322; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4323; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4324; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4325; AVX2-NEXT:    vzeroupper
4326; AVX2-NEXT:    retq
4327;
4328; AVX512-LABEL: trunc_or_v8i64_v8i16:
4329; AVX512:       # BB#0:
4330; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
4331; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4332; AVX512-NEXT:    retq
4333  %1 = or <8 x i64> %a0, %a1
4334  %2 = trunc <8 x i64> %1 to <8 x i16>
4335  ret <8 x i16> %2
4336}
4337
4338define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4339; SSE-LABEL: trunc_or_v8i32_v8i16:
4340; SSE:       # BB#0:
4341; SSE-NEXT:    por %xmm2, %xmm0
4342; SSE-NEXT:    por %xmm3, %xmm1
4343; SSE-NEXT:    pslld $16, %xmm1
4344; SSE-NEXT:    psrad $16, %xmm1
4345; SSE-NEXT:    pslld $16, %xmm0
4346; SSE-NEXT:    psrad $16, %xmm0
4347; SSE-NEXT:    packssdw %xmm1, %xmm0
4348; SSE-NEXT:    retq
4349;
4350; AVX1-LABEL: trunc_or_v8i32_v8i16:
4351; AVX1:       # BB#0:
4352; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4353; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4354; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4355; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4356; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4357; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4358; AVX1-NEXT:    vzeroupper
4359; AVX1-NEXT:    retq
4360;
4361; AVX2-LABEL: trunc_or_v8i32_v8i16:
4362; AVX2:       # BB#0:
4363; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4364; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4365; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4366; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4367; AVX2-NEXT:    vzeroupper
4368; AVX2-NEXT:    retq
4369;
4370; AVX512-LABEL: trunc_or_v8i32_v8i16:
4371; AVX512:       # BB#0:
4372; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
4373; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4374; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4375; AVX512-NEXT:    retq
4376  %1 = or <8 x i32> %a0, %a1
4377  %2 = trunc <8 x i32> %1 to <8 x i16>
4378  ret <8 x i16> %2
4379}
4380
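; Each <16 x i64> argument needs eight xmm registers, so in the SSE lowering the second
; operand is taken from the stack (por with %rsp-relative memory operands).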
4381define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4382; SSE-LABEL: trunc_or_v16i64_v16i8:
4383; SSE:       # BB#0:
4384; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
4385; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
4386; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
4387; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
4388; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
4389; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
4390; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
4391; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
4392; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4393; SSE-NEXT:    pand %xmm8, %xmm7
4394; SSE-NEXT:    pand %xmm8, %xmm6
4395; SSE-NEXT:    packuswb %xmm7, %xmm6
4396; SSE-NEXT:    pand %xmm8, %xmm5
4397; SSE-NEXT:    pand %xmm8, %xmm4
4398; SSE-NEXT:    packuswb %xmm5, %xmm4
4399; SSE-NEXT:    packuswb %xmm6, %xmm4
4400; SSE-NEXT:    pand %xmm8, %xmm3
4401; SSE-NEXT:    pand %xmm8, %xmm2
4402; SSE-NEXT:    packuswb %xmm3, %xmm2
4403; SSE-NEXT:    pand %xmm8, %xmm1
4404; SSE-NEXT:    pand %xmm8, %xmm0
4405; SSE-NEXT:    packuswb %xmm1, %xmm0
4406; SSE-NEXT:    packuswb %xmm2, %xmm0
4407; SSE-NEXT:    packuswb %xmm4, %xmm0
4408; SSE-NEXT:    retq
4409;
4410; AVX1-LABEL: trunc_or_v16i64_v16i8:
4411; AVX1:       # BB#0:
4412; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
4413; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
4414; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
4415; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
4416; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
4417; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4418; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4419; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4420; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
4421; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
4422; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4423; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
4424; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
4425; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
4426; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4427; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4428; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
4429; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
4430; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4431; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4432; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
4433; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
4434; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4435; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4436; AVX1-NEXT:    vzeroupper
4437; AVX1-NEXT:    retq
4438;
4439; AVX2-LABEL: trunc_or_v16i64_v16i8:
4440; AVX2:       # BB#0:
4441; AVX2-NEXT:    vpor %ymm5, %ymm1, %ymm1
4442; AVX2-NEXT:    vpor %ymm4, %ymm0, %ymm0
4443; AVX2-NEXT:    vpor %ymm7, %ymm3, %ymm3
4444; AVX2-NEXT:    vpor %ymm6, %ymm2, %ymm2
4445; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4446; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4447; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
4448; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
4449; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4450; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4451; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4452; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4453; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4454; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4455; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4456; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4457; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4458; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4459; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4460; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4461; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4462; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
4463; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4464; AVX2-NEXT:    vzeroupper
4465; AVX2-NEXT:    retq
4466;
4467; AVX512F-LABEL: trunc_or_v16i64_v16i8:
4468; AVX512F:       # BB#0:
4469; AVX512F-NEXT:    vporq %zmm3, %zmm1, %zmm1
4470; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
4471; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
4472; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
4473; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4474; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4475; AVX512F-NEXT:    retq
4476;
4477; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
4478; AVX512BW:       # BB#0:
4479; AVX512BW-NEXT:    vporq %zmm3, %zmm1, %zmm1
4480; AVX512BW-NEXT:    vporq %zmm2, %zmm0, %zmm0
4481; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
4482; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
4483; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4484; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
4485; AVX512BW-NEXT:    retq
4486;
4487; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
4488; AVX512DQ:       # BB#0:
4489; AVX512DQ-NEXT:    vporq %zmm3, %zmm1, %zmm1
4490; AVX512DQ-NEXT:    vporq %zmm2, %zmm0, %zmm0
4491; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
4492; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
4493; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
4494; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4495; AVX512DQ-NEXT:    retq
4496  %1 = or <16 x i64> %a0, %a1
4497  %2 = trunc <16 x i64> %1 to <16 x i8>
4498  ret <16 x i8> %2
4499}
4500
4501define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4502; SSE-LABEL: trunc_or_v16i32_v16i8:
4503; SSE:       # BB#0:
4504; SSE-NEXT:    por %xmm4, %xmm0
4505; SSE-NEXT:    por %xmm5, %xmm1
4506; SSE-NEXT:    por %xmm6, %xmm2
4507; SSE-NEXT:    por %xmm7, %xmm3
4508; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4509; SSE-NEXT:    pand %xmm4, %xmm3
4510; SSE-NEXT:    pand %xmm4, %xmm2
4511; SSE-NEXT:    packuswb %xmm3, %xmm2
4512; SSE-NEXT:    pand %xmm4, %xmm1
4513; SSE-NEXT:    pand %xmm4, %xmm0
4514; SSE-NEXT:    packuswb %xmm1, %xmm0
4515; SSE-NEXT:    packuswb %xmm2, %xmm0
4516; SSE-NEXT:    retq
4517;
4518; AVX1-LABEL: trunc_or_v16i32_v16i8:
4519; AVX1:       # BB#0:
4520; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4521; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4522; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4523; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4524; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4525; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
4526; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
4527; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4528; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4529; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
4530; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4531; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4532; AVX1-NEXT:    vzeroupper
4533; AVX1-NEXT:    retq
4534;
4535; AVX2-LABEL: trunc_or_v16i32_v16i8:
4536; AVX2:       # BB#0:
4537; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4538; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4539; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4540; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4541; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4542; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4543; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
4544; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4545; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4546; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
4547; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4548; AVX2-NEXT:    vzeroupper
4549; AVX2-NEXT:    retq
4550;
4551; AVX512-LABEL: trunc_or_v16i32_v16i8:
4552; AVX512:       # BB#0:
4553; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
4554; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4555; AVX512-NEXT:    retq
4556  %1 = or <16 x i32> %a0, %a1
4557  %2 = trunc <16 x i32> %1 to <16 x i8>
4558  ret <16 x i8> %2
4559}
4560
4561define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4562; SSE-LABEL: trunc_or_v16i16_v16i8:
4563; SSE:       # BB#0:
4564; SSE-NEXT:    por %xmm2, %xmm0
4565; SSE-NEXT:    por %xmm3, %xmm1
4566; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4567; SSE-NEXT:    pand %xmm2, %xmm1
4568; SSE-NEXT:    pand %xmm2, %xmm0
4569; SSE-NEXT:    packuswb %xmm1, %xmm0
4570; SSE-NEXT:    retq
4571;
4572; AVX1-LABEL: trunc_or_v16i16_v16i8:
4573; AVX1:       # BB#0:
4574; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4575; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4576; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4577; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4578; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4579; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4580; AVX1-NEXT:    vzeroupper
4581; AVX1-NEXT:    retq
4582;
4583; AVX2-LABEL: trunc_or_v16i16_v16i8:
4584; AVX2:       # BB#0:
4585; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4586; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4587; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4588; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4589; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4590; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4591; AVX2-NEXT:    vzeroupper
4592; AVX2-NEXT:    retq
4593;
4594; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4595; AVX512F:       # BB#0:
4596; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
4597; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
4598; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4599; AVX512F-NEXT:    retq
4600;
4601; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4602; AVX512BW:       # BB#0:
4603; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
4604; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4605; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4606; AVX512BW-NEXT:    retq
4607;
4608; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4609; AVX512DQ:       # BB#0:
4610; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
4611; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
4612; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4613; AVX512DQ-NEXT:    retq
4614  %1 = or <16 x i16> %a0, %a1
4615  %2 = trunc <16 x i16> %1 to <16 x i8>
4616  ret <16 x i8> %2
4617}
4618
4619;
4620; or to constant
4621;
4622
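; As with the xor-to-constant tests, the or against the constant vector happens after the
; truncation, as a single xmm-width or with a rip-relative constant-pool operand.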
4623define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4624; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4625; SSE:       # BB#0:
4626; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4627; SSE-NEXT:    orps {{.*}}(%rip), %xmm0
4628; SSE-NEXT:    retq
4629;
4630; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4631; AVX1:       # BB#0:
4632; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4633; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4634; AVX1-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
4635; AVX1-NEXT:    vzeroupper
4636; AVX1-NEXT:    retq
4637;
4638; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
4639; AVX2:       # BB#0:
4640; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4641; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4642; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4643; AVX2-NEXT:    vzeroupper
4644; AVX2-NEXT:    retq
4645;
4646; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4647; AVX512:       # BB#0:
4648; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
4649; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4650; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4651; AVX512-NEXT:    retq
4652  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4653  %2 = trunc <4 x i64> %1 to <4 x i32>
4654  ret <4 x i32> %2
4655}
4656
4657define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4658; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4659; SSE:       # BB#0:
4660; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
4661; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
4662; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4663; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
4664; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4665; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4666; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4667; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4668; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4669; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4670; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
4671; SSE-NEXT:    orpd {{.*}}(%rip), %xmm2
4672; SSE-NEXT:    movapd %xmm2, %xmm0
4673; SSE-NEXT:    retq
4674;
4675; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4676; AVX1:       # BB#0:
4677; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4678; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
4679; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4680; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4681; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
4682; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4683; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4684; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4685; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4686; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4687; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4688; AVX1-NEXT:    vzeroupper
4689; AVX1-NEXT:    retq
4690;
4691; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
4692; AVX2:       # BB#0:
4693; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4694; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4695; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4696; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4697; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4698; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4699; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4700; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4701; AVX2-NEXT:    vzeroupper
4702; AVX2-NEXT:    retq
4703;
4704; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4705; AVX512:       # BB#0:
4706; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4707; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4708; AVX512-NEXT:    retq
4709  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4710  %2 = trunc <8 x i64> %1 to <8 x i16>
4711  ret <8 x i16> %2
4712}
4713
4714define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4715; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4716; SSE:       # BB#0:
4717; SSE-NEXT:    pslld $16, %xmm1
4718; SSE-NEXT:    psrad $16, %xmm1
4719; SSE-NEXT:    pslld $16, %xmm0
4720; SSE-NEXT:    psrad $16, %xmm0
4721; SSE-NEXT:    packssdw %xmm1, %xmm0
4722; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4723; SSE-NEXT:    retq
4724;
4725; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4726; AVX1:       # BB#0:
4727; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4728; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4729; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4730; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4731; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4732; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4733; AVX1-NEXT:    vzeroupper
4734; AVX1-NEXT:    retq
4735;
4736; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4737; AVX2:       # BB#0:
4738; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4739; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4740; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4741; AVX2-NEXT:    vzeroupper
4742; AVX2-NEXT:    retq
4743;
4744; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4745; AVX512:       # BB#0:
4746; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
4747; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4748; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4749; AVX512-NEXT:    retq
4750  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4751  %2 = trunc <8 x i32> %1 to <8 x i16>
4752  ret <8 x i16> %2
4753}
4754
4755define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4756; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4757; SSE:       # BB#0:
4758; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4759; SSE-NEXT:    pand %xmm8, %xmm7
4760; SSE-NEXT:    pand %xmm8, %xmm6
4761; SSE-NEXT:    packuswb %xmm7, %xmm6
4762; SSE-NEXT:    pand %xmm8, %xmm5
4763; SSE-NEXT:    pand %xmm8, %xmm4
4764; SSE-NEXT:    packuswb %xmm5, %xmm4
4765; SSE-NEXT:    packuswb %xmm6, %xmm4
4766; SSE-NEXT:    pand %xmm8, %xmm3
4767; SSE-NEXT:    pand %xmm8, %xmm2
4768; SSE-NEXT:    packuswb %xmm3, %xmm2
4769; SSE-NEXT:    pand %xmm8, %xmm1
4770; SSE-NEXT:    pand %xmm8, %xmm0
4771; SSE-NEXT:    packuswb %xmm1, %xmm0
4772; SSE-NEXT:    packuswb %xmm2, %xmm0
4773; SSE-NEXT:    packuswb %xmm4, %xmm0
4774; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4775; SSE-NEXT:    retq
4776;
4777; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
4778; AVX1:       # BB#0:
4779; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
4780; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4781; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4782; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4783; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
4784; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
4785; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4786; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
4787; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
4788; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
4789; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4790; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4791; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
4792; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
4793; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4794; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4795; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
4796; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
4797; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4798; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4799; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4800; AVX1-NEXT:    vzeroupper
4801; AVX1-NEXT:    retq
4802;
4803; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
4804; AVX2:       # BB#0:
4805; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4806; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4807; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
4808; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
4809; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4810; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4811; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4812; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4813; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4814; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4815; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
4816; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4817; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4818; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4819; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4820; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4821; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4822; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
4823; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4824; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4825; AVX2-NEXT:    vzeroupper
4826; AVX2-NEXT:    retq
4827;
4828; AVX512F-LABEL: trunc_or_const_v16i64_v16i8:
4829; AVX512F:       # BB#0:
4830; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
4831; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
4832; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4833; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4834; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4835; AVX512F-NEXT:    retq
4836;
4837; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
4838; AVX512BW:       # BB#0:
4839; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
4840; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
4841; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4842; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
4843; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4844; AVX512BW-NEXT:    retq
4845;
4846; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
4847; AVX512DQ:       # BB#0:
4848; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
4849; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
4850; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
4851; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4852; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4853; AVX512DQ-NEXT:    retq
4854  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4855  %2 = trunc <16 x i64> %1 to <16 x i8>
4856  ret <16 x i8> %2
4857}
4858
4859define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4860; SSE-LABEL: trunc_or_const_v16i32_v16i8:
4861; SSE:       # BB#0:
4862; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4863; SSE-NEXT:    pand %xmm4, %xmm3
4864; SSE-NEXT:    pand %xmm4, %xmm2
4865; SSE-NEXT:    packuswb %xmm3, %xmm2
4866; SSE-NEXT:    pand %xmm4, %xmm1
4867; SSE-NEXT:    pand %xmm4, %xmm0
4868; SSE-NEXT:    packuswb %xmm1, %xmm0
4869; SSE-NEXT:    packuswb %xmm2, %xmm0
4870; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4871; SSE-NEXT:    retq
4872;
4873; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
4874; AVX1:       # BB#0:
4875; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4876; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4877; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4878; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
4879; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
4880; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4881; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4882; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
4883; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4884; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4885; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4886; AVX1-NEXT:    vzeroupper
4887; AVX1-NEXT:    retq
4888;
4889; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
4890; AVX2:       # BB#0:
4891; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4892; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4893; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4894; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4895; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
4896; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4897; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4898; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
4899; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4900; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4901; AVX2-NEXT:    vzeroupper
4902; AVX2-NEXT:    retq
4903;
4904; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
4905; AVX512:       # BB#0:
4906; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4907; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4908; AVX512-NEXT:    retq
4909  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4910  %2 = trunc <16 x i32> %1 to <16 x i8>
4911  ret <16 x i8> %2
4912}
4913
4914define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4915; SSE-LABEL: trunc_or_const_v16i16_v16i8:
4916; SSE:       # BB#0:
4917; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4918; SSE-NEXT:    pand %xmm2, %xmm1
4919; SSE-NEXT:    pand %xmm2, %xmm0
4920; SSE-NEXT:    packuswb %xmm1, %xmm0
4921; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4922; SSE-NEXT:    retq
4923;
4924; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
4925; AVX1:       # BB#0:
4926; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4927; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4928; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4929; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4930; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4931; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4932; AVX1-NEXT:    vzeroupper
4933; AVX1-NEXT:    retq
4934;
4935; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
4936; AVX2:       # BB#0:
4937; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4938; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4939; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4940; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4941; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4942; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4943; AVX2-NEXT:    vzeroupper
4944; AVX2-NEXT:    retq
4945;
4946; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
4947; AVX512F:       # BB#0:
4948; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
4949; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4950; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4951; AVX512F-NEXT:    retq
4952;
4953; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
4954; AVX512BW:       # BB#0:
4955; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
4956; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4957; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4958; AVX512BW-NEXT:    retq
4959;
4960; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
4961; AVX512DQ:       # BB#0:
4962; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
4963; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4964; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4965; AVX512DQ-NEXT:    retq
4966  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4967  %2 = trunc <16 x i16> %1 to <16 x i8>
4968  ret <16 x i8> %2
4969}
4970
4971;
4972; complex patterns - often created by the vectorizer
4973;
4974
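; mul_add_const and mul_add_self collapse the sext+mul+add+trunc chain to a 32-bit
; vpmulld (plus vpaddd) on the AVX targets, except AVX512DQ which uses its native 64-bit
; vpmullq; mul_add_multiuse keeps the wide multiply because the sign-extended value also
; feeds the add.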
4975define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4976; SSE-LABEL: mul_add_const_v4i64_v4i32:
4977; SSE:       # BB#0:
4978; SSE-NEXT:    movdqa %xmm0, %xmm2
4979; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
4980; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
4981; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
4982; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
4983; SSE-NEXT:    movdqa %xmm2, %xmm4
4984; SSE-NEXT:    psrlq $32, %xmm4
4985; SSE-NEXT:    pmuludq %xmm1, %xmm4
4986; SSE-NEXT:    movdqa %xmm1, %xmm5
4987; SSE-NEXT:    psrlq $32, %xmm5
4988; SSE-NEXT:    pmuludq %xmm2, %xmm5
4989; SSE-NEXT:    paddq %xmm4, %xmm5
4990; SSE-NEXT:    psllq $32, %xmm5
4991; SSE-NEXT:    pmuludq %xmm1, %xmm2
4992; SSE-NEXT:    paddq %xmm5, %xmm2
4993; SSE-NEXT:    movdqa %xmm0, %xmm1
4994; SSE-NEXT:    psrlq $32, %xmm1
4995; SSE-NEXT:    pmuludq %xmm3, %xmm1
4996; SSE-NEXT:    movdqa %xmm3, %xmm4
4997; SSE-NEXT:    psrlq $32, %xmm4
4998; SSE-NEXT:    pmuludq %xmm0, %xmm4
4999; SSE-NEXT:    paddq %xmm1, %xmm4
5000; SSE-NEXT:    psllq $32, %xmm4
5001; SSE-NEXT:    pmuludq %xmm3, %xmm0
5002; SSE-NEXT:    paddq %xmm4, %xmm0
5003; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
5004; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
5005; SSE-NEXT:    retq
5006;
5007; AVX1-LABEL: mul_add_const_v4i64_v4i32:
5008; AVX1:       # BB#0:
5009; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5010; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
5011; AVX1-NEXT:    retq
5012;
5013; AVX2-LABEL: mul_add_const_v4i64_v4i32:
5014; AVX2:       # BB#0:
5015; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5016; AVX2-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
5017; AVX2-NEXT:    retq
5018;
5019; AVX512F-LABEL: mul_add_const_v4i64_v4i32:
5020; AVX512F:       # BB#0:
5021; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5022; AVX512F-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
5023; AVX512F-NEXT:    retq
5024;
5025; AVX512BW-LABEL: mul_add_const_v4i64_v4i32:
5026; AVX512BW:       # BB#0:
5027; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5028; AVX512BW-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
5029; AVX512BW-NEXT:    retq
5030;
5031; AVX512DQ-LABEL: mul_add_const_v4i64_v4i32:
5032; AVX512DQ:       # BB#0:
5033; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5034; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
5035; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
5036; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
5037; AVX512DQ-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
5038; AVX512DQ-NEXT:    retq
5039  %1 = sext <4 x i32> %a0 to <4 x i64>
5040  %2 = sext <4 x i32> %a1 to <4 x i64>
5041  %3 = mul <4 x i64> %1, %2
5042  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
5043  %5 = trunc <4 x i64> %4 to <4 x i32>
5044  ret <4 x i32> %5
5045}
5046
5047define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5048; SSE-LABEL: mul_add_self_v4i64_v4i32:
5049; SSE:       # BB#0:
5050; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
5051; SSE-NEXT:    movdqa %xmm2, %xmm3
5052; SSE-NEXT:    psrad $31, %xmm3
5053; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
5054; SSE-NEXT:    movdqa %xmm0, %xmm3
5055; SSE-NEXT:    psrad $31, %xmm3
5056; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
5057; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5058; SSE-NEXT:    movdqa %xmm3, %xmm4
5059; SSE-NEXT:    psrad $31, %xmm4
5060; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
5061; SSE-NEXT:    movdqa %xmm1, %xmm4
5062; SSE-NEXT:    psrad $31, %xmm4
5063; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5064; SSE-NEXT:    movdqa %xmm0, %xmm4
5065; SSE-NEXT:    psrlq $32, %xmm4
5066; SSE-NEXT:    pmuludq %xmm1, %xmm4
5067; SSE-NEXT:    movdqa %xmm1, %xmm5
5068; SSE-NEXT:    psrlq $32, %xmm5
5069; SSE-NEXT:    pmuludq %xmm0, %xmm5
5070; SSE-NEXT:    paddq %xmm4, %xmm5
5071; SSE-NEXT:    psllq $32, %xmm5
5072; SSE-NEXT:    pmuludq %xmm0, %xmm1
5073; SSE-NEXT:    paddq %xmm5, %xmm1
5074; SSE-NEXT:    movdqa %xmm2, %xmm0
5075; SSE-NEXT:    psrlq $32, %xmm0
5076; SSE-NEXT:    pmuludq %xmm3, %xmm0
5077; SSE-NEXT:    movdqa %xmm3, %xmm4
5078; SSE-NEXT:    psrlq $32, %xmm4
5079; SSE-NEXT:    pmuludq %xmm2, %xmm4
5080; SSE-NEXT:    paddq %xmm0, %xmm4
5081; SSE-NEXT:    psllq $32, %xmm4
5082; SSE-NEXT:    pmuludq %xmm2, %xmm3
5083; SSE-NEXT:    paddq %xmm4, %xmm3
5084; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
5085; SSE-NEXT:    paddd %xmm1, %xmm1
5086; SSE-NEXT:    movdqa %xmm1, %xmm0
5087; SSE-NEXT:    retq
5088;
5089; AVX1-LABEL: mul_add_self_v4i64_v4i32:
5090; AVX1:       # BB#0:
5091; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5092; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
5093; AVX1-NEXT:    retq
5094;
5095; AVX2-LABEL: mul_add_self_v4i64_v4i32:
5096; AVX2:       # BB#0:
5097; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5098; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
5099; AVX2-NEXT:    retq
5100;
5101; AVX512F-LABEL: mul_add_self_v4i64_v4i32:
5102; AVX512F:       # BB#0:
5103; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5104; AVX512F-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
5105; AVX512F-NEXT:    retq
5106;
5107; AVX512BW-LABEL: mul_add_self_v4i64_v4i32:
5108; AVX512BW:       # BB#0:
5109; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5110; AVX512BW-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
5111; AVX512BW-NEXT:    retq
5112;
5113; AVX512DQ-LABEL: mul_add_self_v4i64_v4i32:
5114; AVX512DQ:       # BB#0:
5115; AVX512DQ-NEXT:    vpmovsxdq %xmm0, %ymm0
5116; AVX512DQ-NEXT:    vpmovsxdq %xmm1, %ymm1
5117; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
5118; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
5119; AVX512DQ-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
5120; AVX512DQ-NEXT:    retq
5121  %1 = sext <4 x i32> %a0 to <4 x i64>
5122  %2 = sext <4 x i32> %a1 to <4 x i64>
5123  %3 = mul <4 x i64> %1, %2
5124  %4 = add <4 x i64> %3, %3
5125  %5 = trunc <4 x i64> %4 to <4 x i32>
5126  ret <4 x i32> %5
5127}
5128
5129define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5130; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
5131; SSE:       # BB#0:
5132; SSE-NEXT:    movdqa %xmm0, %xmm2
5133; SSE-NEXT:    psrad $31, %xmm2
5134; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
5135; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
5136; SSE-NEXT:    movdqa %xmm3, %xmm2
5137; SSE-NEXT:    psrad $31, %xmm2
5138; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5139; SSE-NEXT:    movdqa %xmm1, %xmm2
5140; SSE-NEXT:    psrad $31, %xmm2
5141; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
5142; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
5143; SSE-NEXT:    movdqa %xmm4, %xmm2
5144; SSE-NEXT:    psrad $31, %xmm2
5145; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5146; SSE-NEXT:    movdqa %xmm3, %xmm2
5147; SSE-NEXT:    psrlq $32, %xmm2
5148; SSE-NEXT:    pmuludq %xmm4, %xmm2
5149; SSE-NEXT:    movdqa %xmm3, %xmm5
5150; SSE-NEXT:    pmuludq %xmm4, %xmm5
5151; SSE-NEXT:    psrlq $32, %xmm4
5152; SSE-NEXT:    pmuludq %xmm3, %xmm4
5153; SSE-NEXT:    paddq %xmm2, %xmm4
5154; SSE-NEXT:    psllq $32, %xmm4
5155; SSE-NEXT:    movdqa %xmm0, %xmm6
5156; SSE-NEXT:    psrlq $32, %xmm6
5157; SSE-NEXT:    pmuludq %xmm1, %xmm6
5158; SSE-NEXT:    movdqa %xmm0, %xmm2
5159; SSE-NEXT:    pmuludq %xmm1, %xmm2
5160; SSE-NEXT:    psrlq $32, %xmm1
5161; SSE-NEXT:    pmuludq %xmm0, %xmm1
5162; SSE-NEXT:    paddq %xmm6, %xmm1
5163; SSE-NEXT:    psllq $32, %xmm1
5164; SSE-NEXT:    paddq %xmm0, %xmm1
5165; SSE-NEXT:    paddq %xmm1, %xmm2
5166; SSE-NEXT:    paddq %xmm3, %xmm4
5167; SSE-NEXT:    paddq %xmm5, %xmm4
5168; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
5169; SSE-NEXT:    movaps %xmm2, %xmm0
5170; SSE-NEXT:    retq
5171;
5172; AVX1-LABEL: mul_add_multiuse_v4i64_v4i32:
5173; AVX1:       # BB#0:
5174; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
5175; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
5176; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
5177; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm3
5178; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
5179; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
5180; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm1
5181; AVX1-NEXT:    vpmuldq %xmm3, %xmm2, %xmm3
5182; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
5183; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
5184; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[0,2]
5185; AVX1-NEXT:    retq
5186;
5187; AVX2-LABEL: mul_add_multiuse_v4i64_v4i32:
5188; AVX2:       # BB#0:
5189; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
5190; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
5191; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
5192; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
5193; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
5194; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5195; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5196; AVX2-NEXT:    vzeroupper
5197; AVX2-NEXT:    retq
5198;
5199; AVX512F-LABEL: mul_add_multiuse_v4i64_v4i32:
5200; AVX512F:       # BB#0:
5201; AVX512F-NEXT:    vpmovsxdq %xmm0, %ymm0
5202; AVX512F-NEXT:    vpmovsxdq %xmm1, %ymm1
5203; AVX512F-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
5204; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
5205; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
5206; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5207; AVX512F-NEXT:    retq
5208;
5209; AVX512BW-LABEL: mul_add_multiuse_v4i64_v4i32:
5210; AVX512BW:       # BB#0:
5211; AVX512BW-NEXT:    vpmovsxdq %xmm0, %ymm0
5212; AVX512BW-NEXT:    vpmovsxdq %xmm1, %ymm1
5213; AVX512BW-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
5214; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
5215; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
5216; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5217; AVX512BW-NEXT:    retq
5218;
5219; AVX512DQ-LABEL: mul_add_multiuse_v4i64_v4i32:
5220; AVX512DQ:       # BB#0:
5221; AVX512DQ-NEXT:    vpmovsxdq %xmm0, %ymm0
5222; AVX512DQ-NEXT:    vpmovsxdq %xmm1, %ymm1
5223; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm1
5224; AVX512DQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
5225; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
5226; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5227; AVX512DQ-NEXT:    retq
5228  %1 = sext <4 x i32> %a0 to <4 x i64>
5229  %2 = sext <4 x i32> %a1 to <4 x i64>
5230  %3 = mul <4 x i64> %1, %2
5231  %4 = add <4 x i64> %1, %3
5232  %5 = trunc <4 x i64> %4 to <4 x i32>
5233  ret <4 x i32> %5
5234}
5235