; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm4, %xmm0
; SSE-NEXT:    paddd %xmm5, %xmm1
; SSE-NEXT:    paddd %xmm6, %xmm2
; SSE-NEXT:    paddd %xmm7, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm2
; SSE-NEXT:    psrad $16, %xmm2
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    packssdw %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = add <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq %xmm3, %xmm1
; SSE-NEXT:    psubq %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq %xmm6, %xmm2
; SSE-NEXT:    psubq %xmm7, %xmm3
; SSE-NEXT:    psubq %xmm4, %xmm0
; SSE-NEXT:    psubq %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    psubd %xmm3, %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm4, %xmm0
; SSE-NEXT:    psubd %xmm5, %xmm1
; SSE-NEXT:    psubd %xmm6, %xmm2
; SSE-NEXT:    psubd %xmm7, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    psubw %xmm3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = sub <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = zext <16 x i8> %x to <16 x i16>
  %b = zext <16 x i8> %y to <16 x i16>
  %c = sub <16 x i16> %a, %b
  %d = trunc <16 x i16> %c to <16 x i8>
  ret <16 x i8> %d
}

;
; sub to constant
;

define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}
1318
1319define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1320; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1321; SSE:       # %bb.0:
1322; SSE-NEXT:    pslld $16, %xmm1
1323; SSE-NEXT:    psrad $16, %xmm1
1324; SSE-NEXT:    pslld $16, %xmm0
1325; SSE-NEXT:    psrad $16, %xmm0
1326; SSE-NEXT:    packssdw %xmm1, %xmm0
1327; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1328; SSE-NEXT:    retq
1329;
1330; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1331; AVX1:       # %bb.0:
1332; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1333; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1334; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1335; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1336; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1337; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1338; AVX1-NEXT:    vzeroupper
1339; AVX1-NEXT:    retq
1340;
1341; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1342; AVX2:       # %bb.0:
1343; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1344; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1345; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1346; AVX2-NEXT:    vzeroupper
1347; AVX2-NEXT:    retq
1348;
1349; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1350; AVX512:       # %bb.0:
1351; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1352; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1353; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1354; AVX512-NEXT:    vzeroupper
1355; AVX512-NEXT:    retq
1356  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1357  %2 = trunc <8 x i32> %1 to <8 x i16>
1358  ret <8 x i16> %2
1359}
1360
1361define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1362; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1363; SSE:       # %bb.0:
1364; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1365; SSE-NEXT:    pand %xmm8, %xmm7
1366; SSE-NEXT:    pand %xmm8, %xmm6
1367; SSE-NEXT:    packuswb %xmm7, %xmm6
1368; SSE-NEXT:    pand %xmm8, %xmm5
1369; SSE-NEXT:    pand %xmm8, %xmm4
1370; SSE-NEXT:    packuswb %xmm5, %xmm4
1371; SSE-NEXT:    packuswb %xmm6, %xmm4
1372; SSE-NEXT:    pand %xmm8, %xmm3
1373; SSE-NEXT:    pand %xmm8, %xmm2
1374; SSE-NEXT:    packuswb %xmm3, %xmm2
1375; SSE-NEXT:    pand %xmm8, %xmm1
1376; SSE-NEXT:    pand %xmm8, %xmm0
1377; SSE-NEXT:    packuswb %xmm1, %xmm0
1378; SSE-NEXT:    packuswb %xmm2, %xmm0
1379; SSE-NEXT:    packuswb %xmm4, %xmm0
1380; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1381; SSE-NEXT:    retq
1382;
1383; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1384; AVX1:       # %bb.0:
1385; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
1386; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1387; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1388; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1389; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1390; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1391; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1392; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1393; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1394; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1395; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1396; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1397; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1398; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1399; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1400; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1401; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1402; AVX1-NEXT:    vzeroupper
1403; AVX1-NEXT:    retq
1404;
1405; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
1406; AVX2:       # %bb.0:
1407; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1408; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1409; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1410; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1411; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1412; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1413; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1414; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1415; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1416; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1417; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1418; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1419; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1420; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1421; AVX2-NEXT:    vzeroupper
1422; AVX2-NEXT:    retq
1423;
1424; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1425; AVX512:       # %bb.0:
1426; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
1427; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
1428; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1429; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1430; AVX512-NEXT:    vzeroupper
1431; AVX512-NEXT:    retq
1432  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1433  %2 = trunc <16 x i64> %1 to <16 x i8>
1434  ret <16 x i8> %2
1435}
1436
1437define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1438; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1439; SSE:       # %bb.0:
1440; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1441; SSE-NEXT:    pand %xmm4, %xmm3
1442; SSE-NEXT:    pand %xmm4, %xmm2
1443; SSE-NEXT:    packuswb %xmm3, %xmm2
1444; SSE-NEXT:    pand %xmm4, %xmm1
1445; SSE-NEXT:    pand %xmm4, %xmm0
1446; SSE-NEXT:    packuswb %xmm1, %xmm0
1447; SSE-NEXT:    packuswb %xmm2, %xmm0
1448; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1449; SSE-NEXT:    retq
1450;
1451; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1452; AVX1:       # %bb.0:
1453; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1454; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1455; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1456; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1457; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1458; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1459; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1460; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1461; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1462; AVX1-NEXT:    vzeroupper
1463; AVX1-NEXT:    retq
1464;
1465; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1466; AVX2:       # %bb.0:
1467; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1468; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1469; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1470; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1471; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1472; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1473; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1474; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1475; AVX2-NEXT:    vzeroupper
1476; AVX2-NEXT:    retq
1477;
1478; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1479; AVX512:       # %bb.0:
1480; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1481; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1482; AVX512-NEXT:    vzeroupper
1483; AVX512-NEXT:    retq
1484  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1485  %2 = trunc <16 x i32> %1 to <16 x i8>
1486  ret <16 x i8> %2
1487}
1488
1489define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1490; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1491; SSE:       # %bb.0:
1492; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1493; SSE-NEXT:    pand %xmm2, %xmm1
1494; SSE-NEXT:    pand %xmm2, %xmm0
1495; SSE-NEXT:    packuswb %xmm1, %xmm0
1496; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1497; SSE-NEXT:    retq
1498;
1499; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1500; AVX1:       # %bb.0:
1501; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1502; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1503; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1504; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1505; AVX1-NEXT:    vzeroupper
1506; AVX1-NEXT:    retq
1507;
1508; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1509; AVX2:       # %bb.0:
1510; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1511; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1512; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1513; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1514; AVX2-NEXT:    vzeroupper
1515; AVX2-NEXT:    retq
1516;
1517; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1518; AVX512F:       # %bb.0:
1519; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1520; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1521; AVX512F-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1522; AVX512F-NEXT:    vzeroupper
1523; AVX512F-NEXT:    retq
1524;
1525; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1526; AVX512BW:       # %bb.0:
1527; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1528; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1529; AVX512BW-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1530; AVX512BW-NEXT:    vzeroupper
1531; AVX512BW-NEXT:    retq
1532;
1533; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1534; AVX512DQ:       # %bb.0:
1535; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1536; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1537; AVX512DQ-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1538; AVX512DQ-NEXT:    vzeroupper
1539; AVX512DQ-NEXT:    retq
1540  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1541  %2 = trunc <16 x i16> %1 to <16 x i8>
1542  ret <16 x i8> %2
1543}
1544
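; The next two tests zero-extend the input, subtract a constant and truncate straight
; back to i8, so the whole sequence should collapse to a single byte subtract; the
; constant-LHS variant still has to materialize the constant vector in a register.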
1545define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1546; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1547; SSE:       # %bb.0:
1548; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1549; SSE-NEXT:    retq
1550;
1551; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1552; AVX:       # %bb.0:
1553; AVX-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1554; AVX-NEXT:    retq
1555  %a = zext <16 x i8> %x to <16 x i16>
1556  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1557  %c = trunc <16 x i16> %b to <16 x i8>
1558  ret <16 x i8> %c
1559}
1560
1561define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1562; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1563; SSE:       # %bb.0:
1564; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1565; SSE-NEXT:    psubb %xmm0, %xmm1
1566; SSE-NEXT:    movdqa %xmm1, %xmm0
1567; SSE-NEXT:    retq
1568;
1569; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1570; AVX:       # %bb.0:
1571; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1572; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
1573; AVX-NEXT:    retq
1574  %a = zext <16 x i8> %x to <16 x i16>
1575  %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1576  %c = trunc <16 x i16> %b to <16 x i8>
1577  ret <16 x i8> %c
1578}
1579
1580;
1581; mul
1582;
1583
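; The multiply tests check how the multiply interacts with the truncation - whether it
; is performed on the narrowed lanes, replaced by a cheaper low-half multiply such as
; pmuludq, or kept as a full 64-bit vpmullq when AVX512DQ is available.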
1584define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1585; SSE-LABEL: trunc_mul_v4i64_v4i32:
1586; SSE:       # %bb.0:
1587; SSE-NEXT:    pmuludq %xmm3, %xmm1
1588; SSE-NEXT:    pmuludq %xmm2, %xmm0
1589; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1590; SSE-NEXT:    retq
1591;
1592; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1593; AVX1:       # %bb.0:
1594; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1595; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1596; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1597; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1598; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1599; AVX1-NEXT:    vzeroupper
1600; AVX1-NEXT:    retq
1601;
1602; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1603; AVX2-SLOW:       # %bb.0:
1604; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1605; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1606; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
1607; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1608; AVX2-SLOW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1609; AVX2-SLOW-NEXT:    vzeroupper
1610; AVX2-SLOW-NEXT:    retq
1611;
1612; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
1613; AVX2-FAST-ALL:       # %bb.0:
1614; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1615; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
1616; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm2, %ymm0
1617; AVX2-FAST-ALL-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1618; AVX2-FAST-ALL-NEXT:    vzeroupper
1619; AVX2-FAST-ALL-NEXT:    retq
1620;
1621; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
1622; AVX2-FAST-PERLANE:       # %bb.0:
1623; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
1624; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1625; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm2
1626; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1627; AVX2-FAST-PERLANE-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1628; AVX2-FAST-PERLANE-NEXT:    vzeroupper
1629; AVX2-FAST-PERLANE-NEXT:    retq
1630;
1631; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1632; AVX512F:       # %bb.0:
1633; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1634; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1635; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
1636; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1637; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1638; AVX512F-NEXT:    vzeroupper
1639; AVX512F-NEXT:    retq
1640;
1641; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1642; AVX512BW:       # %bb.0:
1643; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1644; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1645; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
1646; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1647; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1648; AVX512BW-NEXT:    vzeroupper
1649; AVX512BW-NEXT:    retq
1650;
1651; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1652; AVX512DQ:       # %bb.0:
1653; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1654; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1655; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1656; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
1657; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1658; AVX512DQ-NEXT:    vzeroupper
1659; AVX512DQ-NEXT:    retq
1660  %1 = mul <4 x i64> %a0, %a1
1661  %2 = trunc <4 x i64> %1 to <4 x i32>
1662  ret <4 x i32> %2
1663}
1664
1665define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1666; SSE-LABEL: trunc_mul_v8i64_v8i16:
1667; SSE:       # %bb.0:
1668; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1669; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1670; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1671; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1672; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1673; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1674; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
1675; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1676; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1677; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1678; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
1679; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1680; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1681; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1682; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1683; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1684; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1685; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1686; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1687; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1688; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1689; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1690; SSE-NEXT:    pmullw %xmm6, %xmm0
1691; SSE-NEXT:    retq
1692;
1693; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1694; AVX1:       # %bb.0:
1695; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
1696; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1697; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1698; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1699; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1700; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1701; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1702; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1703; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1704; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1705; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1706; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1707; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1708; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1709; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1710; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1711; AVX1-NEXT:    vzeroupper
1712; AVX1-NEXT:    retq
1713;
1714; AVX2-LABEL: trunc_mul_v8i64_v8i16:
1715; AVX2:       # %bb.0:
1716; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1717; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15]
1718; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15]
1719; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1720; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
1721; AVX2-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1722; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15]
1723; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15]
1724; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1725; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1726; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1727; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1728; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1729; AVX2-NEXT:    vzeroupper
1730; AVX2-NEXT:    retq
1731;
1732; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1733; AVX512F:       # %bb.0:
1734; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
1735; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
1736; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1737; AVX512F-NEXT:    vzeroupper
1738; AVX512F-NEXT:    retq
1739;
1740; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1741; AVX512BW:       # %bb.0:
1742; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
1743; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
1744; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1745; AVX512BW-NEXT:    vzeroupper
1746; AVX512BW-NEXT:    retq
1747;
1748; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1749; AVX512DQ:       # %bb.0:
1750; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1751; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
1752; AVX512DQ-NEXT:    vzeroupper
1753; AVX512DQ-NEXT:    retq
1754  %1 = mul <8 x i64> %a0, %a1
1755  %2 = trunc <8 x i64> %1 to <8 x i16>
1756  ret <8 x i16> %2
1757}
1758
1759define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1760; SSE-LABEL: trunc_mul_v8i32_v8i16:
1761; SSE:       # %bb.0:
1762; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1763; SSE-NEXT:    pmuludq %xmm2, %xmm0
1764; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1765; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1766; SSE-NEXT:    pmuludq %xmm4, %xmm2
1767; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1768; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1769; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1770; SSE-NEXT:    pmuludq %xmm3, %xmm1
1771; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1772; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1773; SSE-NEXT:    pmuludq %xmm2, %xmm3
1774; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1775; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1776; SSE-NEXT:    pslld $16, %xmm1
1777; SSE-NEXT:    psrad $16, %xmm1
1778; SSE-NEXT:    pslld $16, %xmm0
1779; SSE-NEXT:    psrad $16, %xmm0
1780; SSE-NEXT:    packssdw %xmm1, %xmm0
1781; SSE-NEXT:    retq
1782;
1783; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1784; AVX1:       # %bb.0:
1785; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
1786; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1787; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1788; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1789; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1790; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1791; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
1792; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1793; AVX1-NEXT:    vzeroupper
1794; AVX1-NEXT:    retq
1795;
1796; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1797; AVX2:       # %bb.0:
1798; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1799; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1800; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1801; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1802; AVX2-NEXT:    vzeroupper
1803; AVX2-NEXT:    retq
1804;
1805; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1806; AVX512:       # %bb.0:
1807; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1808; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1809; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1810; AVX512-NEXT:    vzeroupper
1811; AVX512-NEXT:    retq
1812  %1 = mul <8 x i32> %a0, %a1
1813  %2 = trunc <8 x i32> %1 to <8 x i16>
1814  ret <8 x i16> %2
1815}
1816
1817define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1818; SSE-LABEL: trunc_mul_v16i64_v16i8:
1819; SSE:       # %bb.0:
1820; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm0
1821; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm1
1822; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm2
1823; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm3
1824; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm4
1825; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm5
1826; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm6
1827; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm7
1828; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1829; SSE-NEXT:    pand %xmm8, %xmm7
1830; SSE-NEXT:    pand %xmm8, %xmm6
1831; SSE-NEXT:    packuswb %xmm7, %xmm6
1832; SSE-NEXT:    pand %xmm8, %xmm5
1833; SSE-NEXT:    pand %xmm8, %xmm4
1834; SSE-NEXT:    packuswb %xmm5, %xmm4
1835; SSE-NEXT:    packuswb %xmm6, %xmm4
1836; SSE-NEXT:    pand %xmm8, %xmm3
1837; SSE-NEXT:    pand %xmm8, %xmm2
1838; SSE-NEXT:    packuswb %xmm3, %xmm2
1839; SSE-NEXT:    pand %xmm8, %xmm1
1840; SSE-NEXT:    pand %xmm8, %xmm0
1841; SSE-NEXT:    packuswb %xmm1, %xmm0
1842; SSE-NEXT:    packuswb %xmm2, %xmm0
1843; SSE-NEXT:    packuswb %xmm4, %xmm0
1844; SSE-NEXT:    retq
1845;
1846; AVX1-LABEL: trunc_mul_v16i64_v16i8:
1847; AVX1:       # %bb.0:
1848; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm8
1849; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
1850; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1851; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm0
1852; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
1853; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
1854; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1855; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
1856; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm5
1857; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
1858; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1859; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
1860; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm6
1861; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
1862; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1863; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm3
1864; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
1865; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
1866; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
1867; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
1868; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1869; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1870; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
1871; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1872; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
1873; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
1874; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
1875; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
1876; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
1877; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
1878; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1879; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1880; AVX1-NEXT:    vzeroupper
1881; AVX1-NEXT:    retq
1882;
1883; AVX2-LABEL: trunc_mul_v16i64_v16i8:
1884; AVX2:       # %bb.0:
1885; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm0
1886; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm1
1887; AVX2-NEXT:    vpmuludq %ymm6, %ymm2, %ymm2
1888; AVX2-NEXT:    vpmuludq %ymm7, %ymm3, %ymm3
1889; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1890; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1891; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1892; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1893; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1894; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1895; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1896; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1897; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1898; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1899; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1900; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1901; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1902; AVX2-NEXT:    vzeroupper
1903; AVX2-NEXT:    retq
1904;
1905; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
1906; AVX512F:       # %bb.0:
1907; AVX512F-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
1908; AVX512F-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
1909; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
1910; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
1911; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1912; AVX512F-NEXT:    vzeroupper
1913; AVX512F-NEXT:    retq
1914;
1915; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
1916; AVX512BW:       # %bb.0:
1917; AVX512BW-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
1918; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
1919; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
1920; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
1921; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1922; AVX512BW-NEXT:    vzeroupper
1923; AVX512BW-NEXT:    retq
1924;
1925; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
1926; AVX512DQ:       # %bb.0:
1927; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
1928; AVX512DQ-NEXT:    vpmullq %zmm3, %zmm1, %zmm1
1929; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
1930; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
1931; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1932; AVX512DQ-NEXT:    vzeroupper
1933; AVX512DQ-NEXT:    retq
1934  %1 = mul <16 x i64> %a0, %a1
1935  %2 = trunc <16 x i64> %1 to <16 x i8>
1936  ret <16 x i8> %2
1937}
1938
1939define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1940; SSE-LABEL: trunc_mul_v16i32_v16i8:
1941; SSE:       # %bb.0:
1942; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
1943; SSE-NEXT:    pmuludq %xmm4, %xmm0
1944; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1945; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1946; SSE-NEXT:    pmuludq %xmm8, %xmm4
1947; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1948; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1949; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1950; SSE-NEXT:    pmuludq %xmm5, %xmm1
1951; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1952; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1953; SSE-NEXT:    pmuludq %xmm4, %xmm5
1954; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1955; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1956; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
1957; SSE-NEXT:    pmuludq %xmm6, %xmm2
1958; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1959; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1960; SSE-NEXT:    pmuludq %xmm4, %xmm5
1961; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1962; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1963; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1964; SSE-NEXT:    pmuludq %xmm7, %xmm3
1965; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1966; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1967; SSE-NEXT:    pmuludq %xmm4, %xmm5
1968; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1969; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1970; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1971; SSE-NEXT:    pand %xmm4, %xmm3
1972; SSE-NEXT:    pand %xmm4, %xmm2
1973; SSE-NEXT:    packuswb %xmm3, %xmm2
1974; SSE-NEXT:    pand %xmm4, %xmm1
1975; SSE-NEXT:    pand %xmm4, %xmm0
1976; SSE-NEXT:    packuswb %xmm1, %xmm0
1977; SSE-NEXT:    packuswb %xmm2, %xmm0
1978; SSE-NEXT:    retq
1979;
1980; AVX1-LABEL: trunc_mul_v16i32_v16i8:
1981; AVX1:       # %bb.0:
1982; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
1983; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1984; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1985; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
1986; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
1987; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1988; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1989; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
1990; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1991; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1992; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1993; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
1994; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1995; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1996; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1997; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1998; AVX1-NEXT:    vzeroupper
1999; AVX1-NEXT:    retq
2000;
2001; AVX2-LABEL: trunc_mul_v16i32_v16i8:
2002; AVX2:       # %bb.0:
2003; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
2004; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
2005; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2006; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2007; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2008; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2009; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2010; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2011; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2012; AVX2-NEXT:    vzeroupper
2013; AVX2-NEXT:    retq
2014;
2015; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2016; AVX512:       # %bb.0:
2017; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
2018; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2019; AVX512-NEXT:    vzeroupper
2020; AVX512-NEXT:    retq
2021  %1 = mul <16 x i32> %a0, %a1
2022  %2 = trunc <16 x i32> %1 to <16 x i8>
2023  ret <16 x i8> %2
2024}
2025
2026define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2027; SSE-LABEL: trunc_mul_v16i16_v16i8:
2028; SSE:       # %bb.0:
2029; SSE-NEXT:    pmullw %xmm2, %xmm0
2030; SSE-NEXT:    pmullw %xmm3, %xmm1
2031; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2032; SSE-NEXT:    pand %xmm2, %xmm1
2033; SSE-NEXT:    pand %xmm2, %xmm0
2034; SSE-NEXT:    packuswb %xmm1, %xmm0
2035; SSE-NEXT:    retq
2036;
2037; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2038; AVX1:       # %bb.0:
2039; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2040; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2041; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2042; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2043; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2044; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2045; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
2046; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2047; AVX1-NEXT:    vzeroupper
2048; AVX1-NEXT:    retq
2049;
2050; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2051; AVX2:       # %bb.0:
2052; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2053; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2054; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2055; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2056; AVX2-NEXT:    vzeroupper
2057; AVX2-NEXT:    retq
2058;
2059; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2060; AVX512F:       # %bb.0:
2061; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2062; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2063; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2064; AVX512F-NEXT:    vzeroupper
2065; AVX512F-NEXT:    retq
2066;
2067; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2068; AVX512BW:       # %bb.0:
2069; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2070; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2071; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2072; AVX512BW-NEXT:    vzeroupper
2073; AVX512BW-NEXT:    retq
2074;
2075; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2076; AVX512DQ:       # %bb.0:
2077; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2078; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2079; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2080; AVX512DQ-NEXT:    vzeroupper
2081; AVX512DQ-NEXT:    retq
2082  %1 = mul <16 x i16> %a0, %a1
2083  %2 = trunc <16 x i16> %1 to <16 x i8>
2084  ret <16 x i8> %2
2085}
2086
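; Here the multiplicand is only 8 bits wide before the zero-extend, so the i32 multiply
; followed by the truncate to i16 should become a 16-bit multiply of the extended bytes
; against the truncated second operand.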
2087define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2088; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2089; SSE:       # %bb.0:
2090; SSE-NEXT:    pxor %xmm3, %xmm3
2091; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2092; SSE-NEXT:    pslld $16, %xmm2
2093; SSE-NEXT:    psrad $16, %xmm2
2094; SSE-NEXT:    pslld $16, %xmm1
2095; SSE-NEXT:    psrad $16, %xmm1
2096; SSE-NEXT:    packssdw %xmm2, %xmm1
2097; SSE-NEXT:    pmullw %xmm1, %xmm0
2098; SSE-NEXT:    retq
2099;
2100; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2101; AVX1:       # %bb.0:
2102; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2103; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2104; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2105; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2106; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2107; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2108; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2109; AVX1-NEXT:    vzeroupper
2110; AVX1-NEXT:    retq
2111;
2112; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2113; AVX2:       # %bb.0:
2114; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2115; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2116; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2117; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2118; AVX2-NEXT:    vzeroupper
2119; AVX2-NEXT:    retq
2120;
2121; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2122; AVX512:       # %bb.0:
2123; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2124; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
2125; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2126; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2127; AVX512-NEXT:    vzeroupper
2128; AVX512-NEXT:    retq
2129  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2130  %2 = zext <8 x i8> %1 to <8 x i32>
2131  %3 = mul <8 x i32> %2, %a1
2132  %4 = trunc <8 x i32> %3 to <8 x i16>
2133  ret <8 x i16> %4
2134}
2135
2136;
2137; mul to constant
2138;
2139
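; For multiplies by constant vectors the constant-pool operand should be folded
; directly into the multiply instruction rather than loaded separately.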
2140define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2141; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2142; SSE:       # %bb.0:
2143; SSE-NEXT:    xorps %xmm2, %xmm2
2144; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2145; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2146; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2147; SSE-NEXT:    movaps %xmm2, %xmm0
2148; SSE-NEXT:    retq
2149;
2150; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2151; AVX1:       # %bb.0:
2152; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2153; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2154; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2155; AVX1-NEXT:    vzeroupper
2156; AVX1-NEXT:    retq
2157;
2158; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2159; AVX2-SLOW:       # %bb.0:
2160; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2161; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2162; AVX2-SLOW-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2163; AVX2-SLOW-NEXT:    vzeroupper
2164; AVX2-SLOW-NEXT:    retq
2165;
2166; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
2167; AVX2-FAST-ALL:       # %bb.0:
2168; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2169; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2170; AVX2-FAST-ALL-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2171; AVX2-FAST-ALL-NEXT:    vzeroupper
2172; AVX2-FAST-ALL-NEXT:    retq
2173;
2174; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32:
2175; AVX2-FAST-PERLANE:       # %bb.0:
2176; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2177; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2178; AVX2-FAST-PERLANE-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2179; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2180; AVX2-FAST-PERLANE-NEXT:    retq
2181;
2182; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2183; AVX512:       # %bb.0:
2184; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2185; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2186; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2187; AVX512-NEXT:    vzeroupper
2188; AVX512-NEXT:    retq
2189  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2190  %2 = trunc <4 x i64> %1 to <4 x i32>
2191  ret <4 x i32> %2
2192}
2193
2194define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2195; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2196; SSE:       # %bb.0:
2197; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2198; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2199; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2200; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2201; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2202; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2203; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2204; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2205; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2206; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2207; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2208; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2209; SSE-NEXT:    retq
2210;
2211; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2212; AVX1:       # %bb.0:
2213; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2214; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
2215; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2216; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2217; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2218; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2219; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2220; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2221; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2222; AVX1-NEXT:    vzeroupper
2223; AVX1-NEXT:    retq
2224;
2225; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
2226; AVX2:       # %bb.0:
2227; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2228; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2229; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2230; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2231; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2232; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2233; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2234; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2235; AVX2-NEXT:    vzeroupper
2236; AVX2-NEXT:    retq
2237;
2238; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2239; AVX512:       # %bb.0:
2240; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2241; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2242; AVX512-NEXT:    vzeroupper
2243; AVX512-NEXT:    retq
2244  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2245  %2 = trunc <8 x i64> %1 to <8 x i16>
2246  ret <8 x i16> %2
2247}
2248
2249define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2250; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2251; SSE:       # %bb.0:
2252; SSE-NEXT:    pslld $16, %xmm1
2253; SSE-NEXT:    psrad $16, %xmm1
2254; SSE-NEXT:    pslld $16, %xmm0
2255; SSE-NEXT:    psrad $16, %xmm0
2256; SSE-NEXT:    packssdw %xmm1, %xmm0
2257; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2258; SSE-NEXT:    retq
2259;
2260; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2261; AVX1:       # %bb.0:
2262; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2263; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2264; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2265; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2266; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2267; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2268; AVX1-NEXT:    vzeroupper
2269; AVX1-NEXT:    retq
2270;
2271; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2272; AVX2:       # %bb.0:
2273; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2274; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2275; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2276; AVX2-NEXT:    vzeroupper
2277; AVX2-NEXT:    retq
2278;
2279; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2280; AVX512:       # %bb.0:
2281; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2282; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2283; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2284; AVX512-NEXT:    vzeroupper
2285; AVX512-NEXT:    retq
2286  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2287  %2 = trunc <8 x i32> %1 to <8 x i16>
2288  ret <8 x i16> %2
2289}
2290
2291define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2292; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2293; SSE:       # %bb.0:
2294; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2295; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2296; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2297; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2298; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
2299; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
2300; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
2301; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2302; SSE-NEXT:    pand %xmm8, %xmm7
2303; SSE-NEXT:    pand %xmm8, %xmm6
2304; SSE-NEXT:    packuswb %xmm7, %xmm6
2305; SSE-NEXT:    pand %xmm8, %xmm5
2306; SSE-NEXT:    pand %xmm8, %xmm4
2307; SSE-NEXT:    packuswb %xmm5, %xmm4
2308; SSE-NEXT:    packuswb %xmm6, %xmm4
2309; SSE-NEXT:    pand %xmm8, %xmm3
2310; SSE-NEXT:    pand %xmm8, %xmm2
2311; SSE-NEXT:    packuswb %xmm3, %xmm2
2312; SSE-NEXT:    pand %xmm8, %xmm1
2313; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2314; SSE-NEXT:    packuswb %xmm1, %xmm0
2315; SSE-NEXT:    packuswb %xmm2, %xmm0
2316; SSE-NEXT:    packuswb %xmm4, %xmm0
2317; SSE-NEXT:    retq
2318;
2319; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2320; AVX1:       # %bb.0:
2321; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8
2322; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2323; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2324; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
2325; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2326; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2327; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
2328; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2329; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2330; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
2331; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2332; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
2333; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
2334; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2335; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
2336; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
2337; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2338; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
2339; AVX1-NEXT:    vpackusdw %xmm2, %xmm6, %xmm2
2340; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2341; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2342; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
2343; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2344; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2345; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
2346; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
2347; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2348; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2349; AVX1-NEXT:    vzeroupper
2350; AVX1-NEXT:    retq
2351;
2352; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
2353; AVX2:       # %bb.0:
2354; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2355; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2356; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2357; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2358; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
2359; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
2360; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
2361; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
2362; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2363; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
2364; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
2365; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2366; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2367; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
2368; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2369; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2370; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2371; AVX2-NEXT:    vzeroupper
2372; AVX2-NEXT:    retq
2373;
2374; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
2375; AVX512F:       # %bb.0:
2376; AVX512F-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2377; AVX512F-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2378; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
2379; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
2380; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2381; AVX512F-NEXT:    vzeroupper
2382; AVX512F-NEXT:    retq
2383;
2384; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
2385; AVX512BW:       # %bb.0:
2386; AVX512BW-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2387; AVX512BW-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2388; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
2389; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
2390; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2391; AVX512BW-NEXT:    vzeroupper
2392; AVX512BW-NEXT:    retq
2393;
2394; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
2395; AVX512DQ:       # %bb.0:
2396; AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2397; AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2398; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
2399; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
2400; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2401; AVX512DQ-NEXT:    vzeroupper
2402; AVX512DQ-NEXT:    retq
2403  %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2404  %2 = trunc <16 x i64> %1 to <16 x i8>
2405  ret <16 x i8> %2
2406}
2407
2408define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2409; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2410; SSE:       # %bb.0:
2411; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
2412; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2413; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2414; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2415; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2416; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2417; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2418; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2419; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2420; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2421; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2422; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2423; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2424; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2425; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2426; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2427; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2428; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2429; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2430; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2431; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2432; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2433; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2434; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2435; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2436; SSE-NEXT:    pand %xmm4, %xmm3
2437; SSE-NEXT:    pand %xmm4, %xmm2
2438; SSE-NEXT:    packuswb %xmm3, %xmm2
2439; SSE-NEXT:    pand %xmm4, %xmm1
2440; SSE-NEXT:    pand %xmm4, %xmm0
2441; SSE-NEXT:    packuswb %xmm1, %xmm0
2442; SSE-NEXT:    packuswb %xmm2, %xmm0
2443; SSE-NEXT:    retq
2444;
2445; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2446; AVX1:       # %bb.0:
2447; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2448; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2449; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2450; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
2451; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2452; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2453; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
2454; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2455; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2456; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2457; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2458; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2459; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
2460; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2461; AVX1-NEXT:    vzeroupper
2462; AVX1-NEXT:    retq
2463;
2464; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2465; AVX2:       # %bb.0:
2466; AVX2-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2467; AVX2-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2468; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2469; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2470; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2471; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2472; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2473; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2474; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2475; AVX2-NEXT:    vzeroupper
2476; AVX2-NEXT:    retq
2477;
2478; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2479; AVX512:       # %bb.0:
2480; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2481; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2482; AVX512-NEXT:    vzeroupper
2483; AVX512-NEXT:    retq
2484  %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2485  %2 = trunc <16 x i32> %1 to <16 x i8>
2486  ret <16 x i8> %2
2487}
2488
2489define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2490; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2491; SSE:       # %bb.0:
2492; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2493; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2494; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2495; SSE-NEXT:    pand %xmm2, %xmm1
2496; SSE-NEXT:    pand %xmm2, %xmm0
2497; SSE-NEXT:    packuswb %xmm1, %xmm0
2498; SSE-NEXT:    retq
2499;
2500; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2501; AVX1:       # %bb.0:
2502; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2503; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2504; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2505; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2506; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2507; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
2508; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2509; AVX1-NEXT:    vzeroupper
2510; AVX1-NEXT:    retq
2511;
2512; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2513; AVX2:       # %bb.0:
2514; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2515; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2516; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2517; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2518; AVX2-NEXT:    vzeroupper
2519; AVX2-NEXT:    retq
2520;
2521; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2522; AVX512F:       # %bb.0:
2523; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2524; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2525; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2526; AVX512F-NEXT:    vzeroupper
2527; AVX512F-NEXT:    retq
2528;
2529; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2530; AVX512BW:       # %bb.0:
2531; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2532; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2533; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2534; AVX512BW-NEXT:    vzeroupper
2535; AVX512BW-NEXT:    retq
2536;
2537; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2538; AVX512DQ:       # %bb.0:
2539; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2540; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2541; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2542; AVX512DQ-NEXT:    vzeroupper
2543; AVX512DQ-NEXT:    retq
2544  %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2545  %2 = trunc <16 x i16> %1 to <16 x i8>
2546  ret <16 x i8> %2
2547}
2548
2549;
2550; and
2551;
2552
2553define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2554; SSE-LABEL: trunc_and_v4i64_v4i32:
2555; SSE:       # %bb.0:
2556; SSE-NEXT:    andps %xmm3, %xmm1
2557; SSE-NEXT:    andps %xmm2, %xmm0
2558; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2559; SSE-NEXT:    retq
2560;
2561; AVX1-LABEL: trunc_and_v4i64_v4i32:
2562; AVX1:       # %bb.0:
2563; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2564; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2565; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2566; AVX1-NEXT:    vzeroupper
2567; AVX1-NEXT:    retq
2568;
2569; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2570; AVX2-SLOW:       # %bb.0:
2571; AVX2-SLOW-NEXT:    vandps %ymm1, %ymm0, %ymm0
2572; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2573; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2574; AVX2-SLOW-NEXT:    vzeroupper
2575; AVX2-SLOW-NEXT:    retq
2576;
2577; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
2578; AVX2-FAST-ALL:       # %bb.0:
2579; AVX2-FAST-ALL-NEXT:    vandps %ymm1, %ymm0, %ymm0
2580; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2581; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2582; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2583; AVX2-FAST-ALL-NEXT:    vzeroupper
2584; AVX2-FAST-ALL-NEXT:    retq
2585;
2586; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32:
2587; AVX2-FAST-PERLANE:       # %bb.0:
2588; AVX2-FAST-PERLANE-NEXT:    vandps %ymm1, %ymm0, %ymm0
2589; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2590; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2591; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2592; AVX2-FAST-PERLANE-NEXT:    retq
2593;
2594; AVX512-LABEL: trunc_and_v4i64_v4i32:
2595; AVX512:       # %bb.0:
2596; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2597; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2598; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2599; AVX512-NEXT:    vzeroupper
2600; AVX512-NEXT:    retq
2601  %1 = and <4 x i64> %a0, %a1
2602  %2 = trunc <4 x i64> %1 to <4 x i32>
2603  ret <4 x i32> %2
2604}
2605
2606define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2607; SSE-LABEL: trunc_and_v8i64_v8i16:
2608; SSE:       # %bb.0:
2609; SSE-NEXT:    pand %xmm6, %xmm2
2610; SSE-NEXT:    pand %xmm7, %xmm3
2611; SSE-NEXT:    pand %xmm4, %xmm0
2612; SSE-NEXT:    pand %xmm5, %xmm1
2613; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2614; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2615; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2616; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2617; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2618; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2619; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2620; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2621; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2622; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2623; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2624; SSE-NEXT:    retq
2625;
2626; AVX1-LABEL: trunc_and_v8i64_v8i16:
2627; AVX1:       # %bb.0:
2628; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
2629; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
2630; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2631; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2632; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2633; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
2634; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2635; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2636; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2637; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2638; AVX1-NEXT:    vzeroupper
2639; AVX1-NEXT:    retq
2640;
2641; AVX2-LABEL: trunc_and_v8i64_v8i16:
2642; AVX2:       # %bb.0:
2643; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2644; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2645; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2646; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2647; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2648; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2649; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2650; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2651; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2652; AVX2-NEXT:    vzeroupper
2653; AVX2-NEXT:    retq
2654;
2655; AVX512-LABEL: trunc_and_v8i64_v8i16:
2656; AVX512:       # %bb.0:
2657; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2658; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2659; AVX512-NEXT:    vzeroupper
2660; AVX512-NEXT:    retq
2661  %1 = and <8 x i64> %a0, %a1
2662  %2 = trunc <8 x i64> %1 to <8 x i16>
2663  ret <8 x i16> %2
2664}
2665
2666define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2667; SSE-LABEL: trunc_and_v8i32_v8i16:
2668; SSE:       # %bb.0:
2669; SSE-NEXT:    pand %xmm2, %xmm0
2670; SSE-NEXT:    pand %xmm3, %xmm1
2671; SSE-NEXT:    pslld $16, %xmm1
2672; SSE-NEXT:    psrad $16, %xmm1
2673; SSE-NEXT:    pslld $16, %xmm0
2674; SSE-NEXT:    psrad $16, %xmm0
2675; SSE-NEXT:    packssdw %xmm1, %xmm0
2676; SSE-NEXT:    retq
2677;
2678; AVX1-LABEL: trunc_and_v8i32_v8i16:
2679; AVX1:       # %bb.0:
2680; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2681; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2682; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2683; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2684; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2685; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2686; AVX1-NEXT:    vzeroupper
2687; AVX1-NEXT:    retq
2688;
2689; AVX2-LABEL: trunc_and_v8i32_v8i16:
2690; AVX2:       # %bb.0:
2691; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2692; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2693; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2694; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2695; AVX2-NEXT:    vzeroupper
2696; AVX2-NEXT:    retq
2697;
2698; AVX512-LABEL: trunc_and_v8i32_v8i16:
2699; AVX512:       # %bb.0:
2700; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2701; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2702; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2703; AVX512-NEXT:    vzeroupper
2704; AVX512-NEXT:    retq
2705  %1 = and <8 x i32> %a0, %a1
2706  %2 = trunc <8 x i32> %1 to <8 x i16>
2707  ret <8 x i16> %2
2708}
2709
2710define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2711; SSE-LABEL: trunc_and_v16i64_v16i8:
2712; SSE:       # %bb.0:
2713; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
2714; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
2715; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
2716; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
2717; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
2718; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
2719; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
2720; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
2721; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2722; SSE-NEXT:    pand %xmm8, %xmm7
2723; SSE-NEXT:    pand %xmm8, %xmm6
2724; SSE-NEXT:    packuswb %xmm7, %xmm6
2725; SSE-NEXT:    pand %xmm8, %xmm5
2726; SSE-NEXT:    pand %xmm8, %xmm4
2727; SSE-NEXT:    packuswb %xmm5, %xmm4
2728; SSE-NEXT:    packuswb %xmm6, %xmm4
2729; SSE-NEXT:    pand %xmm8, %xmm3
2730; SSE-NEXT:    pand %xmm8, %xmm2
2731; SSE-NEXT:    packuswb %xmm3, %xmm2
2732; SSE-NEXT:    pand %xmm8, %xmm1
2733; SSE-NEXT:    pand %xmm8, %xmm0
2734; SSE-NEXT:    packuswb %xmm1, %xmm0
2735; SSE-NEXT:    packuswb %xmm2, %xmm0
2736; SSE-NEXT:    packuswb %xmm4, %xmm0
2737; SSE-NEXT:    retq
2738;
2739; AVX1-LABEL: trunc_and_v16i64_v16i8:
2740; AVX1:       # %bb.0:
2741; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [255,255,255,255]
2742; AVX1-NEXT:    vandps %ymm7, %ymm8, %ymm7
2743; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
2744; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
2745; AVX1-NEXT:    vpackusdw %xmm7, %xmm3, %xmm3
2746; AVX1-NEXT:    vandps %ymm6, %ymm8, %ymm6
2747; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
2748; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
2749; AVX1-NEXT:    vpackusdw %xmm6, %xmm2, %xmm2
2750; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2751; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm3
2752; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2753; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2754; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2755; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm3
2756; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
2757; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2758; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
2759; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2760; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2761; AVX1-NEXT:    vzeroupper
2762; AVX1-NEXT:    retq
2763;
2764; AVX2-LABEL: trunc_and_v16i64_v16i8:
2765; AVX2:       # %bb.0:
2766; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255]
2767; AVX2-NEXT:    vpand %ymm7, %ymm8, %ymm7
2768; AVX2-NEXT:    vpand %ymm7, %ymm3, %ymm3
2769; AVX2-NEXT:    vpand %ymm6, %ymm8, %ymm6
2770; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
2771; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
2772; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2773; AVX2-NEXT:    vpand %ymm5, %ymm8, %ymm3
2774; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2775; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm3
2776; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2777; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2778; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2779; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
2780; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2781; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2782; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2783; AVX2-NEXT:    vzeroupper
2784; AVX2-NEXT:    retq
2785;
2786; AVX512-LABEL: trunc_and_v16i64_v16i8:
2787; AVX512:       # %bb.0:
2788; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
2789; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
2790; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
2791; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
2792; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2793; AVX512-NEXT:    vzeroupper
2794; AVX512-NEXT:    retq
2795  %1 = and <16 x i64> %a0, %a1
2796  %2 = trunc <16 x i64> %1 to <16 x i8>
2797  ret <16 x i8> %2
2798}
2799
2800define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2801; SSE-LABEL: trunc_and_v16i32_v16i8:
2802; SSE:       # %bb.0:
2803; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2804; SSE-NEXT:    pand %xmm8, %xmm7
2805; SSE-NEXT:    pand %xmm3, %xmm7
2806; SSE-NEXT:    pand %xmm8, %xmm6
2807; SSE-NEXT:    pand %xmm2, %xmm6
2808; SSE-NEXT:    packuswb %xmm7, %xmm6
2809; SSE-NEXT:    pand %xmm8, %xmm5
2810; SSE-NEXT:    pand %xmm1, %xmm5
2811; SSE-NEXT:    pand %xmm4, %xmm8
2812; SSE-NEXT:    pand %xmm8, %xmm0
2813; SSE-NEXT:    packuswb %xmm5, %xmm0
2814; SSE-NEXT:    packuswb %xmm6, %xmm0
2815; SSE-NEXT:    retq
2816;
2817; AVX1-LABEL: trunc_and_v16i32_v16i8:
2818; AVX1:       # %bb.0:
2819; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2820; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
2821; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2822; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2823; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2824; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
2825; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2826; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2827; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2828; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2829; AVX1-NEXT:    vzeroupper
2830; AVX1-NEXT:    retq
2831;
2832; AVX2-LABEL: trunc_and_v16i32_v16i8:
2833; AVX2:       # %bb.0:
2834; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2835; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
2836; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2837; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
2838; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2839; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2840; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2841; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2842; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2843; AVX2-NEXT:    vzeroupper
2844; AVX2-NEXT:    retq
2845;
2846; AVX512-LABEL: trunc_and_v16i32_v16i8:
2847; AVX512:       # %bb.0:
2848; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
2849; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2850; AVX512-NEXT:    vzeroupper
2851; AVX512-NEXT:    retq
2852  %1 = and <16 x i32> %a0, %a1
2853  %2 = trunc <16 x i32> %1 to <16 x i8>
2854  ret <16 x i8> %2
2855}
2856
2857define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2858; SSE-LABEL: trunc_and_v16i16_v16i8:
2859; SSE:       # %bb.0:
2860; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2861; SSE-NEXT:    pand %xmm4, %xmm3
2862; SSE-NEXT:    pand %xmm1, %xmm3
2863; SSE-NEXT:    pand %xmm2, %xmm4
2864; SSE-NEXT:    pand %xmm4, %xmm0
2865; SSE-NEXT:    packuswb %xmm3, %xmm0
2866; SSE-NEXT:    retq
2867;
2868; AVX1-LABEL: trunc_and_v16i16_v16i8:
2869; AVX1:       # %bb.0:
2870; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2871; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2872; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2873; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2874; AVX1-NEXT:    vzeroupper
2875; AVX1-NEXT:    retq
2876;
2877; AVX2-LABEL: trunc_and_v16i16_v16i8:
2878; AVX2:       # %bb.0:
2879; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2880; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2881; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2882; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2883; AVX2-NEXT:    vzeroupper
2884; AVX2-NEXT:    retq
2885;
2886; AVX512F-LABEL: trunc_and_v16i16_v16i8:
2887; AVX512F:       # %bb.0:
2888; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
2889; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2890; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2891; AVX512F-NEXT:    vzeroupper
2892; AVX512F-NEXT:    retq
2893;
2894; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
2895; AVX512BW:       # %bb.0:
2896; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
2897; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2898; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2899; AVX512BW-NEXT:    vzeroupper
2900; AVX512BW-NEXT:    retq
2901;
2902; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
2903; AVX512DQ:       # %bb.0:
2904; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
2905; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2906; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2907; AVX512DQ-NEXT:    vzeroupper
2908; AVX512DQ-NEXT:    retq
2909  %1 = and <16 x i16> %a0, %a1
2910  %2 = trunc <16 x i16> %1 to <16 x i8>
2911  ret <16 x i8> %2
2912}
2913
2914;
2915; and to constant
2916;
2917
2918define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2919; SSE-LABEL: trunc_and_const_v4i64_v4i32:
2920; SSE:       # %bb.0:
2921; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2922; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2923; SSE-NEXT:    retq
2924;
2925; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
2926; AVX1:       # %bb.0:
2927; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2928; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2929; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2930; AVX1-NEXT:    vzeroupper
2931; AVX1-NEXT:    retq
2932;
2933; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
2934; AVX2-SLOW:       # %bb.0:
2935; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2936; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2937; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2938; AVX2-SLOW-NEXT:    vzeroupper
2939; AVX2-SLOW-NEXT:    retq
2940;
2941; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
2942; AVX2-FAST-ALL:       # %bb.0:
2943; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <u,2,4,6,u,u,u,u>
2944; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2945; AVX2-FAST-ALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2946; AVX2-FAST-ALL-NEXT:    vzeroupper
2947; AVX2-FAST-ALL-NEXT:    retq
2948;
2949; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
2950; AVX2-FAST-PERLANE:       # %bb.0:
2951; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2952; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2953; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2954; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2955; AVX2-FAST-PERLANE-NEXT:    retq
2956;
2957; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
2958; AVX512:       # %bb.0:
2959; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2960; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2961; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2962; AVX512-NEXT:    vzeroupper
2963; AVX512-NEXT:    retq
2964  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2965  %2 = trunc <4 x i64> %1 to <4 x i32>
2966  ret <4 x i32> %2
2967}
2968
2969define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2970; SSE-LABEL: trunc_and_const_v8i64_v8i16:
2971; SSE:       # %bb.0:
2972; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2973; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2974; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2975; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2976; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2977; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2978; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2979; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2980; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2981; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2982; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2983; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2984; SSE-NEXT:    retq
2985;
2986; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
2987; AVX1:       # %bb.0:
2988; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2989; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
2990; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2991; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2992; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2993; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2994; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2995; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2996; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2997; AVX1-NEXT:    vzeroupper
2998; AVX1-NEXT:    retq
2999;
3000; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
3001; AVX2:       # %bb.0:
3002; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3003; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3004; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3005; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3006; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3007; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3008; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3009; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3010; AVX2-NEXT:    vzeroupper
3011; AVX2-NEXT:    retq
3012;
3013; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3014; AVX512:       # %bb.0:
3015; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3016; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3017; AVX512-NEXT:    vzeroupper
3018; AVX512-NEXT:    retq
3019  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3020  %2 = trunc <8 x i64> %1 to <8 x i16>
3021  ret <8 x i16> %2
3022}
3023
3024define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3025; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3026; SSE:       # %bb.0:
3027; SSE-NEXT:    pslld $16, %xmm1
3028; SSE-NEXT:    psrad $16, %xmm1
3029; SSE-NEXT:    pslld $16, %xmm0
3030; SSE-NEXT:    psrad $16, %xmm0
3031; SSE-NEXT:    packssdw %xmm1, %xmm0
3032; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3033; SSE-NEXT:    retq
3034;
3035; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3036; AVX1:       # %bb.0:
3037; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3038; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3039; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3040; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3041; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3042; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3043; AVX1-NEXT:    vzeroupper
3044; AVX1-NEXT:    retq
3045;
3046; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3047; AVX2:       # %bb.0:
3048; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3049; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3050; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3051; AVX2-NEXT:    vzeroupper
3052; AVX2-NEXT:    retq
3053;
3054; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3055; AVX512:       # %bb.0:
3056; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3057; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3058; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3059; AVX512-NEXT:    vzeroupper
3060; AVX512-NEXT:    retq
3061  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3062  %2 = trunc <8 x i32> %1 to <8 x i16>
3063  ret <8 x i16> %2
3064}
3065
3066define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3067; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3068; SSE:       # %bb.0:
3069; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3070; SSE-NEXT:    pand %xmm8, %xmm7
3071; SSE-NEXT:    pand %xmm8, %xmm6
3072; SSE-NEXT:    packuswb %xmm7, %xmm6
3073; SSE-NEXT:    pand %xmm8, %xmm5
3074; SSE-NEXT:    pand %xmm8, %xmm4
3075; SSE-NEXT:    packuswb %xmm5, %xmm4
3076; SSE-NEXT:    packuswb %xmm6, %xmm4
3077; SSE-NEXT:    pand %xmm8, %xmm3
3078; SSE-NEXT:    pand %xmm8, %xmm2
3079; SSE-NEXT:    packuswb %xmm3, %xmm2
3080; SSE-NEXT:    pand %xmm8, %xmm1
3081; SSE-NEXT:    pand %xmm8, %xmm0
3082; SSE-NEXT:    packuswb %xmm1, %xmm0
3083; SSE-NEXT:    packuswb %xmm2, %xmm0
3084; SSE-NEXT:    packuswb %xmm4, %xmm0
3085; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3086; SSE-NEXT:    retq
3087;
3088; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3089; AVX1:       # %bb.0:
3090; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3091; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3092; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3093; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3094; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3095; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3096; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3097; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3098; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3099; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3100; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3101; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3102; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3103; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3104; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3105; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3106; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3107; AVX1-NEXT:    vzeroupper
3108; AVX1-NEXT:    retq
3109;
3110; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3111; AVX2:       # %bb.0:
3112; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3113; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3114; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3115; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3116; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3117; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3118; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3119; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3120; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3121; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3122; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3123; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3124; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3125; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3126; AVX2-NEXT:    vzeroupper
3127; AVX2-NEXT:    retq
3128;
3129; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3130; AVX512:       # %bb.0:
3131; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3132; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3133; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3134; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3135; AVX512-NEXT:    vzeroupper
3136; AVX512-NEXT:    retq
3137  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3138  %2 = trunc <16 x i64> %1 to <16 x i8>
3139  ret <16 x i8> %2
3140}
3141
3142define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3143; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3144; SSE:       # %bb.0:
3145; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3146; SSE-NEXT:    pand %xmm4, %xmm3
3147; SSE-NEXT:    pand %xmm4, %xmm2
3148; SSE-NEXT:    packuswb %xmm3, %xmm2
3149; SSE-NEXT:    pand %xmm4, %xmm1
3150; SSE-NEXT:    pand %xmm4, %xmm0
3151; SSE-NEXT:    packuswb %xmm1, %xmm0
3152; SSE-NEXT:    packuswb %xmm2, %xmm0
3153; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3154; SSE-NEXT:    retq
3155;
3156; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3157; AVX1:       # %bb.0:
3158; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3159; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3160; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3161; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3162; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3163; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3164; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3165; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3166; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3167; AVX1-NEXT:    vzeroupper
3168; AVX1-NEXT:    retq
3169;
3170; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3171; AVX2:       # %bb.0:
3172; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3173; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3174; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3175; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3176; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3177; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3178; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3179; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3180; AVX2-NEXT:    vzeroupper
3181; AVX2-NEXT:    retq
3182;
3183; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3184; AVX512:       # %bb.0:
3185; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3186; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3187; AVX512-NEXT:    vzeroupper
3188; AVX512-NEXT:    retq
3189  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3190  %2 = trunc <16 x i32> %1 to <16 x i8>
3191  ret <16 x i8> %2
3192}
3193
3194define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3195; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3196; SSE:       # %bb.0:
3197; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3198; SSE-NEXT:    pand %xmm2, %xmm1
3199; SSE-NEXT:    pand %xmm2, %xmm0
3200; SSE-NEXT:    packuswb %xmm1, %xmm0
3201; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3202; SSE-NEXT:    retq
3203;
3204; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3205; AVX1:       # %bb.0:
3206; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3207; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3208; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3209; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3210; AVX1-NEXT:    vzeroupper
3211; AVX1-NEXT:    retq
3212;
3213; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3214; AVX2:       # %bb.0:
3215; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3216; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3217; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3218; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3219; AVX2-NEXT:    vzeroupper
3220; AVX2-NEXT:    retq
3221;
3222; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3223; AVX512F:       # %bb.0:
3224; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3225; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3226; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3227; AVX512F-NEXT:    vzeroupper
3228; AVX512F-NEXT:    retq
3229;
3230; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3231; AVX512BW:       # %bb.0:
3232; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3233; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3234; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3235; AVX512BW-NEXT:    vzeroupper
3236; AVX512BW-NEXT:    retq
3237;
3238; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3239; AVX512DQ:       # %bb.0:
3240; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3241; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3242; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3243; AVX512DQ-NEXT:    vzeroupper
3244; AVX512DQ-NEXT:    retq
3245  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3246  %2 = trunc <16 x i16> %1 to <16 x i8>
3247  ret <16 x i8> %2
3248}
3249
3250;
3251; xor
3252;
3253
3254define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3255; SSE-LABEL: trunc_xor_v4i64_v4i32:
3256; SSE:       # %bb.0:
3257; SSE-NEXT:    xorps %xmm3, %xmm1
3258; SSE-NEXT:    xorps %xmm2, %xmm0
3259; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3260; SSE-NEXT:    retq
3261;
3262; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3263; AVX1:       # %bb.0:
3264; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3265; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3266; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3267; AVX1-NEXT:    vzeroupper
3268; AVX1-NEXT:    retq
3269;
3270; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3271; AVX2-SLOW:       # %bb.0:
3272; AVX2-SLOW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3273; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3274; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3275; AVX2-SLOW-NEXT:    vzeroupper
3276; AVX2-SLOW-NEXT:    retq
3277;
3278; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
3279; AVX2-FAST-ALL:       # %bb.0:
3280; AVX2-FAST-ALL-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3281; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3282; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3283; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3284; AVX2-FAST-ALL-NEXT:    vzeroupper
3285; AVX2-FAST-ALL-NEXT:    retq
3286;
3287; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
3288; AVX2-FAST-PERLANE:       # %bb.0:
3289; AVX2-FAST-PERLANE-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3290; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3291; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3292; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3293; AVX2-FAST-PERLANE-NEXT:    retq
3294;
3295; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3296; AVX512:       # %bb.0:
3297; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3298; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3299; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3300; AVX512-NEXT:    vzeroupper
3301; AVX512-NEXT:    retq
3302  %1 = xor <4 x i64> %a0, %a1
3303  %2 = trunc <4 x i64> %1 to <4 x i32>
3304  ret <4 x i32> %2
3305}
3306
3307define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3308; SSE-LABEL: trunc_xor_v8i64_v8i16:
3309; SSE:       # %bb.0:
3310; SSE-NEXT:    pxor %xmm6, %xmm2
3311; SSE-NEXT:    pxor %xmm7, %xmm3
3312; SSE-NEXT:    pxor %xmm4, %xmm0
3313; SSE-NEXT:    pxor %xmm5, %xmm1
3314; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3315; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3316; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3317; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3318; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3319; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3320; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3321; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3322; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3323; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3324; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3325; SSE-NEXT:    retq
3326;
3327; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3328; AVX1:       # %bb.0:
3329; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3330; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3331; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3332; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3333; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3334; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3335; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3336; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3337; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3338; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3339; AVX1-NEXT:    vzeroupper
3340; AVX1-NEXT:    retq
3341;
3342; AVX2-LABEL: trunc_xor_v8i64_v8i16:
3343; AVX2:       # %bb.0:
3344; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3345; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3346; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3347; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3348; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3349; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3350; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3351; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3352; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3353; AVX2-NEXT:    vzeroupper
3354; AVX2-NEXT:    retq
3355;
3356; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3357; AVX512:       # %bb.0:
3358; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
3359; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3360; AVX512-NEXT:    vzeroupper
3361; AVX512-NEXT:    retq
3362  %1 = xor <8 x i64> %a0, %a1
3363  %2 = trunc <8 x i64> %1 to <8 x i16>
3364  ret <8 x i16> %2
3365}
3366
3367define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3368; SSE-LABEL: trunc_xor_v8i32_v8i16:
3369; SSE:       # %bb.0:
3370; SSE-NEXT:    pxor %xmm2, %xmm0
3371; SSE-NEXT:    pxor %xmm3, %xmm1
3372; SSE-NEXT:    pslld $16, %xmm1
3373; SSE-NEXT:    psrad $16, %xmm1
3374; SSE-NEXT:    pslld $16, %xmm0
3375; SSE-NEXT:    psrad $16, %xmm0
3376; SSE-NEXT:    packssdw %xmm1, %xmm0
3377; SSE-NEXT:    retq
3378;
3379; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3380; AVX1:       # %bb.0:
3381; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3382; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3383; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3384; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3385; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3386; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3387; AVX1-NEXT:    vzeroupper
3388; AVX1-NEXT:    retq
3389;
3390; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3391; AVX2:       # %bb.0:
3392; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3393; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3394; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3395; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3396; AVX2-NEXT:    vzeroupper
3397; AVX2-NEXT:    retq
3398;
3399; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3400; AVX512:       # %bb.0:
3401; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3402; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3403; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3404; AVX512-NEXT:    vzeroupper
3405; AVX512-NEXT:    retq
3406  %1 = xor <8 x i32> %a0, %a1
3407  %2 = trunc <8 x i32> %1 to <8 x i16>
3408  ret <8 x i16> %2
3409}
3410
3411define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3412; SSE-LABEL: trunc_xor_v16i64_v16i8:
3413; SSE:       # %bb.0:
3414; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
3415; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
3416; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
3417; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
3418; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
3419; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
3420; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
3421; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
3422; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3423; SSE-NEXT:    pand %xmm8, %xmm7
3424; SSE-NEXT:    pand %xmm8, %xmm6
3425; SSE-NEXT:    packuswb %xmm7, %xmm6
3426; SSE-NEXT:    pand %xmm8, %xmm5
3427; SSE-NEXT:    pand %xmm8, %xmm4
3428; SSE-NEXT:    packuswb %xmm5, %xmm4
3429; SSE-NEXT:    packuswb %xmm6, %xmm4
3430; SSE-NEXT:    pand %xmm8, %xmm3
3431; SSE-NEXT:    pand %xmm8, %xmm2
3432; SSE-NEXT:    packuswb %xmm3, %xmm2
3433; SSE-NEXT:    pand %xmm8, %xmm1
3434; SSE-NEXT:    pand %xmm8, %xmm0
3435; SSE-NEXT:    packuswb %xmm1, %xmm0
3436; SSE-NEXT:    packuswb %xmm2, %xmm0
3437; SSE-NEXT:    packuswb %xmm4, %xmm0
3438; SSE-NEXT:    retq
3439;
3440; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3441; AVX1:       # %bb.0:
3442; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
3443; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
3444; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
3445; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
3446; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3447; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3448; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3449; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3450; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3451; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3452; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3453; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3454; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3455; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3456; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3457; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3458; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3459; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3460; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3461; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3462; AVX1-NEXT:    vzeroupper
3463; AVX1-NEXT:    retq
3464;
3465; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3466; AVX2:       # %bb.0:
3467; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
3468; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm1
3469; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
3470; AVX2-NEXT:    vpxor %ymm7, %ymm3, %ymm3
3471; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3472; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3473; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3474; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3475; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3476; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3477; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3478; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3479; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3480; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3481; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3482; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3483; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3484; AVX2-NEXT:    vzeroupper
3485; AVX2-NEXT:    retq
3486;
3487; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3488; AVX512:       # %bb.0:
3489; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3490; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3491; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3492; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3493; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3494; AVX512-NEXT:    vzeroupper
3495; AVX512-NEXT:    retq
3496  %1 = xor <16 x i64> %a0, %a1
3497  %2 = trunc <16 x i64> %1 to <16 x i8>
3498  ret <16 x i8> %2
3499}
3500
3501define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3502; SSE-LABEL: trunc_xor_v16i32_v16i8:
3503; SSE:       # %bb.0:
3504; SSE-NEXT:    pxor %xmm4, %xmm0
3505; SSE-NEXT:    pxor %xmm5, %xmm1
3506; SSE-NEXT:    pxor %xmm6, %xmm2
3507; SSE-NEXT:    pxor %xmm7, %xmm3
3508; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3509; SSE-NEXT:    pand %xmm4, %xmm3
3510; SSE-NEXT:    pand %xmm4, %xmm2
3511; SSE-NEXT:    packuswb %xmm3, %xmm2
3512; SSE-NEXT:    pand %xmm4, %xmm1
3513; SSE-NEXT:    pand %xmm4, %xmm0
3514; SSE-NEXT:    packuswb %xmm1, %xmm0
3515; SSE-NEXT:    packuswb %xmm2, %xmm0
3516; SSE-NEXT:    retq
3517;
3518; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3519; AVX1:       # %bb.0:
3520; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3521; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3522; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3523; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3524; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3525; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3526; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3527; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3528; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3529; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3530; AVX1-NEXT:    vzeroupper
3531; AVX1-NEXT:    retq
3532;
3533; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3534; AVX2:       # %bb.0:
3535; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3536; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3537; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3538; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3539; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3540; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3541; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3542; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3543; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3544; AVX2-NEXT:    vzeroupper
3545; AVX2-NEXT:    retq
3546;
3547; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3548; AVX512:       # %bb.0:
3549; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
3550; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3551; AVX512-NEXT:    vzeroupper
3552; AVX512-NEXT:    retq
3553  %1 = xor <16 x i32> %a0, %a1
3554  %2 = trunc <16 x i32> %1 to <16 x i8>
3555  ret <16 x i8> %2
3556}
3557
3558define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3559; SSE-LABEL: trunc_xor_v16i16_v16i8:
3560; SSE:       # %bb.0:
3561; SSE-NEXT:    pxor %xmm2, %xmm0
3562; SSE-NEXT:    pxor %xmm3, %xmm1
3563; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3564; SSE-NEXT:    pand %xmm2, %xmm1
3565; SSE-NEXT:    pand %xmm2, %xmm0
3566; SSE-NEXT:    packuswb %xmm1, %xmm0
3567; SSE-NEXT:    retq
3568;
3569; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3570; AVX1:       # %bb.0:
3571; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3572; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3573; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3574; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3575; AVX1-NEXT:    vzeroupper
3576; AVX1-NEXT:    retq
3577;
3578; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3579; AVX2:       # %bb.0:
3580; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3581; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3582; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3583; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3584; AVX2-NEXT:    vzeroupper
3585; AVX2-NEXT:    retq
3586;
3587; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3588; AVX512F:       # %bb.0:
3589; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3590; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3591; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3592; AVX512F-NEXT:    vzeroupper
3593; AVX512F-NEXT:    retq
3594;
3595; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3596; AVX512BW:       # %bb.0:
3597; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3598; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3599; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3600; AVX512BW-NEXT:    vzeroupper
3601; AVX512BW-NEXT:    retq
3602;
3603; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3604; AVX512DQ:       # %bb.0:
3605; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3606; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3607; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3608; AVX512DQ-NEXT:    vzeroupper
3609; AVX512DQ-NEXT:    retq
3610  %1 = xor <16 x i16> %a0, %a1
3611  %2 = trunc <16 x i16> %1 to <16 x i8>
3612  ret <16 x i8> %2
3613}
3614
3615;
3616; xor to constant
3617;
3618
3619define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3620; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3621; SSE:       # %bb.0:
3622; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3623; SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3624; SSE-NEXT:    retq
3625;
3626; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3627; AVX1:       # %bb.0:
3628; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3629; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3630; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3631; AVX1-NEXT:    vzeroupper
3632; AVX1-NEXT:    retq
3633;
3634; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3635; AVX2-SLOW:       # %bb.0:
3636; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3637; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3638; AVX2-SLOW-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3639; AVX2-SLOW-NEXT:    vzeroupper
3640; AVX2-SLOW-NEXT:    retq
3641;
3642; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
3643; AVX2-FAST-ALL:       # %bb.0:
3644; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3645; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3646; AVX2-FAST-ALL-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3647; AVX2-FAST-ALL-NEXT:    vzeroupper
3648; AVX2-FAST-ALL-NEXT:    retq
3649;
3650; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
3651; AVX2-FAST-PERLANE:       # %bb.0:
3652; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3653; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3654; AVX2-FAST-PERLANE-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3655; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3656; AVX2-FAST-PERLANE-NEXT:    retq
3657;
3658; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3659; AVX512:       # %bb.0:
3660; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3661; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3662; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3663; AVX512-NEXT:    vzeroupper
3664; AVX512-NEXT:    retq
3665  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3666  %2 = trunc <4 x i64> %1 to <4 x i32>
3667  ret <4 x i32> %2
3668}
3669
3670define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3671; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3672; SSE:       # %bb.0:
3673; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3674; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3675; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3676; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3677; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3678; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3679; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3680; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3681; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3682; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3683; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3684; SSE-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3685; SSE-NEXT:    retq
3686;
3687; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3688; AVX1:       # %bb.0:
3689; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3690; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3691; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3692; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3693; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3694; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3695; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3696; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3697; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3698; AVX1-NEXT:    vzeroupper
3699; AVX1-NEXT:    retq
3700;
3701; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
3702; AVX2:       # %bb.0:
3703; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3704; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3705; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3706; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3707; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3708; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3709; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3710; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3711; AVX2-NEXT:    vzeroupper
3712; AVX2-NEXT:    retq
3713;
3714; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
3715; AVX512:       # %bb.0:
3716; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3717; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3718; AVX512-NEXT:    vzeroupper
3719; AVX512-NEXT:    retq
3720  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3721  %2 = trunc <8 x i64> %1 to <8 x i16>
3722  ret <8 x i16> %2
3723}
3724
3725define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3726; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
3727; SSE:       # %bb.0:
3728; SSE-NEXT:    pslld $16, %xmm1
3729; SSE-NEXT:    psrad $16, %xmm1
3730; SSE-NEXT:    pslld $16, %xmm0
3731; SSE-NEXT:    psrad $16, %xmm0
3732; SSE-NEXT:    packssdw %xmm1, %xmm0
3733; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3734; SSE-NEXT:    retq
3735;
3736; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
3737; AVX1:       # %bb.0:
3738; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3739; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3740; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3741; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3742; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3743; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3744; AVX1-NEXT:    vzeroupper
3745; AVX1-NEXT:    retq
3746;
3747; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
3748; AVX2:       # %bb.0:
3749; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3750; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3751; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3752; AVX2-NEXT:    vzeroupper
3753; AVX2-NEXT:    retq
3754;
3755; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
3756; AVX512:       # %bb.0:
3757; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3758; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3759; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3760; AVX512-NEXT:    vzeroupper
3761; AVX512-NEXT:    retq
3762  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3763  %2 = trunc <8 x i32> %1 to <8 x i16>
3764  ret <8 x i16> %2
3765}
3766
3767define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3768; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
3769; SSE:       # %bb.0:
3770; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3771; SSE-NEXT:    pand %xmm8, %xmm7
3772; SSE-NEXT:    pand %xmm8, %xmm6
3773; SSE-NEXT:    packuswb %xmm7, %xmm6
3774; SSE-NEXT:    pand %xmm8, %xmm5
3775; SSE-NEXT:    pand %xmm8, %xmm4
3776; SSE-NEXT:    packuswb %xmm5, %xmm4
3777; SSE-NEXT:    packuswb %xmm6, %xmm4
3778; SSE-NEXT:    pand %xmm8, %xmm3
3779; SSE-NEXT:    pand %xmm8, %xmm2
3780; SSE-NEXT:    packuswb %xmm3, %xmm2
3781; SSE-NEXT:    pand %xmm8, %xmm1
3782; SSE-NEXT:    pand %xmm8, %xmm0
3783; SSE-NEXT:    packuswb %xmm1, %xmm0
3784; SSE-NEXT:    packuswb %xmm2, %xmm0
3785; SSE-NEXT:    packuswb %xmm4, %xmm0
3786; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3787; SSE-NEXT:    retq
3788;
3789; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
3790; AVX1:       # %bb.0:
3791; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3792; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3793; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3794; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3795; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3796; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3797; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3798; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3799; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3800; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3801; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3802; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3803; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3804; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3805; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3806; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3807; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3808; AVX1-NEXT:    vzeroupper
3809; AVX1-NEXT:    retq
3810;
3811; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
3812; AVX2:       # %bb.0:
3813; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3814; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3815; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3816; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3817; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3818; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3819; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3820; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3821; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3822; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3823; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3824; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3825; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3826; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3827; AVX2-NEXT:    vzeroupper
3828; AVX2-NEXT:    retq
3829;
3830; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
3831; AVX512:       # %bb.0:
3832; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3833; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3834; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3835; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3836; AVX512-NEXT:    vzeroupper
3837; AVX512-NEXT:    retq
3838  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3839  %2 = trunc <16 x i64> %1 to <16 x i8>
3840  ret <16 x i8> %2
3841}
3842
3843define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3844; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
3845; SSE:       # %bb.0:
3846; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3847; SSE-NEXT:    pand %xmm4, %xmm3
3848; SSE-NEXT:    pand %xmm4, %xmm2
3849; SSE-NEXT:    packuswb %xmm3, %xmm2
3850; SSE-NEXT:    pand %xmm4, %xmm1
3851; SSE-NEXT:    pand %xmm4, %xmm0
3852; SSE-NEXT:    packuswb %xmm1, %xmm0
3853; SSE-NEXT:    packuswb %xmm2, %xmm0
3854; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3855; SSE-NEXT:    retq
3856;
3857; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
3858; AVX1:       # %bb.0:
3859; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3860; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3861; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3862; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3863; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3864; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3865; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3866; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3867; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3868; AVX1-NEXT:    vzeroupper
3869; AVX1-NEXT:    retq
3870;
3871; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
3872; AVX2:       # %bb.0:
3873; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3874; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3875; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3876; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3877; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3878; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3879; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3880; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3881; AVX2-NEXT:    vzeroupper
3882; AVX2-NEXT:    retq
3883;
3884; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
3885; AVX512:       # %bb.0:
3886; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3887; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3888; AVX512-NEXT:    vzeroupper
3889; AVX512-NEXT:    retq
3890  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3891  %2 = trunc <16 x i32> %1 to <16 x i8>
3892  ret <16 x i8> %2
3893}
3894
3895define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3896; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
3897; SSE:       # %bb.0:
3898; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3899; SSE-NEXT:    pand %xmm2, %xmm1
3900; SSE-NEXT:    pand %xmm2, %xmm0
3901; SSE-NEXT:    packuswb %xmm1, %xmm0
3902; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3903; SSE-NEXT:    retq
3904;
3905; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
3906; AVX1:       # %bb.0:
3907; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3908; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3909; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3910; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3911; AVX1-NEXT:    vzeroupper
3912; AVX1-NEXT:    retq
3913;
3914; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
3915; AVX2:       # %bb.0:
3916; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3917; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3918; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3919; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3920; AVX2-NEXT:    vzeroupper
3921; AVX2-NEXT:    retq
3922;
3923; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
3924; AVX512F:       # %bb.0:
3925; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3926; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3927; AVX512F-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3928; AVX512F-NEXT:    vzeroupper
3929; AVX512F-NEXT:    retq
3930;
3931; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
3932; AVX512BW:       # %bb.0:
3933; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3934; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3935; AVX512BW-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3936; AVX512BW-NEXT:    vzeroupper
3937; AVX512BW-NEXT:    retq
3938;
3939; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
3940; AVX512DQ:       # %bb.0:
3941; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3942; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3943; AVX512DQ-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3944; AVX512DQ-NEXT:    vzeroupper
3945; AVX512DQ-NEXT:    retq
3946  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3947  %2 = trunc <16 x i16> %1 to <16 x i8>
3948  ret <16 x i8> %2
3949}
3950
3951;
3952; or
3953;
3954
3955define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3956; SSE-LABEL: trunc_or_v4i64_v4i32:
3957; SSE:       # %bb.0:
3958; SSE-NEXT:    orps %xmm3, %xmm1
3959; SSE-NEXT:    orps %xmm2, %xmm0
3960; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3961; SSE-NEXT:    retq
3962;
3963; AVX1-LABEL: trunc_or_v4i64_v4i32:
3964; AVX1:       # %bb.0:
3965; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
3966; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3967; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3968; AVX1-NEXT:    vzeroupper
3969; AVX1-NEXT:    retq
3970;
3971; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
3972; AVX2-SLOW:       # %bb.0:
3973; AVX2-SLOW-NEXT:    vorps %ymm1, %ymm0, %ymm0
3974; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3975; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3976; AVX2-SLOW-NEXT:    vzeroupper
3977; AVX2-SLOW-NEXT:    retq
3978;
3979; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
3980; AVX2-FAST-ALL:       # %bb.0:
3981; AVX2-FAST-ALL-NEXT:    vorps %ymm1, %ymm0, %ymm0
3982; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3983; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3984; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3985; AVX2-FAST-ALL-NEXT:    vzeroupper
3986; AVX2-FAST-ALL-NEXT:    retq
3987;
3988; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
3989; AVX2-FAST-PERLANE:       # %bb.0:
3990; AVX2-FAST-PERLANE-NEXT:    vorps %ymm1, %ymm0, %ymm0
3991; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3992; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3993; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3994; AVX2-FAST-PERLANE-NEXT:    retq
3995;
3996; AVX512-LABEL: trunc_or_v4i64_v4i32:
3997; AVX512:       # %bb.0:
3998; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
3999; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4000; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4001; AVX512-NEXT:    vzeroupper
4002; AVX512-NEXT:    retq
4003  %1 = or <4 x i64> %a0, %a1
4004  %2 = trunc <4 x i64> %1 to <4 x i32>
4005  ret <4 x i32> %2
4006}
4007
4008define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
4009; SSE-LABEL: trunc_or_v8i64_v8i16:
4010; SSE:       # %bb.0:
4011; SSE-NEXT:    por %xmm6, %xmm2
4012; SSE-NEXT:    por %xmm7, %xmm3
4013; SSE-NEXT:    por %xmm4, %xmm0
4014; SSE-NEXT:    por %xmm5, %xmm1
4015; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4016; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4017; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4018; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4019; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4020; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4021; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4022; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4023; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4024; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4025; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4026; SSE-NEXT:    retq
4027;
4028; AVX1-LABEL: trunc_or_v8i64_v8i16:
4029; AVX1:       # %bb.0:
4030; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4031; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4032; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4033; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4034; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4035; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4036; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4037; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4038; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4039; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4040; AVX1-NEXT:    vzeroupper
4041; AVX1-NEXT:    retq
4042;
4043; AVX2-LABEL: trunc_or_v8i64_v8i16:
4044; AVX2:       # %bb.0:
4045; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4046; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4047; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4048; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4049; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4050; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4051; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4052; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4053; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4054; AVX2-NEXT:    vzeroupper
4055; AVX2-NEXT:    retq
4056;
4057; AVX512-LABEL: trunc_or_v8i64_v8i16:
4058; AVX512:       # %bb.0:
4059; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
4060; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4061; AVX512-NEXT:    vzeroupper
4062; AVX512-NEXT:    retq
4063  %1 = or <8 x i64> %a0, %a1
4064  %2 = trunc <8 x i64> %1 to <8 x i16>
4065  ret <8 x i16> %2
4066}
4067
4068define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4069; SSE-LABEL: trunc_or_v8i32_v8i16:
4070; SSE:       # %bb.0:
4071; SSE-NEXT:    por %xmm2, %xmm0
4072; SSE-NEXT:    por %xmm3, %xmm1
4073; SSE-NEXT:    pslld $16, %xmm1
4074; SSE-NEXT:    psrad $16, %xmm1
4075; SSE-NEXT:    pslld $16, %xmm0
4076; SSE-NEXT:    psrad $16, %xmm0
4077; SSE-NEXT:    packssdw %xmm1, %xmm0
4078; SSE-NEXT:    retq
4079;
4080; AVX1-LABEL: trunc_or_v8i32_v8i16:
4081; AVX1:       # %bb.0:
4082; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4083; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4084; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
4085; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4086; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4087; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4088; AVX1-NEXT:    vzeroupper
4089; AVX1-NEXT:    retq
4090;
4091; AVX2-LABEL: trunc_or_v8i32_v8i16:
4092; AVX2:       # %bb.0:
4093; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4094; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4095; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4096; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4097; AVX2-NEXT:    vzeroupper
4098; AVX2-NEXT:    retq
4099;
4100; AVX512-LABEL: trunc_or_v8i32_v8i16:
4101; AVX512:       # %bb.0:
4102; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
4103; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4104; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4105; AVX512-NEXT:    vzeroupper
4106; AVX512-NEXT:    retq
4107  %1 = or <8 x i32> %a0, %a1
4108  %2 = trunc <8 x i32> %1 to <8 x i16>
4109  ret <8 x i16> %2
4110}
4111
4112define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4113; SSE-LABEL: trunc_or_v16i64_v16i8:
4114; SSE:       # %bb.0:
4115; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
4116; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
4117; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
4118; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
4119; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
4120; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
4121; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
4122; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
4123; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4124; SSE-NEXT:    pand %xmm8, %xmm7
4125; SSE-NEXT:    pand %xmm8, %xmm6
4126; SSE-NEXT:    packuswb %xmm7, %xmm6
4127; SSE-NEXT:    pand %xmm8, %xmm5
4128; SSE-NEXT:    pand %xmm8, %xmm4
4129; SSE-NEXT:    packuswb %xmm5, %xmm4
4130; SSE-NEXT:    packuswb %xmm6, %xmm4
4131; SSE-NEXT:    pand %xmm8, %xmm3
4132; SSE-NEXT:    pand %xmm8, %xmm2
4133; SSE-NEXT:    packuswb %xmm3, %xmm2
4134; SSE-NEXT:    pand %xmm8, %xmm1
4135; SSE-NEXT:    pand %xmm8, %xmm0
4136; SSE-NEXT:    packuswb %xmm1, %xmm0
4137; SSE-NEXT:    packuswb %xmm2, %xmm0
4138; SSE-NEXT:    packuswb %xmm4, %xmm0
4139; SSE-NEXT:    retq
4140;
4141; AVX1-LABEL: trunc_or_v16i64_v16i8:
4142; AVX1:       # %bb.0:
4143; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
4144; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
4145; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
4146; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
4147; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4148; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
4149; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
4150; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
4151; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
4152; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
4153; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
4154; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
4155; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
4156; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4157; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4158; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
4159; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4160; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
4161; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4162; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4163; AVX1-NEXT:    vzeroupper
4164; AVX1-NEXT:    retq
4165;
4166; AVX2-LABEL: trunc_or_v16i64_v16i8:
4167; AVX2:       # %bb.0:
4168; AVX2-NEXT:    vpor %ymm4, %ymm0, %ymm0
4169; AVX2-NEXT:    vpor %ymm5, %ymm1, %ymm1
4170; AVX2-NEXT:    vpor %ymm6, %ymm2, %ymm2
4171; AVX2-NEXT:    vpor %ymm7, %ymm3, %ymm3
4172; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
4173; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
4174; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
4175; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
4176; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4177; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
4178; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
4179; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4180; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4181; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
4182; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4183; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4184; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4185; AVX2-NEXT:    vzeroupper
4186; AVX2-NEXT:    retq
4187;
4188; AVX512-LABEL: trunc_or_v16i64_v16i8:
4189; AVX512:       # %bb.0:
4190; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
4191; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
4192; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
4193; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
4194; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4195; AVX512-NEXT:    vzeroupper
4196; AVX512-NEXT:    retq
4197  %1 = or <16 x i64> %a0, %a1
4198  %2 = trunc <16 x i64> %1 to <16 x i8>
4199  ret <16 x i8> %2
4200}
4201
4202define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4203; SSE-LABEL: trunc_or_v16i32_v16i8:
4204; SSE:       # %bb.0:
4205; SSE-NEXT:    por %xmm4, %xmm0
4206; SSE-NEXT:    por %xmm5, %xmm1
4207; SSE-NEXT:    por %xmm6, %xmm2
4208; SSE-NEXT:    por %xmm7, %xmm3
4209; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4210; SSE-NEXT:    pand %xmm4, %xmm3
4211; SSE-NEXT:    pand %xmm4, %xmm2
4212; SSE-NEXT:    packuswb %xmm3, %xmm2
4213; SSE-NEXT:    pand %xmm4, %xmm1
4214; SSE-NEXT:    pand %xmm4, %xmm0
4215; SSE-NEXT:    packuswb %xmm1, %xmm0
4216; SSE-NEXT:    packuswb %xmm2, %xmm0
4217; SSE-NEXT:    retq
4218;
4219; AVX1-LABEL: trunc_or_v16i32_v16i8:
4220; AVX1:       # %bb.0:
4221; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4222; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4223; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4224; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4225; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4226; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4227; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4228; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4229; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4230; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4231; AVX1-NEXT:    vzeroupper
4232; AVX1-NEXT:    retq
4233;
4234; AVX2-LABEL: trunc_or_v16i32_v16i8:
4235; AVX2:       # %bb.0:
4236; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4237; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4238; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4239; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
4240; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
4241; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4242; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4243; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4244; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4245; AVX2-NEXT:    vzeroupper
4246; AVX2-NEXT:    retq
4247;
4248; AVX512-LABEL: trunc_or_v16i32_v16i8:
4249; AVX512:       # %bb.0:
4250; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
4251; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4252; AVX512-NEXT:    vzeroupper
4253; AVX512-NEXT:    retq
4254  %1 = or <16 x i32> %a0, %a1
4255  %2 = trunc <16 x i32> %1 to <16 x i8>
4256  ret <16 x i8> %2
4257}
4258
4259define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4260; SSE-LABEL: trunc_or_v16i16_v16i8:
4261; SSE:       # %bb.0:
4262; SSE-NEXT:    por %xmm2, %xmm0
4263; SSE-NEXT:    por %xmm3, %xmm1
4264; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4265; SSE-NEXT:    pand %xmm2, %xmm1
4266; SSE-NEXT:    pand %xmm2, %xmm0
4267; SSE-NEXT:    packuswb %xmm1, %xmm0
4268; SSE-NEXT:    retq
4269;
4270; AVX1-LABEL: trunc_or_v16i16_v16i8:
4271; AVX1:       # %bb.0:
4272; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4273; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4274; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4275; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4276; AVX1-NEXT:    vzeroupper
4277; AVX1-NEXT:    retq
4278;
4279; AVX2-LABEL: trunc_or_v16i16_v16i8:
4280; AVX2:       # %bb.0:
4281; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4282; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4283; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4284; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4285; AVX2-NEXT:    vzeroupper
4286; AVX2-NEXT:    retq
4287;
4288; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4289; AVX512F:       # %bb.0:
4290; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
4291; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4292; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4293; AVX512F-NEXT:    vzeroupper
4294; AVX512F-NEXT:    retq
4295;
4296; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4297; AVX512BW:       # %bb.0:
4298; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
4299; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4300; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4301; AVX512BW-NEXT:    vzeroupper
4302; AVX512BW-NEXT:    retq
4303;
4304; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4305; AVX512DQ:       # %bb.0:
4306; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
4307; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4308; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4309; AVX512DQ-NEXT:    vzeroupper
4310; AVX512DQ-NEXT:    retq
4311  %1 = or <16 x i16> %a0, %a1
4312  %2 = trunc <16 x i16> %1 to <16 x i8>
4313  ret <16 x i8> %2
4314}
4315
4316;
4317; or to constant
4318;
4319
4320define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4321; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4322; SSE:       # %bb.0:
4323; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4324; SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4325; SSE-NEXT:    retq
4326;
4327; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4328; AVX1:       # %bb.0:
4329; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4330; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4331; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4332; AVX1-NEXT:    vzeroupper
4333; AVX1-NEXT:    retq
4334;
4335; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4336; AVX2-SLOW:       # %bb.0:
4337; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
4338; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4339; AVX2-SLOW-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4340; AVX2-SLOW-NEXT:    vzeroupper
4341; AVX2-SLOW-NEXT:    retq
4342;
4343; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
4344; AVX2-FAST-ALL:       # %bb.0:
4345; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
4346; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
4347; AVX2-FAST-ALL-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4348; AVX2-FAST-ALL-NEXT:    vzeroupper
4349; AVX2-FAST-ALL-NEXT:    retq
4350;
4351; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
4352; AVX2-FAST-PERLANE:       # %bb.0:
4353; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
4354; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4355; AVX2-FAST-PERLANE-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4356; AVX2-FAST-PERLANE-NEXT:    vzeroupper
4357; AVX2-FAST-PERLANE-NEXT:    retq
4358;
4359; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4360; AVX512:       # %bb.0:
4361; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4362; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4363; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4364; AVX512-NEXT:    vzeroupper
4365; AVX512-NEXT:    retq
4366  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4367  %2 = trunc <4 x i64> %1 to <4 x i32>
4368  ret <4 x i32> %2
4369}
4370
4371define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4372; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4373; SSE:       # %bb.0:
4374; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4375; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4376; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4377; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4378; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4379; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4380; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4381; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4382; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4383; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4384; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4385; SSE-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4386; SSE-NEXT:    retq
4387;
4388; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4389; AVX1:       # %bb.0:
4390; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4391; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4392; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4393; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4394; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4395; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4396; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4397; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4398; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4399; AVX1-NEXT:    vzeroupper
4400; AVX1-NEXT:    retq
4401;
4402; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
4403; AVX2:       # %bb.0:
4404; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4405; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4406; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4407; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4408; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4409; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4410; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4411; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4412; AVX2-NEXT:    vzeroupper
4413; AVX2-NEXT:    retq
4414;
4415; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4416; AVX512:       # %bb.0:
4417; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4418; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4419; AVX512-NEXT:    vzeroupper
4420; AVX512-NEXT:    retq
4421  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4422  %2 = trunc <8 x i64> %1 to <8 x i16>
4423  ret <8 x i16> %2
4424}
4425
4426define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4427; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4428; SSE:       # %bb.0:
4429; SSE-NEXT:    pslld $16, %xmm1
4430; SSE-NEXT:    psrad $16, %xmm1
4431; SSE-NEXT:    pslld $16, %xmm0
4432; SSE-NEXT:    psrad $16, %xmm0
4433; SSE-NEXT:    packssdw %xmm1, %xmm0
4434; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4435; SSE-NEXT:    retq
4436;
4437; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4438; AVX1:       # %bb.0:
4439; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4440; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
4441; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4442; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4443; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4444; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4445; AVX1-NEXT:    vzeroupper
4446; AVX1-NEXT:    retq
4447;
4448; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4449; AVX2:       # %bb.0:
4450; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4451; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4452; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4453; AVX2-NEXT:    vzeroupper
4454; AVX2-NEXT:    retq
4455;
4456; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4457; AVX512:       # %bb.0:
4458; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4459; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4460; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4461; AVX512-NEXT:    vzeroupper
4462; AVX512-NEXT:    retq
4463  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4464  %2 = trunc <8 x i32> %1 to <8 x i16>
4465  ret <8 x i16> %2
4466}
4467
4468define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4469; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4470; SSE:       # %bb.0:
4471; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4472; SSE-NEXT:    pand %xmm8, %xmm7
4473; SSE-NEXT:    pand %xmm8, %xmm6
4474; SSE-NEXT:    packuswb %xmm7, %xmm6
4475; SSE-NEXT:    pand %xmm8, %xmm5
4476; SSE-NEXT:    pand %xmm8, %xmm4
4477; SSE-NEXT:    packuswb %xmm5, %xmm4
4478; SSE-NEXT:    packuswb %xmm6, %xmm4
4479; SSE-NEXT:    pand %xmm8, %xmm3
4480; SSE-NEXT:    pand %xmm8, %xmm2
4481; SSE-NEXT:    packuswb %xmm3, %xmm2
4482; SSE-NEXT:    pand %xmm8, %xmm1
4483; SSE-NEXT:    pand %xmm8, %xmm0
4484; SSE-NEXT:    packuswb %xmm1, %xmm0
4485; SSE-NEXT:    packuswb %xmm2, %xmm0
4486; SSE-NEXT:    packuswb %xmm4, %xmm0
4487; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4488; SSE-NEXT:    retq
4489;
4490; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
4491; AVX1:       # %bb.0:
4492; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4493; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
4494; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
4495; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
4496; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
4497; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
4498; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
4499; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
4500; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
4501; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4502; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4503; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
4504; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4505; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
4506; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4507; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4508; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4509; AVX1-NEXT:    vzeroupper
4510; AVX1-NEXT:    retq
4511;
4512; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
4513; AVX2:       # %bb.0:
4514; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
4515; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
4516; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
4517; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
4518; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4519; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
4520; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
4521; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4522; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4523; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
4524; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4525; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4526; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4527; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4528; AVX2-NEXT:    vzeroupper
4529; AVX2-NEXT:    retq
4530;
4531; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
4532; AVX512:       # %bb.0:
4533; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
4534; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
4535; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4536; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4537; AVX512-NEXT:    vzeroupper
4538; AVX512-NEXT:    retq
4539  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4540  %2 = trunc <16 x i64> %1 to <16 x i8>
4541  ret <16 x i8> %2
4542}
4543
4544define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4545; SSE-LABEL: trunc_or_const_v16i32_v16i8:
4546; SSE:       # %bb.0:
4547; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4548; SSE-NEXT:    pand %xmm4, %xmm3
4549; SSE-NEXT:    pand %xmm4, %xmm2
4550; SSE-NEXT:    packuswb %xmm3, %xmm2
4551; SSE-NEXT:    pand %xmm4, %xmm1
4552; SSE-NEXT:    pand %xmm4, %xmm0
4553; SSE-NEXT:    packuswb %xmm1, %xmm0
4554; SSE-NEXT:    packuswb %xmm2, %xmm0
4555; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4556; SSE-NEXT:    retq
4557;
4558; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
4559; AVX1:       # %bb.0:
4560; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4561; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4562; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4563; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4564; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4565; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4566; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4567; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4568; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4569; AVX1-NEXT:    vzeroupper
4570; AVX1-NEXT:    retq
4571;
4572; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
4573; AVX2:       # %bb.0:
4574; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4575; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
4576; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
4577; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4578; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4579; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4580; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4581; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4582; AVX2-NEXT:    vzeroupper
4583; AVX2-NEXT:    retq
4584;
4585; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
4586; AVX512:       # %bb.0:
4587; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4588; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4589; AVX512-NEXT:    vzeroupper
4590; AVX512-NEXT:    retq
4591  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4592  %2 = trunc <16 x i32> %1 to <16 x i8>
4593  ret <16 x i8> %2
4594}
4595
4596define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4597; SSE-LABEL: trunc_or_const_v16i16_v16i8:
4598; SSE:       # %bb.0:
4599; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4600; SSE-NEXT:    pand %xmm2, %xmm1
4601; SSE-NEXT:    pand %xmm2, %xmm0
4602; SSE-NEXT:    packuswb %xmm1, %xmm0
4603; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4604; SSE-NEXT:    retq
4605;
4606; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
4607; AVX1:       # %bb.0:
4608; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4609; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4610; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4611; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4612; AVX1-NEXT:    vzeroupper
4613; AVX1-NEXT:    retq
4614;
4615; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
4616; AVX2:       # %bb.0:
4617; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4618; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4619; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4620; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4621; AVX2-NEXT:    vzeroupper
4622; AVX2-NEXT:    retq
4623;
4624; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
4625; AVX512F:       # %bb.0:
4626; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4627; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4628; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4629; AVX512F-NEXT:    vzeroupper
4630; AVX512F-NEXT:    retq
4631;
4632; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
4633; AVX512BW:       # %bb.0:
4634; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4635; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4636; AVX512BW-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4637; AVX512BW-NEXT:    vzeroupper
4638; AVX512BW-NEXT:    retq
4639;
4640; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
4641; AVX512DQ:       # %bb.0:
4642; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4643; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4644; AVX512DQ-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4645; AVX512DQ-NEXT:    vzeroupper
4646; AVX512DQ-NEXT:    retq
4647  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4648  %2 = trunc <16 x i16> %1 to <16 x i8>
4649  ret <16 x i8> %2
4650}
4651
4652;
4653; complex patterns - often created by the vectorizer
4654;
4655
4656define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4657; SSE-LABEL: mul_add_const_v4i64_v4i32:
4658; SSE:       # %bb.0:
4659; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4660; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
4661; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4662; SSE-NEXT:    pmuludq %xmm2, %xmm0
4663; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
4664; SSE-NEXT:    pmuludq %xmm3, %xmm1
4665; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4666; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4667; SSE-NEXT:    retq
4668;
4669; AVX-LABEL: mul_add_const_v4i64_v4i32:
4670; AVX:       # %bb.0:
4671; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
4672; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4673; AVX-NEXT:    retq
4674  %1 = sext <4 x i32> %a0 to <4 x i64>
4675  %2 = sext <4 x i32> %a1 to <4 x i64>
4676  %3 = mul <4 x i64> %1, %2
4677  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
4678  %5 = trunc <4 x i64> %4 to <4 x i32>
4679  ret <4 x i32> %5
4680}
4681
4682define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4683; SSE-LABEL: mul_add_self_v4i64_v4i32:
4684; SSE:       # %bb.0:
4685; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4686; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
4687; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4688; SSE-NEXT:    pmuludq %xmm2, %xmm0
4689; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
4690; SSE-NEXT:    pmuludq %xmm3, %xmm1
4691; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4692; SSE-NEXT:    paddd %xmm0, %xmm0
4693; SSE-NEXT:    retq
4694;
4695; AVX-LABEL: mul_add_self_v4i64_v4i32:
4696; AVX:       # %bb.0:
4697; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
4698; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
4699; AVX-NEXT:    retq
4700  %1 = sext <4 x i32> %a0 to <4 x i64>
4701  %2 = sext <4 x i32> %a1 to <4 x i64>
4702  %3 = mul <4 x i64> %1, %2
4703  %4 = add <4 x i64> %3, %3
4704  %5 = trunc <4 x i64> %4 to <4 x i32>
4705  ret <4 x i32> %5
4706}
4707
4708define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4709; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
4710; SSE:       # %bb.0:
4711; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4712; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
4713; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
4714; SSE-NEXT:    pmuludq %xmm2, %xmm4
4715; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
4716; SSE-NEXT:    pmuludq %xmm3, %xmm1
4717; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
4718; SSE-NEXT:    paddd %xmm4, %xmm0
4719; SSE-NEXT:    retq
4720;
4721; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
4722; AVX:       # %bb.0:
4723; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
4724; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
4725; AVX-NEXT:    retq
4726  %1 = sext <4 x i32> %a0 to <4 x i64>
4727  %2 = sext <4 x i32> %a1 to <4 x i64>
4728  %3 = mul <4 x i64> %1, %2
4729  %4 = add <4 x i64> %1, %3
4730  %5 = trunc <4 x i64> %4 to <4 x i32>
4731  ret <4 x i32> %5
4732}
4733