1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
12
13;
14; add
15;
16
17define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
18; SSE-LABEL: trunc_add_v4i64_v4i32:
19; SSE:       # %bb.0:
20; SSE-NEXT:    paddq %xmm3, %xmm1
21; SSE-NEXT:    paddq %xmm2, %xmm0
22; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
23; SSE-NEXT:    retq
24;
25; AVX1-LABEL: trunc_add_v4i64_v4i32:
26; AVX1:       # %bb.0:
27; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
28; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
29; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
30; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
31; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
32; AVX1-NEXT:    vzeroupper
33; AVX1-NEXT:    retq
34;
35; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
36; AVX2-SLOW:       # %bb.0:
37; AVX2-SLOW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
38; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
39; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
40; AVX2-SLOW-NEXT:    vzeroupper
41; AVX2-SLOW-NEXT:    retq
42;
43; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
44; AVX2-FAST-ALL:       # %bb.0:
45; AVX2-FAST-ALL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
46; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
47; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
48; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
49; AVX2-FAST-ALL-NEXT:    vzeroupper
50; AVX2-FAST-ALL-NEXT:    retq
51;
52; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
53; AVX2-FAST-PERLANE:       # %bb.0:
54; AVX2-FAST-PERLANE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
55; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
56; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
57; AVX2-FAST-PERLANE-NEXT:    vzeroupper
58; AVX2-FAST-PERLANE-NEXT:    retq
59;
60; AVX512-LABEL: trunc_add_v4i64_v4i32:
61; AVX512:       # %bb.0:
62; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
63; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
64; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
65; AVX512-NEXT:    vzeroupper
66; AVX512-NEXT:    retq
67  %1 = add <4 x i64> %a0, %a1
68  %2 = trunc <4 x i64> %1 to <4 x i32>
69  ret <4 x i32> %2
70}
71
72define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
73; SSE-LABEL: trunc_add_v8i64_v8i16:
74; SSE:       # %bb.0:
75; SSE-NEXT:    paddq %xmm6, %xmm2
76; SSE-NEXT:    paddq %xmm7, %xmm3
77; SSE-NEXT:    paddq %xmm4, %xmm0
78; SSE-NEXT:    paddq %xmm5, %xmm1
79; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
80; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
81; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
82; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
83; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
84; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
85; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
86; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
87; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
88; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
89; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
90; SSE-NEXT:    retq
91;
92; AVX1-LABEL: trunc_add_v8i64_v8i16:
93; AVX1:       # %bb.0:
94; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
95; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
96; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
97; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
98; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
99; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
100; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
101; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
102; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
103; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
104; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
105; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
106; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
107; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
108; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
109; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
110; AVX1-NEXT:    vzeroupper
111; AVX1-NEXT:    retq
112;
113; AVX2-LABEL: trunc_add_v8i64_v8i16:
114; AVX2:       # %bb.0:
115; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
116; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
117; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
118; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
119; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
120; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
121; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
122; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
123; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
124; AVX2-NEXT:    vzeroupper
125; AVX2-NEXT:    retq
126;
127; AVX512-LABEL: trunc_add_v8i64_v8i16:
128; AVX512:       # %bb.0:
129; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
130; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
131; AVX512-NEXT:    vzeroupper
132; AVX512-NEXT:    retq
133  %1 = add <8 x i64> %a0, %a1
134  %2 = trunc <8 x i64> %1 to <8 x i16>
135  ret <8 x i16> %2
136}
137
138define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
139; SSE-LABEL: trunc_add_v8i32_v8i16:
140; SSE:       # %bb.0:
141; SSE-NEXT:    paddd %xmm2, %xmm0
142; SSE-NEXT:    paddd %xmm3, %xmm1
143; SSE-NEXT:    pslld $16, %xmm1
144; SSE-NEXT:    psrad $16, %xmm1
145; SSE-NEXT:    pslld $16, %xmm0
146; SSE-NEXT:    psrad $16, %xmm0
147; SSE-NEXT:    packssdw %xmm1, %xmm0
148; SSE-NEXT:    retq
149;
150; AVX1-LABEL: trunc_add_v8i32_v8i16:
151; AVX1:       # %bb.0:
152; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
153; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
154; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
155; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
156; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
157; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
158; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
159; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
160; AVX1-NEXT:    vzeroupper
161; AVX1-NEXT:    retq
162;
163; AVX2-LABEL: trunc_add_v8i32_v8i16:
164; AVX2:       # %bb.0:
165; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
166; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
167; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
168; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
169; AVX2-NEXT:    vzeroupper
170; AVX2-NEXT:    retq
171;
172; AVX512-LABEL: trunc_add_v8i32_v8i16:
173; AVX512:       # %bb.0:
174; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
175; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
176; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
177; AVX512-NEXT:    vzeroupper
178; AVX512-NEXT:    retq
179  %1 = add <8 x i32> %a0, %a1
180  %2 = trunc <8 x i32> %1 to <8 x i16>
181  ret <8 x i16> %2
182}
183
184define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
185; SSE-LABEL: trunc_add_v16i64_v16i8:
186; SSE:       # %bb.0:
187; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
188; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
189; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
190; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
191; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
192; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
193; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
194; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
195; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
196; SSE-NEXT:    pand %xmm8, %xmm7
197; SSE-NEXT:    pand %xmm8, %xmm6
198; SSE-NEXT:    packuswb %xmm7, %xmm6
199; SSE-NEXT:    pand %xmm8, %xmm5
200; SSE-NEXT:    pand %xmm8, %xmm4
201; SSE-NEXT:    packuswb %xmm5, %xmm4
202; SSE-NEXT:    packuswb %xmm6, %xmm4
203; SSE-NEXT:    pand %xmm8, %xmm3
204; SSE-NEXT:    pand %xmm8, %xmm2
205; SSE-NEXT:    packuswb %xmm3, %xmm2
206; SSE-NEXT:    pand %xmm8, %xmm1
207; SSE-NEXT:    pand %xmm8, %xmm0
208; SSE-NEXT:    packuswb %xmm1, %xmm0
209; SSE-NEXT:    packuswb %xmm2, %xmm0
210; SSE-NEXT:    packuswb %xmm4, %xmm0
211; SSE-NEXT:    retq
212;
213; AVX1-LABEL: trunc_add_v16i64_v16i8:
214; AVX1:       # %bb.0:
215; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
216; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
217; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
218; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
219; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
220; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
221; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
222; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
223; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
224; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
225; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
226; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
227; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
228; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
229; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
230; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
231; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
232; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
233; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
234; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
235; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
236; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
237; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
238; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
239; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
240; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
241; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
242; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
243; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
244; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
245; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
246; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
247; AVX1-NEXT:    vzeroupper
248; AVX1-NEXT:    retq
249;
250; AVX2-LABEL: trunc_add_v16i64_v16i8:
251; AVX2:       # %bb.0:
252; AVX2-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
253; AVX2-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
254; AVX2-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
255; AVX2-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
256; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
257; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
258; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
259; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
260; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
261; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
262; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
263; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
264; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
265; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
266; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
267; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
268; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
269; AVX2-NEXT:    vzeroupper
270; AVX2-NEXT:    retq
271;
272; AVX512-LABEL: trunc_add_v16i64_v16i8:
273; AVX512:       # %bb.0:
274; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
275; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
276; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
277; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
278; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
279; AVX512-NEXT:    vzeroupper
280; AVX512-NEXT:    retq
281  %1 = add <16 x i64> %a0, %a1
282  %2 = trunc <16 x i64> %1 to <16 x i8>
283  ret <16 x i8> %2
284}
285
286define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
287; SSE-LABEL: trunc_add_v16i32_v16i8:
288; SSE:       # %bb.0:
289; SSE-NEXT:    paddd %xmm4, %xmm0
290; SSE-NEXT:    paddd %xmm5, %xmm1
291; SSE-NEXT:    paddd %xmm6, %xmm2
292; SSE-NEXT:    paddd %xmm7, %xmm3
293; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
294; SSE-NEXT:    pand %xmm4, %xmm3
295; SSE-NEXT:    pand %xmm4, %xmm2
296; SSE-NEXT:    packuswb %xmm3, %xmm2
297; SSE-NEXT:    pand %xmm4, %xmm1
298; SSE-NEXT:    pand %xmm4, %xmm0
299; SSE-NEXT:    packuswb %xmm1, %xmm0
300; SSE-NEXT:    packuswb %xmm2, %xmm0
301; SSE-NEXT:    retq
302;
303; AVX1-LABEL: trunc_add_v16i32_v16i8:
304; AVX1:       # %bb.0:
305; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
306; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
307; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
308; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
309; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
310; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
311; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
312; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
313; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
314; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
315; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
316; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
317; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
318; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
319; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
320; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
321; AVX1-NEXT:    vzeroupper
322; AVX1-NEXT:    retq
323;
324; AVX2-LABEL: trunc_add_v16i32_v16i8:
325; AVX2:       # %bb.0:
326; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
327; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
328; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
329; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
330; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
331; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
332; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
333; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
334; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
335; AVX2-NEXT:    vzeroupper
336; AVX2-NEXT:    retq
337;
338; AVX512-LABEL: trunc_add_v16i32_v16i8:
339; AVX512:       # %bb.0:
340; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
341; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
342; AVX512-NEXT:    vzeroupper
343; AVX512-NEXT:    retq
344  %1 = add <16 x i32> %a0, %a1
345  %2 = trunc <16 x i32> %1 to <16 x i8>
346  ret <16 x i8> %2
347}
348
349define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
350; SSE-LABEL: trunc_add_v16i16_v16i8:
351; SSE:       # %bb.0:
352; SSE-NEXT:    paddw %xmm2, %xmm0
353; SSE-NEXT:    paddw %xmm3, %xmm1
354; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
355; SSE-NEXT:    pand %xmm2, %xmm1
356; SSE-NEXT:    pand %xmm2, %xmm0
357; SSE-NEXT:    packuswb %xmm1, %xmm0
358; SSE-NEXT:    retq
359;
360; AVX1-LABEL: trunc_add_v16i16_v16i8:
361; AVX1:       # %bb.0:
362; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
363; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
364; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
365; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
366; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
367; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
368; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
369; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
370; AVX1-NEXT:    vzeroupper
371; AVX1-NEXT:    retq
372;
373; AVX2-LABEL: trunc_add_v16i16_v16i8:
374; AVX2:       # %bb.0:
375; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
376; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
377; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
378; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
379; AVX2-NEXT:    vzeroupper
380; AVX2-NEXT:    retq
381;
382; AVX512F-LABEL: trunc_add_v16i16_v16i8:
383; AVX512F:       # %bb.0:
384; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
385; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
386; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
387; AVX512F-NEXT:    vzeroupper
388; AVX512F-NEXT:    retq
389;
390; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
391; AVX512BW:       # %bb.0:
392; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
393; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
394; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
395; AVX512BW-NEXT:    vzeroupper
396; AVX512BW-NEXT:    retq
397;
398; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
399; AVX512DQ:       # %bb.0:
400; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
401; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
402; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
403; AVX512DQ-NEXT:    vzeroupper
404; AVX512DQ-NEXT:    retq
405  %1 = add <16 x i16> %a0, %a1
406  %2 = trunc <16 x i16> %1 to <16 x i8>
407  ret <16 x i8> %2
408}
409
410define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
411; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
412; SSE:       # %bb.0:
413; SSE-NEXT:    pslld $16, %xmm2
414; SSE-NEXT:    psrad $16, %xmm2
415; SSE-NEXT:    pslld $16, %xmm1
416; SSE-NEXT:    psrad $16, %xmm1
417; SSE-NEXT:    packssdw %xmm2, %xmm1
418; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
419; SSE-NEXT:    psraw $8, %xmm0
420; SSE-NEXT:    paddw %xmm1, %xmm0
421; SSE-NEXT:    retq
422;
423; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
424; AVX1:       # %bb.0:
425; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
426; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
427; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
428; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
429; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
430; AVX1-NEXT:    vzeroupper
431; AVX1-NEXT:    retq
432;
433; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
434; AVX2:       # %bb.0:
435; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
436; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
437; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
438; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
439; AVX2-NEXT:    vzeroupper
440; AVX2-NEXT:    retq
441;
442; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
443; AVX512:       # %bb.0:
444; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
445; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
446; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
447; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
448; AVX512-NEXT:    vzeroupper
449; AVX512-NEXT:    retq
450  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
451  %2 = sext <8 x i8> %1 to <8 x i32>
452  %3 = add <8 x i32> %2, %a1
453  %4 = trunc <8 x i32> %3 to <8 x i16>
454  ret <8 x i16> %4
455}
456
457;
458; add to constant
459;
460
461define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
462; SSE-LABEL: trunc_add_const_v4i64_v4i32:
463; SSE:       # %bb.0:
464; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
465; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
466; SSE-NEXT:    retq
467;
468; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
469; AVX1:       # %bb.0:
470; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
471; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
472; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
473; AVX1-NEXT:    vzeroupper
474; AVX1-NEXT:    retq
475;
476; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
477; AVX2-SLOW:       # %bb.0:
478; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
479; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
480; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
481; AVX2-SLOW-NEXT:    vzeroupper
482; AVX2-SLOW-NEXT:    retq
483;
484; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
485; AVX2-FAST-ALL:       # %bb.0:
486; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
487; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
488; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
489; AVX2-FAST-ALL-NEXT:    vzeroupper
490; AVX2-FAST-ALL-NEXT:    retq
491;
492; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
493; AVX2-FAST-PERLANE:       # %bb.0:
494; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
495; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
496; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
497; AVX2-FAST-PERLANE-NEXT:    vzeroupper
498; AVX2-FAST-PERLANE-NEXT:    retq
499;
500; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
501; AVX512:       # %bb.0:
502; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
503; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
504; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
505; AVX512-NEXT:    vzeroupper
506; AVX512-NEXT:    retq
507  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
508  %2 = trunc <4 x i64> %1 to <4 x i32>
509  ret <4 x i32> %2
510}
511
512define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
513; SSE-LABEL: trunc_add_const_v8i64_v8i16:
514; SSE:       # %bb.0:
515; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
516; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
517; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
518; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
519; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
520; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
521; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
522; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
523; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
524; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
525; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
526; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
527; SSE-NEXT:    retq
528;
529; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
530; AVX1:       # %bb.0:
531; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
532; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
533; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
534; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
535; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
536; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
537; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
538; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
539; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
540; AVX1-NEXT:    vzeroupper
541; AVX1-NEXT:    retq
542;
543; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
544; AVX2:       # %bb.0:
545; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
546; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
547; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
548; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
549; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
550; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
551; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
552; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
553; AVX2-NEXT:    vzeroupper
554; AVX2-NEXT:    retq
555;
556; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
557; AVX512:       # %bb.0:
558; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
559; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
560; AVX512-NEXT:    vzeroupper
561; AVX512-NEXT:    retq
562  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
563  %2 = trunc <8 x i64> %1 to <8 x i16>
564  ret <8 x i16> %2
565}
566
567define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
568; SSE-LABEL: trunc_add_const_v8i32_v8i16:
569; SSE:       # %bb.0:
570; SSE-NEXT:    pslld $16, %xmm1
571; SSE-NEXT:    psrad $16, %xmm1
572; SSE-NEXT:    pslld $16, %xmm0
573; SSE-NEXT:    psrad $16, %xmm0
574; SSE-NEXT:    packssdw %xmm1, %xmm0
575; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
576; SSE-NEXT:    retq
577;
578; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
579; AVX1:       # %bb.0:
580; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
581; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
582; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
583; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
584; AVX1-NEXT:    vzeroupper
585; AVX1-NEXT:    retq
586;
587; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
588; AVX2:       # %bb.0:
589; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
590; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
591; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
592; AVX2-NEXT:    vzeroupper
593; AVX2-NEXT:    retq
594;
595; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
596; AVX512:       # %bb.0:
597; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
598; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
599; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
600; AVX512-NEXT:    vzeroupper
601; AVX512-NEXT:    retq
602  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
603  %2 = trunc <8 x i32> %1 to <8 x i16>
604  ret <8 x i16> %2
605}
606
607define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
608; SSE-LABEL: trunc_add_const_v16i64_v16i8:
609; SSE:       # %bb.0:
610; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
611; SSE-NEXT:    pand %xmm8, %xmm7
612; SSE-NEXT:    pand %xmm8, %xmm6
613; SSE-NEXT:    packuswb %xmm7, %xmm6
614; SSE-NEXT:    pand %xmm8, %xmm5
615; SSE-NEXT:    pand %xmm8, %xmm4
616; SSE-NEXT:    packuswb %xmm5, %xmm4
617; SSE-NEXT:    packuswb %xmm6, %xmm4
618; SSE-NEXT:    pand %xmm8, %xmm3
619; SSE-NEXT:    pand %xmm8, %xmm2
620; SSE-NEXT:    packuswb %xmm3, %xmm2
621; SSE-NEXT:    pand %xmm8, %xmm1
622; SSE-NEXT:    pand %xmm8, %xmm0
623; SSE-NEXT:    packuswb %xmm1, %xmm0
624; SSE-NEXT:    packuswb %xmm2, %xmm0
625; SSE-NEXT:    packuswb %xmm4, %xmm0
626; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
627; SSE-NEXT:    retq
628;
629; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
630; AVX1:       # %bb.0:
631; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
632; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
633; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
634; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
635; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
636; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
637; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
638; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
639; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
640; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
641; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
642; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
643; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
644; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
645; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
646; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
647; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
648; AVX1-NEXT:    vzeroupper
649; AVX1-NEXT:    retq
650;
651; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
652; AVX2:       # %bb.0:
653; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
654; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
655; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
656; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
657; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
658; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
659; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
660; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
661; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
662; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
663; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
664; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
665; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
666; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
667; AVX2-NEXT:    vzeroupper
668; AVX2-NEXT:    retq
669;
670; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
671; AVX512:       # %bb.0:
672; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
673; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
674; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
675; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
676; AVX512-NEXT:    vzeroupper
677; AVX512-NEXT:    retq
678  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
679  %2 = trunc <16 x i64> %1 to <16 x i8>
680  ret <16 x i8> %2
681}
682
683define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
684; SSE-LABEL: trunc_add_const_v16i32_v16i8:
685; SSE:       # %bb.0:
686; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
687; SSE-NEXT:    pand %xmm4, %xmm3
688; SSE-NEXT:    pand %xmm4, %xmm2
689; SSE-NEXT:    packuswb %xmm3, %xmm2
690; SSE-NEXT:    pand %xmm4, %xmm1
691; SSE-NEXT:    pand %xmm4, %xmm0
692; SSE-NEXT:    packuswb %xmm1, %xmm0
693; SSE-NEXT:    packuswb %xmm2, %xmm0
694; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
695; SSE-NEXT:    retq
696;
697; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
698; AVX1:       # %bb.0:
699; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
700; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
701; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
702; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
703; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
704; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
705; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
706; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
707; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
708; AVX1-NEXT:    vzeroupper
709; AVX1-NEXT:    retq
710;
711; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
712; AVX2:       # %bb.0:
713; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
714; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
715; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
716; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
717; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
718; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
719; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
720; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
721; AVX2-NEXT:    vzeroupper
722; AVX2-NEXT:    retq
723;
724; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
725; AVX512:       # %bb.0:
726; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
727; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
728; AVX512-NEXT:    vzeroupper
729; AVX512-NEXT:    retq
730  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
731  %2 = trunc <16 x i32> %1 to <16 x i8>
732  ret <16 x i8> %2
733}
734
735define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
736; SSE-LABEL: trunc_add_const_v16i16_v16i8:
737; SSE:       # %bb.0:
738; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
739; SSE-NEXT:    pand %xmm2, %xmm1
740; SSE-NEXT:    pand %xmm2, %xmm0
741; SSE-NEXT:    packuswb %xmm1, %xmm0
742; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
743; SSE-NEXT:    retq
744;
745; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
746; AVX1:       # %bb.0:
747; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
748; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
749; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
750; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
751; AVX1-NEXT:    vzeroupper
752; AVX1-NEXT:    retq
753;
754; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
755; AVX2:       # %bb.0:
756; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
757; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
758; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
759; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
760; AVX2-NEXT:    vzeroupper
761; AVX2-NEXT:    retq
762;
763; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
764; AVX512F:       # %bb.0:
765; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
766; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
767; AVX512F-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
768; AVX512F-NEXT:    vzeroupper
769; AVX512F-NEXT:    retq
770;
771; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
772; AVX512BW:       # %bb.0:
773; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
774; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
775; AVX512BW-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
776; AVX512BW-NEXT:    vzeroupper
777; AVX512BW-NEXT:    retq
778;
779; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
780; AVX512DQ:       # %bb.0:
781; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
782; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
783; AVX512DQ-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
784; AVX512DQ-NEXT:    vzeroupper
785; AVX512DQ-NEXT:    retq
786  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
787  %2 = trunc <16 x i16> %1 to <16 x i8>
788  ret <16 x i8> %2
789}
790
791;
792; sub
793;
794
795define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
796; SSE-LABEL: trunc_sub_v4i64_v4i32:
797; SSE:       # %bb.0:
798; SSE-NEXT:    psubq %xmm3, %xmm1
799; SSE-NEXT:    psubq %xmm2, %xmm0
800; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
801; SSE-NEXT:    retq
802;
803; AVX1-LABEL: trunc_sub_v4i64_v4i32:
804; AVX1:       # %bb.0:
805; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
806; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
807; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
808; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
809; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
810; AVX1-NEXT:    vzeroupper
811; AVX1-NEXT:    retq
812;
813; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
814; AVX2-SLOW:       # %bb.0:
815; AVX2-SLOW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
816; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
817; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
818; AVX2-SLOW-NEXT:    vzeroupper
819; AVX2-SLOW-NEXT:    retq
820;
821; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
822; AVX2-FAST-ALL:       # %bb.0:
823; AVX2-FAST-ALL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
824; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
825; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
826; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
827; AVX2-FAST-ALL-NEXT:    vzeroupper
828; AVX2-FAST-ALL-NEXT:    retq
829;
830; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
831; AVX2-FAST-PERLANE:       # %bb.0:
832; AVX2-FAST-PERLANE-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
833; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
834; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
835; AVX2-FAST-PERLANE-NEXT:    vzeroupper
836; AVX2-FAST-PERLANE-NEXT:    retq
837;
838; AVX512-LABEL: trunc_sub_v4i64_v4i32:
839; AVX512:       # %bb.0:
840; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
841; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
842; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
843; AVX512-NEXT:    vzeroupper
844; AVX512-NEXT:    retq
845  %1 = sub <4 x i64> %a0, %a1
846  %2 = trunc <4 x i64> %1 to <4 x i32>
847  ret <4 x i32> %2
848}
849
850define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
851; SSE-LABEL: trunc_sub_v8i64_v8i16:
852; SSE:       # %bb.0:
853; SSE-NEXT:    psubq %xmm6, %xmm2
854; SSE-NEXT:    psubq %xmm7, %xmm3
855; SSE-NEXT:    psubq %xmm4, %xmm0
856; SSE-NEXT:    psubq %xmm5, %xmm1
857; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
858; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
859; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
860; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
861; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
862; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
863; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
864; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
865; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
866; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
867; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
868; SSE-NEXT:    retq
869;
870; AVX1-LABEL: trunc_sub_v8i64_v8i16:
871; AVX1:       # %bb.0:
872; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
873; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
874; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
875; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
876; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
877; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
878; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
879; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
880; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
881; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
882; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
883; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
884; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
885; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
886; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
887; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
888; AVX1-NEXT:    vzeroupper
889; AVX1-NEXT:    retq
890;
891; AVX2-LABEL: trunc_sub_v8i64_v8i16:
892; AVX2:       # %bb.0:
893; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
894; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
895; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
896; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
897; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
898; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
899; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
900; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
901; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
902; AVX2-NEXT:    vzeroupper
903; AVX2-NEXT:    retq
904;
905; AVX512-LABEL: trunc_sub_v8i64_v8i16:
906; AVX512:       # %bb.0:
907; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
908; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
909; AVX512-NEXT:    vzeroupper
910; AVX512-NEXT:    retq
911  %1 = sub <8 x i64> %a0, %a1
912  %2 = trunc <8 x i64> %1 to <8 x i16>
913  ret <8 x i16> %2
914}
915
916define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
917; SSE-LABEL: trunc_sub_v8i32_v8i16:
918; SSE:       # %bb.0:
919; SSE-NEXT:    psubd %xmm2, %xmm0
920; SSE-NEXT:    psubd %xmm3, %xmm1
921; SSE-NEXT:    pslld $16, %xmm1
922; SSE-NEXT:    psrad $16, %xmm1
923; SSE-NEXT:    pslld $16, %xmm0
924; SSE-NEXT:    psrad $16, %xmm0
925; SSE-NEXT:    packssdw %xmm1, %xmm0
926; SSE-NEXT:    retq
927;
928; AVX1-LABEL: trunc_sub_v8i32_v8i16:
929; AVX1:       # %bb.0:
930; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
931; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
932; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
933; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
934; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
935; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
936; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
937; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
938; AVX1-NEXT:    vzeroupper
939; AVX1-NEXT:    retq
940;
941; AVX2-LABEL: trunc_sub_v8i32_v8i16:
942; AVX2:       # %bb.0:
943; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
944; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
945; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
946; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
947; AVX2-NEXT:    vzeroupper
948; AVX2-NEXT:    retq
949;
950; AVX512-LABEL: trunc_sub_v8i32_v8i16:
951; AVX512:       # %bb.0:
952; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
953; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
954; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
955; AVX512-NEXT:    vzeroupper
956; AVX512-NEXT:    retq
957  %1 = sub <8 x i32> %a0, %a1
958  %2 = trunc <8 x i32> %1 to <8 x i16>
959  ret <8 x i16> %2
960}
961
962define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
963; SSE-LABEL: trunc_sub_v16i64_v16i8:
964; SSE:       # %bb.0:
965; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
966; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
967; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
968; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
969; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
970; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
971; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
972; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
973; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
974; SSE-NEXT:    pand %xmm8, %xmm7
975; SSE-NEXT:    pand %xmm8, %xmm6
976; SSE-NEXT:    packuswb %xmm7, %xmm6
977; SSE-NEXT:    pand %xmm8, %xmm5
978; SSE-NEXT:    pand %xmm8, %xmm4
979; SSE-NEXT:    packuswb %xmm5, %xmm4
980; SSE-NEXT:    packuswb %xmm6, %xmm4
981; SSE-NEXT:    pand %xmm8, %xmm3
982; SSE-NEXT:    pand %xmm8, %xmm2
983; SSE-NEXT:    packuswb %xmm3, %xmm2
984; SSE-NEXT:    pand %xmm8, %xmm1
985; SSE-NEXT:    pand %xmm8, %xmm0
986; SSE-NEXT:    packuswb %xmm1, %xmm0
987; SSE-NEXT:    packuswb %xmm2, %xmm0
988; SSE-NEXT:    packuswb %xmm4, %xmm0
989; SSE-NEXT:    retq
990;
991; AVX1-LABEL: trunc_sub_v16i64_v16i8:
992; AVX1:       # %bb.0:
993; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
994; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
995; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
996; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
997; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
998; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
999; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1000; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
1001; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
1002; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
1003; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1004; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
1005; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
1006; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
1007; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1008; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
1009; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
1010; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
1011; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
1012; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
1013; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1014; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1015; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
1016; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1017; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
1018; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
1019; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
1020; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
1021; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
1022; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
1023; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1024; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1025; AVX1-NEXT:    vzeroupper
1026; AVX1-NEXT:    retq
1027;
1028; AVX2-LABEL: trunc_sub_v16i64_v16i8:
1029; AVX2:       # %bb.0:
1030; AVX2-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
1031; AVX2-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
1032; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
1033; AVX2-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
1034; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1035; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1036; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1037; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1038; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1039; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1040; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1041; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1042; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1043; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1044; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1045; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1046; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1047; AVX2-NEXT:    vzeroupper
1048; AVX2-NEXT:    retq
1049;
1050; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1051; AVX512:       # %bb.0:
1052; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
1053; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
1054; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
1055; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
1056; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1057; AVX512-NEXT:    vzeroupper
1058; AVX512-NEXT:    retq
1059  %1 = sub <16 x i64> %a0, %a1
1060  %2 = trunc <16 x i64> %1 to <16 x i8>
1061  ret <16 x i8> %2
1062}
1063
1064define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1065; SSE-LABEL: trunc_sub_v16i32_v16i8:
1066; SSE:       # %bb.0:
1067; SSE-NEXT:    psubd %xmm4, %xmm0
1068; SSE-NEXT:    psubd %xmm5, %xmm1
1069; SSE-NEXT:    psubd %xmm6, %xmm2
1070; SSE-NEXT:    psubd %xmm7, %xmm3
1071; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1072; SSE-NEXT:    pand %xmm4, %xmm3
1073; SSE-NEXT:    pand %xmm4, %xmm2
1074; SSE-NEXT:    packuswb %xmm3, %xmm2
1075; SSE-NEXT:    pand %xmm4, %xmm1
1076; SSE-NEXT:    pand %xmm4, %xmm0
1077; SSE-NEXT:    packuswb %xmm1, %xmm0
1078; SSE-NEXT:    packuswb %xmm2, %xmm0
1079; SSE-NEXT:    retq
1080;
1081; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1082; AVX1:       # %bb.0:
1083; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
1084; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1085; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1086; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
1087; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
1088; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1089; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1090; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
1091; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1092; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1093; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1094; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
1095; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1096; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1097; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1098; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1099; AVX1-NEXT:    vzeroupper
1100; AVX1-NEXT:    retq
1101;
1102; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1103; AVX2:       # %bb.0:
1104; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
1105; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
1106; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1107; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1108; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1109; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1110; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1111; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1112; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1113; AVX2-NEXT:    vzeroupper
1114; AVX2-NEXT:    retq
1115;
1116; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1117; AVX512:       # %bb.0:
1118; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
1119; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1120; AVX512-NEXT:    vzeroupper
1121; AVX512-NEXT:    retq
1122  %1 = sub <16 x i32> %a0, %a1
1123  %2 = trunc <16 x i32> %1 to <16 x i8>
1124  ret <16 x i8> %2
1125}
1126
1127define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1128; SSE-LABEL: trunc_sub_v16i16_v16i8:
1129; SSE:       # %bb.0:
1130; SSE-NEXT:    psubw %xmm2, %xmm0
1131; SSE-NEXT:    psubw %xmm3, %xmm1
1132; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1133; SSE-NEXT:    pand %xmm2, %xmm1
1134; SSE-NEXT:    pand %xmm2, %xmm0
1135; SSE-NEXT:    packuswb %xmm1, %xmm0
1136; SSE-NEXT:    retq
1137;
1138; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1139; AVX1:       # %bb.0:
1140; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
1141; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1142; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1143; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
1144; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1145; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1146; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
1147; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
1148; AVX1-NEXT:    vzeroupper
1149; AVX1-NEXT:    retq
1150;
1151; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1152; AVX2:       # %bb.0:
1153; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1154; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1155; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1156; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1157; AVX2-NEXT:    vzeroupper
1158; AVX2-NEXT:    retq
1159;
1160; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1161; AVX512F:       # %bb.0:
1162; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1163; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1164; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1165; AVX512F-NEXT:    vzeroupper
1166; AVX512F-NEXT:    retq
1167;
1168; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1169; AVX512BW:       # %bb.0:
1170; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1171; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1172; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1173; AVX512BW-NEXT:    vzeroupper
1174; AVX512BW-NEXT:    retq
1175;
1176; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1177; AVX512DQ:       # %bb.0:
1178; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1179; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1180; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1181; AVX512DQ-NEXT:    vzeroupper
1182; AVX512DQ-NEXT:    retq
1183  %1 = sub <16 x i16> %a0, %a1
1184  %2 = trunc <16 x i16> %1 to <16 x i8>
1185  ret <16 x i8> %2
1186}
1187
1188define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1189; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1190; SSE:       # %bb.0:
1191; SSE-NEXT:    psubb %xmm1, %xmm0
1192; SSE-NEXT:    retq
1193;
1194; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1195; AVX:       # %bb.0:
1196; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1197; AVX-NEXT:    retq
1198  %a = zext <16 x i8> %x to <16 x i16>
1199  %b = zext <16 x i8> %y to <16 x i16>
1200  %c = sub <16 x i16> %a, %b
1201  %d = trunc <16 x i16> %c to <16 x i8>
1202  ret <16 x i8> %d
1203}
1204
1205;
1206; sub to constant
1207;
1208
1209define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1210; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1211; SSE:       # %bb.0:
1212; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1213; SSE-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1214; SSE-NEXT:    retq
1215;
1216; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1217; AVX1:       # %bb.0:
1218; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1219; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1220; AVX1-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1221; AVX1-NEXT:    vzeroupper
1222; AVX1-NEXT:    retq
1223;
1224; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1225; AVX2-SLOW:       # %bb.0:
1226; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
1227; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1228; AVX2-SLOW-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1229; AVX2-SLOW-NEXT:    vzeroupper
1230; AVX2-SLOW-NEXT:    retq
1231;
1232; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
1233; AVX2-FAST-ALL:       # %bb.0:
1234; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
1235; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
1236; AVX2-FAST-ALL-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1237; AVX2-FAST-ALL-NEXT:    vzeroupper
1238; AVX2-FAST-ALL-NEXT:    retq
1239;
1240; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
1241; AVX2-FAST-PERLANE:       # %bb.0:
1242; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
1243; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1244; AVX2-FAST-PERLANE-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1245; AVX2-FAST-PERLANE-NEXT:    vzeroupper
1246; AVX2-FAST-PERLANE-NEXT:    retq
1247;
1248; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1249; AVX512:       # %bb.0:
1250; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1251; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1252; AVX512-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1253; AVX512-NEXT:    vzeroupper
1254; AVX512-NEXT:    retq
1255  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1256  %2 = trunc <4 x i64> %1 to <4 x i32>
1257  ret <4 x i32> %2
1258}
1259
1260define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1261; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1262; SSE:       # %bb.0:
1263; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1264; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1265; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1266; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1267; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1268; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1269; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1270; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1271; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1272; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1273; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1274; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1275; SSE-NEXT:    retq
1276;
1277; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1278; AVX1:       # %bb.0:
1279; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
1280; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1281; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1282; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1283; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1284; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1285; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1286; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1287; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1288; AVX1-NEXT:    vzeroupper
1289; AVX1-NEXT:    retq
1290;
1291; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
1292; AVX2:       # %bb.0:
1293; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1294; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
1295; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
1296; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1297; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1298; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1299; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1300; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1301; AVX2-NEXT:    vzeroupper
1302; AVX2-NEXT:    retq
1303;
1304; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1305; AVX512:       # %bb.0:
1306; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
1307; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1308; AVX512-NEXT:    vzeroupper
1309; AVX512-NEXT:    retq
1310  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1311  %2 = trunc <8 x i64> %1 to <8 x i16>
1312  ret <8 x i16> %2
1313}
1314
1315define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1316; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1317; SSE:       # %bb.0:
1318; SSE-NEXT:    pslld $16, %xmm1
1319; SSE-NEXT:    psrad $16, %xmm1
1320; SSE-NEXT:    pslld $16, %xmm0
1321; SSE-NEXT:    psrad $16, %xmm0
1322; SSE-NEXT:    packssdw %xmm1, %xmm0
1323; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1324; SSE-NEXT:    retq
1325;
1326; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1327; AVX1:       # %bb.0:
1328; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1329; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1330; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1331; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1332; AVX1-NEXT:    vzeroupper
1333; AVX1-NEXT:    retq
1334;
1335; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1336; AVX2:       # %bb.0:
1337; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1338; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1339; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1340; AVX2-NEXT:    vzeroupper
1341; AVX2-NEXT:    retq
1342;
1343; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1344; AVX512:       # %bb.0:
1345; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1346; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1347; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1348; AVX512-NEXT:    vzeroupper
1349; AVX512-NEXT:    retq
1350  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1351  %2 = trunc <8 x i32> %1 to <8 x i16>
1352  ret <8 x i16> %2
1353}
1354
1355define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1356; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1357; SSE:       # %bb.0:
1358; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1359; SSE-NEXT:    pand %xmm8, %xmm7
1360; SSE-NEXT:    pand %xmm8, %xmm6
1361; SSE-NEXT:    packuswb %xmm7, %xmm6
1362; SSE-NEXT:    pand %xmm8, %xmm5
1363; SSE-NEXT:    pand %xmm8, %xmm4
1364; SSE-NEXT:    packuswb %xmm5, %xmm4
1365; SSE-NEXT:    packuswb %xmm6, %xmm4
1366; SSE-NEXT:    pand %xmm8, %xmm3
1367; SSE-NEXT:    pand %xmm8, %xmm2
1368; SSE-NEXT:    packuswb %xmm3, %xmm2
1369; SSE-NEXT:    pand %xmm8, %xmm1
1370; SSE-NEXT:    pand %xmm8, %xmm0
1371; SSE-NEXT:    packuswb %xmm1, %xmm0
1372; SSE-NEXT:    packuswb %xmm2, %xmm0
1373; SSE-NEXT:    packuswb %xmm4, %xmm0
1374; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1375; SSE-NEXT:    retq
1376;
1377; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1378; AVX1:       # %bb.0:
1379; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
1380; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1381; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1382; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1383; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1384; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1385; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1386; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1387; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1388; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1389; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1390; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1391; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1392; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1393; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1394; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1395; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1396; AVX1-NEXT:    vzeroupper
1397; AVX1-NEXT:    retq
1398;
1399; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
1400; AVX2:       # %bb.0:
1401; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1402; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1403; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1404; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1405; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1406; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1407; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1408; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1409; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1410; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1411; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1412; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1413; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1414; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1415; AVX2-NEXT:    vzeroupper
1416; AVX2-NEXT:    retq
1417;
1418; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1419; AVX512:       # %bb.0:
1420; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
1421; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
1422; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1423; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1424; AVX512-NEXT:    vzeroupper
1425; AVX512-NEXT:    retq
1426  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1427  %2 = trunc <16 x i64> %1 to <16 x i8>
1428  ret <16 x i8> %2
1429}
1430
1431define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1432; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1433; SSE:       # %bb.0:
1434; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1435; SSE-NEXT:    pand %xmm4, %xmm3
1436; SSE-NEXT:    pand %xmm4, %xmm2
1437; SSE-NEXT:    packuswb %xmm3, %xmm2
1438; SSE-NEXT:    pand %xmm4, %xmm1
1439; SSE-NEXT:    pand %xmm4, %xmm0
1440; SSE-NEXT:    packuswb %xmm1, %xmm0
1441; SSE-NEXT:    packuswb %xmm2, %xmm0
1442; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1443; SSE-NEXT:    retq
1444;
1445; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1446; AVX1:       # %bb.0:
1447; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1448; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1449; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1450; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1451; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1452; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1453; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1454; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1455; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1456; AVX1-NEXT:    vzeroupper
1457; AVX1-NEXT:    retq
1458;
1459; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1460; AVX2:       # %bb.0:
1461; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1462; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1463; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1464; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1465; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1466; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1467; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1468; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1469; AVX2-NEXT:    vzeroupper
1470; AVX2-NEXT:    retq
1471;
1472; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1473; AVX512:       # %bb.0:
1474; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1475; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1476; AVX512-NEXT:    vzeroupper
1477; AVX512-NEXT:    retq
1478  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1479  %2 = trunc <16 x i32> %1 to <16 x i8>
1480  ret <16 x i8> %2
1481}
1482
1483define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1484; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1485; SSE:       # %bb.0:
1486; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1487; SSE-NEXT:    pand %xmm2, %xmm1
1488; SSE-NEXT:    pand %xmm2, %xmm0
1489; SSE-NEXT:    packuswb %xmm1, %xmm0
1490; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1491; SSE-NEXT:    retq
1492;
1493; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1494; AVX1:       # %bb.0:
1495; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1496; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1497; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1498; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1499; AVX1-NEXT:    vzeroupper
1500; AVX1-NEXT:    retq
1501;
1502; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1503; AVX2:       # %bb.0:
1504; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1505; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1506; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1507; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1508; AVX2-NEXT:    vzeroupper
1509; AVX2-NEXT:    retq
1510;
1511; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1512; AVX512F:       # %bb.0:
1513; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1514; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1515; AVX512F-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1516; AVX512F-NEXT:    vzeroupper
1517; AVX512F-NEXT:    retq
1518;
1519; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1520; AVX512BW:       # %bb.0:
1521; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1522; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1523; AVX512BW-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1524; AVX512BW-NEXT:    vzeroupper
1525; AVX512BW-NEXT:    retq
1526;
1527; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1528; AVX512DQ:       # %bb.0:
1529; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1530; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1531; AVX512DQ-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1532; AVX512DQ-NEXT:    vzeroupper
1533; AVX512DQ-NEXT:    retq
1534  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1535  %2 = trunc <16 x i16> %1 to <16 x i8>
1536  ret <16 x i8> %2
1537}
1538
1539define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1540; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1541; SSE:       # %bb.0:
1542; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1543; SSE-NEXT:    retq
1544;
1545; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1546; AVX:       # %bb.0:
1547; AVX-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1548; AVX-NEXT:    retq
1549  %a = zext <16 x i8> %x to <16 x i16>
1550  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1551  %c = trunc <16 x i16> %b to <16 x i8>
1552  ret <16 x i8> %c
1553}
1554
1555define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1556; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1557; SSE:       # %bb.0:
1558; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1559; SSE-NEXT:    psubb %xmm0, %xmm1
1560; SSE-NEXT:    movdqa %xmm1, %xmm0
1561; SSE-NEXT:    retq
1562;
1563; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1564; AVX:       # %bb.0:
1565; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1566; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
1567; AVX-NEXT:    retq
1568  %a = zext <16 x i8> %x to <16 x i16>
1569  %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1570  %c = trunc <16 x i16> %b to <16 x i8>
1571  ret <16 x i8> %c
1572}
1573
1574;
1575; mul
1576;
1577
1578define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1579; SSE-LABEL: trunc_mul_v4i64_v4i32:
1580; SSE:       # %bb.0:
1581; SSE-NEXT:    pmuludq %xmm3, %xmm1
1582; SSE-NEXT:    pmuludq %xmm2, %xmm0
1583; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1584; SSE-NEXT:    retq
1585;
1586; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1587; AVX1:       # %bb.0:
1588; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1589; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1590; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1591; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1592; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1593; AVX1-NEXT:    vzeroupper
1594; AVX1-NEXT:    retq
1595;
1596; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1597; AVX2-SLOW:       # %bb.0:
1598; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1599; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1600; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
1601; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1602; AVX2-SLOW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1603; AVX2-SLOW-NEXT:    vzeroupper
1604; AVX2-SLOW-NEXT:    retq
1605;
1606; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
1607; AVX2-FAST-ALL:       # %bb.0:
1608; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1609; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
1610; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm2, %ymm0
1611; AVX2-FAST-ALL-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1612; AVX2-FAST-ALL-NEXT:    vzeroupper
1613; AVX2-FAST-ALL-NEXT:    retq
1614;
1615; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
1616; AVX2-FAST-PERLANE:       # %bb.0:
1617; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
1618; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1619; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm2
1620; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1621; AVX2-FAST-PERLANE-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1622; AVX2-FAST-PERLANE-NEXT:    vzeroupper
1623; AVX2-FAST-PERLANE-NEXT:    retq
1624;
1625; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1626; AVX512F:       # %bb.0:
1627; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1628; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1629; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
1630; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1631; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1632; AVX512F-NEXT:    vzeroupper
1633; AVX512F-NEXT:    retq
1634;
1635; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1636; AVX512BW:       # %bb.0:
1637; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1638; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1639; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
1640; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1641; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1642; AVX512BW-NEXT:    vzeroupper
1643; AVX512BW-NEXT:    retq
1644;
1645; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1646; AVX512DQ:       # %bb.0:
1647; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1648; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1649; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1650; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
1651; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1652; AVX512DQ-NEXT:    vzeroupper
1653; AVX512DQ-NEXT:    retq
1654  %1 = mul <4 x i64> %a0, %a1
1655  %2 = trunc <4 x i64> %1 to <4 x i32>
1656  ret <4 x i32> %2
1657}
1658
1659define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1660; SSE-LABEL: trunc_mul_v8i64_v8i16:
1661; SSE:       # %bb.0:
1662; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1663; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1664; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1665; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1666; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1667; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1668; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
1669; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1670; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1671; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1672; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
1673; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1674; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1675; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1676; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1677; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1678; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1679; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1680; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1681; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1682; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1683; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1684; SSE-NEXT:    pmullw %xmm6, %xmm0
1685; SSE-NEXT:    retq
1686;
1687; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1688; AVX1:       # %bb.0:
1689; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
1690; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1691; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1692; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1693; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1694; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1695; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1696; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1697; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1698; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1699; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1700; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1701; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1702; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1703; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1704; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1705; AVX1-NEXT:    vzeroupper
1706; AVX1-NEXT:    retq
1707;
1708; AVX2-LABEL: trunc_mul_v8i64_v8i16:
1709; AVX2:       # %bb.0:
1710; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1711; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15]
1712; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15]
1713; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1714; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
1715; AVX2-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1716; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15]
1717; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15]
1718; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1719; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1720; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1721; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1722; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1723; AVX2-NEXT:    vzeroupper
1724; AVX2-NEXT:    retq
1725;
1726; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1727; AVX512F:       # %bb.0:
1728; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
1729; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
1730; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1731; AVX512F-NEXT:    vzeroupper
1732; AVX512F-NEXT:    retq
1733;
1734; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1735; AVX512BW:       # %bb.0:
1736; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
1737; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
1738; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1739; AVX512BW-NEXT:    vzeroupper
1740; AVX512BW-NEXT:    retq
1741;
1742; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1743; AVX512DQ:       # %bb.0:
1744; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1745; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
1746; AVX512DQ-NEXT:    vzeroupper
1747; AVX512DQ-NEXT:    retq
1748  %1 = mul <8 x i64> %a0, %a1
1749  %2 = trunc <8 x i64> %1 to <8 x i16>
1750  ret <8 x i16> %2
1751}
1752
1753define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1754; SSE-LABEL: trunc_mul_v8i32_v8i16:
1755; SSE:       # %bb.0:
1756; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1757; SSE-NEXT:    pmuludq %xmm2, %xmm0
1758; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1759; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1760; SSE-NEXT:    pmuludq %xmm4, %xmm2
1761; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1762; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1763; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1764; SSE-NEXT:    pmuludq %xmm3, %xmm1
1765; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1766; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1767; SSE-NEXT:    pmuludq %xmm2, %xmm3
1768; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1769; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1770; SSE-NEXT:    pslld $16, %xmm1
1771; SSE-NEXT:    psrad $16, %xmm1
1772; SSE-NEXT:    pslld $16, %xmm0
1773; SSE-NEXT:    psrad $16, %xmm0
1774; SSE-NEXT:    packssdw %xmm1, %xmm0
1775; SSE-NEXT:    retq
1776;
1777; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1778; AVX1:       # %bb.0:
1779; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
1780; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1781; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1782; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1783; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1784; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1785; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
1786; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
1787; AVX1-NEXT:    vzeroupper
1788; AVX1-NEXT:    retq
1789;
1790; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1791; AVX2:       # %bb.0:
1792; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1793; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1794; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1795; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1796; AVX2-NEXT:    vzeroupper
1797; AVX2-NEXT:    retq
1798;
1799; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1800; AVX512:       # %bb.0:
1801; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1802; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1803; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1804; AVX512-NEXT:    vzeroupper
1805; AVX512-NEXT:    retq
1806  %1 = mul <8 x i32> %a0, %a1
1807  %2 = trunc <8 x i32> %1 to <8 x i16>
1808  ret <8 x i16> %2
1809}
1810
1811define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1812; SSE-LABEL: trunc_mul_v16i64_v16i8:
1813; SSE:       # %bb.0:
1814; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm0
1815; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm1
1816; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm2
1817; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm3
1818; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm4
1819; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm5
1820; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm6
1821; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm7
1822; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1823; SSE-NEXT:    pand %xmm8, %xmm7
1824; SSE-NEXT:    pand %xmm8, %xmm6
1825; SSE-NEXT:    packuswb %xmm7, %xmm6
1826; SSE-NEXT:    pand %xmm8, %xmm5
1827; SSE-NEXT:    pand %xmm8, %xmm4
1828; SSE-NEXT:    packuswb %xmm5, %xmm4
1829; SSE-NEXT:    packuswb %xmm6, %xmm4
1830; SSE-NEXT:    pand %xmm8, %xmm3
1831; SSE-NEXT:    pand %xmm8, %xmm2
1832; SSE-NEXT:    packuswb %xmm3, %xmm2
1833; SSE-NEXT:    pand %xmm8, %xmm1
1834; SSE-NEXT:    pand %xmm8, %xmm0
1835; SSE-NEXT:    packuswb %xmm1, %xmm0
1836; SSE-NEXT:    packuswb %xmm2, %xmm0
1837; SSE-NEXT:    packuswb %xmm4, %xmm0
1838; SSE-NEXT:    retq
1839;
1840; AVX1-LABEL: trunc_mul_v16i64_v16i8:
1841; AVX1:       # %bb.0:
1842; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm8
1843; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
1844; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1845; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm0
1846; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
1847; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
1848; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1849; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
1850; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm5
1851; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
1852; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1853; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
1854; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm6
1855; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
1856; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1857; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm3
1858; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
1859; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
1860; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
1861; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
1862; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1863; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1864; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
1865; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1866; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
1867; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
1868; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
1869; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
1870; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
1871; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
1872; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1873; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1874; AVX1-NEXT:    vzeroupper
1875; AVX1-NEXT:    retq
1876;
1877; AVX2-LABEL: trunc_mul_v16i64_v16i8:
1878; AVX2:       # %bb.0:
1879; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm0
1880; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm1
1881; AVX2-NEXT:    vpmuludq %ymm6, %ymm2, %ymm2
1882; AVX2-NEXT:    vpmuludq %ymm7, %ymm3, %ymm3
1883; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
1884; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1885; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1886; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
1887; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1888; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1889; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
1890; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1891; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1892; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
1893; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1894; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1895; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1896; AVX2-NEXT:    vzeroupper
1897; AVX2-NEXT:    retq
1898;
1899; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
1900; AVX512F:       # %bb.0:
1901; AVX512F-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
1902; AVX512F-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
1903; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
1904; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
1905; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1906; AVX512F-NEXT:    vzeroupper
1907; AVX512F-NEXT:    retq
1908;
1909; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
1910; AVX512BW:       # %bb.0:
1911; AVX512BW-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
1912; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
1913; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
1914; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
1915; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1916; AVX512BW-NEXT:    vzeroupper
1917; AVX512BW-NEXT:    retq
1918;
1919; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
1920; AVX512DQ:       # %bb.0:
1921; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
1922; AVX512DQ-NEXT:    vpmullq %zmm3, %zmm1, %zmm1
1923; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
1924; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
1925; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1926; AVX512DQ-NEXT:    vzeroupper
1927; AVX512DQ-NEXT:    retq
1928  %1 = mul <16 x i64> %a0, %a1
1929  %2 = trunc <16 x i64> %1 to <16 x i8>
1930  ret <16 x i8> %2
1931}
1932
1933define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1934; SSE-LABEL: trunc_mul_v16i32_v16i8:
1935; SSE:       # %bb.0:
1936; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
1937; SSE-NEXT:    pmuludq %xmm4, %xmm0
1938; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1939; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1940; SSE-NEXT:    pmuludq %xmm8, %xmm4
1941; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1942; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
1943; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1944; SSE-NEXT:    pmuludq %xmm5, %xmm1
1945; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1946; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1947; SSE-NEXT:    pmuludq %xmm4, %xmm5
1948; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1949; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1950; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
1951; SSE-NEXT:    pmuludq %xmm6, %xmm2
1952; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1953; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
1954; SSE-NEXT:    pmuludq %xmm4, %xmm5
1955; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1956; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1957; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1958; SSE-NEXT:    pmuludq %xmm7, %xmm3
1959; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1960; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1961; SSE-NEXT:    pmuludq %xmm4, %xmm5
1962; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
1963; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1964; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1965; SSE-NEXT:    pand %xmm4, %xmm3
1966; SSE-NEXT:    pand %xmm4, %xmm2
1967; SSE-NEXT:    packuswb %xmm3, %xmm2
1968; SSE-NEXT:    pand %xmm4, %xmm1
1969; SSE-NEXT:    pand %xmm4, %xmm0
1970; SSE-NEXT:    packuswb %xmm1, %xmm0
1971; SSE-NEXT:    packuswb %xmm2, %xmm0
1972; SSE-NEXT:    retq
1973;
1974; AVX1-LABEL: trunc_mul_v16i32_v16i8:
1975; AVX1:       # %bb.0:
1976; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
1977; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1978; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1979; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
1980; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
1981; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1982; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1983; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
1984; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1985; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1986; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1987; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
1988; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1989; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1990; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1991; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1992; AVX1-NEXT:    vzeroupper
1993; AVX1-NEXT:    retq
1994;
1995; AVX2-LABEL: trunc_mul_v16i32_v16i8:
1996; AVX2:       # %bb.0:
1997; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
1998; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
1999; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2000; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2001; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2002; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2003; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2004; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2005; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2006; AVX2-NEXT:    vzeroupper
2007; AVX2-NEXT:    retq
2008;
2009; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2010; AVX512:       # %bb.0:
2011; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
2012; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2013; AVX512-NEXT:    vzeroupper
2014; AVX512-NEXT:    retq
2015  %1 = mul <16 x i32> %a0, %a1
2016  %2 = trunc <16 x i32> %1 to <16 x i8>
2017  ret <16 x i8> %2
2018}
2019
2020define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2021; SSE-LABEL: trunc_mul_v16i16_v16i8:
2022; SSE:       # %bb.0:
2023; SSE-NEXT:    pmullw %xmm2, %xmm0
2024; SSE-NEXT:    pmullw %xmm3, %xmm1
2025; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2026; SSE-NEXT:    pand %xmm2, %xmm1
2027; SSE-NEXT:    pand %xmm2, %xmm0
2028; SSE-NEXT:    packuswb %xmm1, %xmm0
2029; SSE-NEXT:    retq
2030;
2031; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2032; AVX1:       # %bb.0:
2033; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2034; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2035; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2036; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2037; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2038; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2039; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
2040; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2041; AVX1-NEXT:    vzeroupper
2042; AVX1-NEXT:    retq
2043;
2044; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2045; AVX2:       # %bb.0:
2046; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2047; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2048; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2049; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2050; AVX2-NEXT:    vzeroupper
2051; AVX2-NEXT:    retq
2052;
2053; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2054; AVX512F:       # %bb.0:
2055; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2056; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2057; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2058; AVX512F-NEXT:    vzeroupper
2059; AVX512F-NEXT:    retq
2060;
2061; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2062; AVX512BW:       # %bb.0:
2063; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2064; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2065; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2066; AVX512BW-NEXT:    vzeroupper
2067; AVX512BW-NEXT:    retq
2068;
2069; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2070; AVX512DQ:       # %bb.0:
2071; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2072; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2073; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2074; AVX512DQ-NEXT:    vzeroupper
2075; AVX512DQ-NEXT:    retq
2076  %1 = mul <16 x i16> %a0, %a1
2077  %2 = trunc <16 x i16> %1 to <16 x i8>
2078  ret <16 x i8> %2
2079}
2080
2081define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2082; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2083; SSE:       # %bb.0:
2084; SSE-NEXT:    pxor %xmm3, %xmm3
2085; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2086; SSE-NEXT:    pslld $16, %xmm2
2087; SSE-NEXT:    psrad $16, %xmm2
2088; SSE-NEXT:    pslld $16, %xmm1
2089; SSE-NEXT:    psrad $16, %xmm1
2090; SSE-NEXT:    packssdw %xmm2, %xmm1
2091; SSE-NEXT:    pmullw %xmm1, %xmm0
2092; SSE-NEXT:    retq
2093;
2094; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2095; AVX1:       # %bb.0:
2096; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2097; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2098; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
2099; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2100; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2101; AVX1-NEXT:    vzeroupper
2102; AVX1-NEXT:    retq
2103;
2104; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2105; AVX2:       # %bb.0:
2106; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2107; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2108; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2109; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2110; AVX2-NEXT:    vzeroupper
2111; AVX2-NEXT:    retq
2112;
2113; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2114; AVX512:       # %bb.0:
2115; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2116; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
2117; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2118; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2119; AVX512-NEXT:    vzeroupper
2120; AVX512-NEXT:    retq
2121  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2122  %2 = zext <8 x i8> %1 to <8 x i32>
2123  %3 = mul <8 x i32> %2, %a1
2124  %4 = trunc <8 x i32> %3 to <8 x i16>
2125  ret <8 x i16> %4
2126}
2127
2128;
2129; mul to constant
2130;
2131
2132define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2133; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2134; SSE:       # %bb.0:
2135; SSE-NEXT:    xorps %xmm2, %xmm2
2136; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2137; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2138; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2139; SSE-NEXT:    movaps %xmm2, %xmm0
2140; SSE-NEXT:    retq
2141;
2142; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2143; AVX1:       # %bb.0:
2144; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2145; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2146; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2147; AVX1-NEXT:    vzeroupper
2148; AVX1-NEXT:    retq
2149;
2150; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2151; AVX2-SLOW:       # %bb.0:
2152; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2153; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2154; AVX2-SLOW-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2155; AVX2-SLOW-NEXT:    vzeroupper
2156; AVX2-SLOW-NEXT:    retq
2157;
2158; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32:
2159; AVX2-FAST-ALL:       # %bb.0:
2160; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2161; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2162; AVX2-FAST-ALL-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2163; AVX2-FAST-ALL-NEXT:    vzeroupper
2164; AVX2-FAST-ALL-NEXT:    retq
2165;
2166; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32:
2167; AVX2-FAST-PERLANE:       # %bb.0:
2168; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2169; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2170; AVX2-FAST-PERLANE-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2171; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2172; AVX2-FAST-PERLANE-NEXT:    retq
2173;
2174; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2175; AVX512:       # %bb.0:
2176; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2177; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2178; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2179; AVX512-NEXT:    vzeroupper
2180; AVX512-NEXT:    retq
2181  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2182  %2 = trunc <4 x i64> %1 to <4 x i32>
2183  ret <4 x i32> %2
2184}
2185
2186define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2187; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2188; SSE:       # %bb.0:
2189; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2190; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2191; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2192; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2193; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2194; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2195; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2196; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2197; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2198; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2199; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2200; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2201; SSE-NEXT:    retq
2202;
2203; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2204; AVX1:       # %bb.0:
2205; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2206; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
2207; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2208; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2209; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2210; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2211; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2212; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2213; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2214; AVX1-NEXT:    vzeroupper
2215; AVX1-NEXT:    retq
2216;
2217; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
2218; AVX2:       # %bb.0:
2219; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2220; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2221; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2222; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2223; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2224; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2225; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2226; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2227; AVX2-NEXT:    vzeroupper
2228; AVX2-NEXT:    retq
2229;
2230; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2231; AVX512:       # %bb.0:
2232; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2233; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2234; AVX512-NEXT:    vzeroupper
2235; AVX512-NEXT:    retq
2236  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2237  %2 = trunc <8 x i64> %1 to <8 x i16>
2238  ret <8 x i16> %2
2239}
2240
2241define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2242; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2243; SSE:       # %bb.0:
2244; SSE-NEXT:    pslld $16, %xmm1
2245; SSE-NEXT:    psrad $16, %xmm1
2246; SSE-NEXT:    pslld $16, %xmm0
2247; SSE-NEXT:    psrad $16, %xmm0
2248; SSE-NEXT:    packssdw %xmm1, %xmm0
2249; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2250; SSE-NEXT:    retq
2251;
2252; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2253; AVX1:       # %bb.0:
2254; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2255; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2256; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2257; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2258; AVX1-NEXT:    vzeroupper
2259; AVX1-NEXT:    retq
2260;
2261; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2262; AVX2:       # %bb.0:
2263; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2264; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2265; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2266; AVX2-NEXT:    vzeroupper
2267; AVX2-NEXT:    retq
2268;
2269; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2270; AVX512:       # %bb.0:
2271; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2272; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2273; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2274; AVX512-NEXT:    vzeroupper
2275; AVX512-NEXT:    retq
2276  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2277  %2 = trunc <8 x i32> %1 to <8 x i16>
2278  ret <8 x i16> %2
2279}
2280
2281define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2282; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2283; SSE:       # %bb.0:
2284; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2285; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2286; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2287; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2288; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
2289; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
2290; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
2291; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2292; SSE-NEXT:    pand %xmm8, %xmm7
2293; SSE-NEXT:    pand %xmm8, %xmm6
2294; SSE-NEXT:    packuswb %xmm7, %xmm6
2295; SSE-NEXT:    pand %xmm8, %xmm5
2296; SSE-NEXT:    pand %xmm8, %xmm4
2297; SSE-NEXT:    packuswb %xmm5, %xmm4
2298; SSE-NEXT:    packuswb %xmm6, %xmm4
2299; SSE-NEXT:    pand %xmm8, %xmm3
2300; SSE-NEXT:    pand %xmm8, %xmm2
2301; SSE-NEXT:    packuswb %xmm3, %xmm2
2302; SSE-NEXT:    pand %xmm8, %xmm1
2303; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2304; SSE-NEXT:    packuswb %xmm1, %xmm0
2305; SSE-NEXT:    packuswb %xmm2, %xmm0
2306; SSE-NEXT:    packuswb %xmm4, %xmm0
2307; SSE-NEXT:    retq
2308;
2309; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2310; AVX1:       # %bb.0:
2311; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8
2312; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2313; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2314; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
2315; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2316; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2317; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
2318; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2319; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2320; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
2321; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2322; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
2323; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
2324; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2325; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
2326; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
2327; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2328; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
2329; AVX1-NEXT:    vpackusdw %xmm2, %xmm6, %xmm2
2330; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2331; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2332; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
2333; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2334; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2335; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
2336; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
2337; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2338; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2339; AVX1-NEXT:    vzeroupper
2340; AVX1-NEXT:    retq
2341;
2342; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
2343; AVX2:       # %bb.0:
2344; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2345; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2346; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2347; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2348; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
2349; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
2350; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
2351; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
2352; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2353; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
2354; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
2355; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2356; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2357; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
2358; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2359; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2360; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2361; AVX2-NEXT:    vzeroupper
2362; AVX2-NEXT:    retq
2363;
2364; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
2365; AVX512F:       # %bb.0:
2366; AVX512F-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2367; AVX512F-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2368; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
2369; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
2370; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2371; AVX512F-NEXT:    vzeroupper
2372; AVX512F-NEXT:    retq
2373;
2374; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
2375; AVX512BW:       # %bb.0:
2376; AVX512BW-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2377; AVX512BW-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2378; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
2379; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
2380; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2381; AVX512BW-NEXT:    vzeroupper
2382; AVX512BW-NEXT:    retq
2383;
2384; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
2385; AVX512DQ:       # %bb.0:
2386; AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2387; AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2388; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
2389; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
2390; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2391; AVX512DQ-NEXT:    vzeroupper
2392; AVX512DQ-NEXT:    retq
2393  %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2394  %2 = trunc <16 x i64> %1 to <16 x i8>
2395  ret <16 x i8> %2
2396}
2397
2398define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2399; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2400; SSE:       # %bb.0:
2401; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
2402; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2403; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2404; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2405; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2406; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2407; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2408; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2409; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2410; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2411; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2412; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2413; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2414; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2415; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2416; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2417; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2418; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2419; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2420; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2421; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2422; SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
2423; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2424; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2425; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2426; SSE-NEXT:    pand %xmm4, %xmm3
2427; SSE-NEXT:    pand %xmm4, %xmm2
2428; SSE-NEXT:    packuswb %xmm3, %xmm2
2429; SSE-NEXT:    pand %xmm4, %xmm1
2430; SSE-NEXT:    pand %xmm4, %xmm0
2431; SSE-NEXT:    packuswb %xmm1, %xmm0
2432; SSE-NEXT:    packuswb %xmm2, %xmm0
2433; SSE-NEXT:    retq
2434;
2435; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2436; AVX1:       # %bb.0:
2437; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2438; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2439; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2440; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
2441; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2442; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2443; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
2444; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2445; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2446; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2447; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2448; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2449; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
2450; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2451; AVX1-NEXT:    vzeroupper
2452; AVX1-NEXT:    retq
2453;
2454; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2455; AVX2:       # %bb.0:
2456; AVX2-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2457; AVX2-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2458; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
2459; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
2460; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2461; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2462; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2463; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2464; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2465; AVX2-NEXT:    vzeroupper
2466; AVX2-NEXT:    retq
2467;
2468; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2469; AVX512:       # %bb.0:
2470; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2471; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2472; AVX512-NEXT:    vzeroupper
2473; AVX512-NEXT:    retq
2474  %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2475  %2 = trunc <16 x i32> %1 to <16 x i8>
2476  ret <16 x i8> %2
2477}
2478
2479define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2480; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2481; SSE:       # %bb.0:
2482; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2483; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2484; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2485; SSE-NEXT:    pand %xmm2, %xmm1
2486; SSE-NEXT:    pand %xmm2, %xmm0
2487; SSE-NEXT:    packuswb %xmm1, %xmm0
2488; SSE-NEXT:    retq
2489;
2490; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2491; AVX1:       # %bb.0:
2492; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2493; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2494; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2495; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2496; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2497; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
2498; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2499; AVX1-NEXT:    vzeroupper
2500; AVX1-NEXT:    retq
2501;
2502; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2503; AVX2:       # %bb.0:
2504; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2505; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2506; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2507; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2508; AVX2-NEXT:    vzeroupper
2509; AVX2-NEXT:    retq
2510;
2511; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2512; AVX512F:       # %bb.0:
2513; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2514; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2515; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2516; AVX512F-NEXT:    vzeroupper
2517; AVX512F-NEXT:    retq
2518;
2519; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2520; AVX512BW:       # %bb.0:
2521; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2522; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2523; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2524; AVX512BW-NEXT:    vzeroupper
2525; AVX512BW-NEXT:    retq
2526;
2527; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2528; AVX512DQ:       # %bb.0:
2529; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2530; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2531; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2532; AVX512DQ-NEXT:    vzeroupper
2533; AVX512DQ-NEXT:    retq
2534  %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2535  %2 = trunc <16 x i16> %1 to <16 x i8>
2536  ret <16 x i8> %2
2537}
2538
2539;
2540; and
2541;
2542
2543define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2544; SSE-LABEL: trunc_and_v4i64_v4i32:
2545; SSE:       # %bb.0:
2546; SSE-NEXT:    andps %xmm3, %xmm1
2547; SSE-NEXT:    andps %xmm2, %xmm0
2548; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2549; SSE-NEXT:    retq
2550;
2551; AVX1-LABEL: trunc_and_v4i64_v4i32:
2552; AVX1:       # %bb.0:
2553; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2554; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2555; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2556; AVX1-NEXT:    vzeroupper
2557; AVX1-NEXT:    retq
2558;
2559; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2560; AVX2-SLOW:       # %bb.0:
2561; AVX2-SLOW-NEXT:    vandps %ymm1, %ymm0, %ymm0
2562; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2563; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2564; AVX2-SLOW-NEXT:    vzeroupper
2565; AVX2-SLOW-NEXT:    retq
2566;
2567; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
2568; AVX2-FAST-ALL:       # %bb.0:
2569; AVX2-FAST-ALL-NEXT:    vandps %ymm1, %ymm0, %ymm0
2570; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2571; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2572; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2573; AVX2-FAST-ALL-NEXT:    vzeroupper
2574; AVX2-FAST-ALL-NEXT:    retq
2575;
2576; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32:
2577; AVX2-FAST-PERLANE:       # %bb.0:
2578; AVX2-FAST-PERLANE-NEXT:    vandps %ymm1, %ymm0, %ymm0
2579; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2580; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2581; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2582; AVX2-FAST-PERLANE-NEXT:    retq
2583;
2584; AVX512-LABEL: trunc_and_v4i64_v4i32:
2585; AVX512:       # %bb.0:
2586; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2587; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2588; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2589; AVX512-NEXT:    vzeroupper
2590; AVX512-NEXT:    retq
2591  %1 = and <4 x i64> %a0, %a1
2592  %2 = trunc <4 x i64> %1 to <4 x i32>
2593  ret <4 x i32> %2
2594}
2595
2596define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2597; SSE-LABEL: trunc_and_v8i64_v8i16:
2598; SSE:       # %bb.0:
2599; SSE-NEXT:    pand %xmm6, %xmm2
2600; SSE-NEXT:    pand %xmm7, %xmm3
2601; SSE-NEXT:    pand %xmm4, %xmm0
2602; SSE-NEXT:    pand %xmm5, %xmm1
2603; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2604; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2605; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2606; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2607; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2608; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2609; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2610; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2611; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2612; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2613; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2614; SSE-NEXT:    retq
2615;
2616; AVX1-LABEL: trunc_and_v8i64_v8i16:
2617; AVX1:       # %bb.0:
2618; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
2619; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
2620; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2621; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2622; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2623; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
2624; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2625; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2626; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2627; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2628; AVX1-NEXT:    vzeroupper
2629; AVX1-NEXT:    retq
2630;
2631; AVX2-LABEL: trunc_and_v8i64_v8i16:
2632; AVX2:       # %bb.0:
2633; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2634; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2635; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2636; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2637; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2638; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2639; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2640; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2641; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2642; AVX2-NEXT:    vzeroupper
2643; AVX2-NEXT:    retq
2644;
2645; AVX512-LABEL: trunc_and_v8i64_v8i16:
2646; AVX512:       # %bb.0:
2647; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2648; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2649; AVX512-NEXT:    vzeroupper
2650; AVX512-NEXT:    retq
2651  %1 = and <8 x i64> %a0, %a1
2652  %2 = trunc <8 x i64> %1 to <8 x i16>
2653  ret <8 x i16> %2
2654}
2655
2656define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2657; SSE-LABEL: trunc_and_v8i32_v8i16:
2658; SSE:       # %bb.0:
2659; SSE-NEXT:    pand %xmm2, %xmm0
2660; SSE-NEXT:    pand %xmm3, %xmm1
2661; SSE-NEXT:    pslld $16, %xmm1
2662; SSE-NEXT:    psrad $16, %xmm1
2663; SSE-NEXT:    pslld $16, %xmm0
2664; SSE-NEXT:    psrad $16, %xmm0
2665; SSE-NEXT:    packssdw %xmm1, %xmm0
2666; SSE-NEXT:    retq
2667;
2668; AVX1-LABEL: trunc_and_v8i32_v8i16:
2669; AVX1:       # %bb.0:
2670; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2671; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2672; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2673; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2674; AVX1-NEXT:    vzeroupper
2675; AVX1-NEXT:    retq
2676;
2677; AVX2-LABEL: trunc_and_v8i32_v8i16:
2678; AVX2:       # %bb.0:
2679; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2680; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2681; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2682; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2683; AVX2-NEXT:    vzeroupper
2684; AVX2-NEXT:    retq
2685;
2686; AVX512-LABEL: trunc_and_v8i32_v8i16:
2687; AVX512:       # %bb.0:
2688; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2689; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2690; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2691; AVX512-NEXT:    vzeroupper
2692; AVX512-NEXT:    retq
2693  %1 = and <8 x i32> %a0, %a1
2694  %2 = trunc <8 x i32> %1 to <8 x i16>
2695  ret <8 x i16> %2
2696}
2697
2698define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2699; SSE-LABEL: trunc_and_v16i64_v16i8:
2700; SSE:       # %bb.0:
2701; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
2702; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
2703; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
2704; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
2705; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
2706; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
2707; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
2708; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
2709; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2710; SSE-NEXT:    pand %xmm8, %xmm7
2711; SSE-NEXT:    pand %xmm8, %xmm6
2712; SSE-NEXT:    packuswb %xmm7, %xmm6
2713; SSE-NEXT:    pand %xmm8, %xmm5
2714; SSE-NEXT:    pand %xmm8, %xmm4
2715; SSE-NEXT:    packuswb %xmm5, %xmm4
2716; SSE-NEXT:    packuswb %xmm6, %xmm4
2717; SSE-NEXT:    pand %xmm8, %xmm3
2718; SSE-NEXT:    pand %xmm8, %xmm2
2719; SSE-NEXT:    packuswb %xmm3, %xmm2
2720; SSE-NEXT:    pand %xmm8, %xmm1
2721; SSE-NEXT:    pand %xmm8, %xmm0
2722; SSE-NEXT:    packuswb %xmm1, %xmm0
2723; SSE-NEXT:    packuswb %xmm2, %xmm0
2724; SSE-NEXT:    packuswb %xmm4, %xmm0
2725; SSE-NEXT:    retq
2726;
2727; AVX1-LABEL: trunc_and_v16i64_v16i8:
2728; AVX1:       # %bb.0:
2729; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [255,255,255,255]
2730; AVX1-NEXT:    vandps %ymm7, %ymm8, %ymm7
2731; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
2732; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
2733; AVX1-NEXT:    vpackusdw %xmm7, %xmm3, %xmm3
2734; AVX1-NEXT:    vandps %ymm6, %ymm8, %ymm6
2735; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
2736; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
2737; AVX1-NEXT:    vpackusdw %xmm6, %xmm2, %xmm2
2738; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2739; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm3
2740; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2741; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2742; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2743; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm3
2744; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
2745; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2746; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
2747; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2748; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2749; AVX1-NEXT:    vzeroupper
2750; AVX1-NEXT:    retq
2751;
2752; AVX2-LABEL: trunc_and_v16i64_v16i8:
2753; AVX2:       # %bb.0:
2754; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255]
2755; AVX2-NEXT:    vpand %ymm7, %ymm8, %ymm7
2756; AVX2-NEXT:    vpand %ymm7, %ymm3, %ymm3
2757; AVX2-NEXT:    vpand %ymm6, %ymm8, %ymm6
2758; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
2759; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
2760; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
2761; AVX2-NEXT:    vpand %ymm5, %ymm8, %ymm3
2762; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2763; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm3
2764; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
2765; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2766; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2767; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
2768; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2769; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2770; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2771; AVX2-NEXT:    vzeroupper
2772; AVX2-NEXT:    retq
2773;
2774; AVX512-LABEL: trunc_and_v16i64_v16i8:
2775; AVX512:       # %bb.0:
2776; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
2777; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
2778; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
2779; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
2780; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2781; AVX512-NEXT:    vzeroupper
2782; AVX512-NEXT:    retq
2783  %1 = and <16 x i64> %a0, %a1
2784  %2 = trunc <16 x i64> %1 to <16 x i8>
2785  ret <16 x i8> %2
2786}
2787
2788define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2789; SSE-LABEL: trunc_and_v16i32_v16i8:
2790; SSE:       # %bb.0:
2791; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2792; SSE-NEXT:    pand %xmm8, %xmm7
2793; SSE-NEXT:    pand %xmm3, %xmm7
2794; SSE-NEXT:    pand %xmm8, %xmm6
2795; SSE-NEXT:    pand %xmm2, %xmm6
2796; SSE-NEXT:    packuswb %xmm7, %xmm6
2797; SSE-NEXT:    pand %xmm8, %xmm5
2798; SSE-NEXT:    pand %xmm1, %xmm5
2799; SSE-NEXT:    pand %xmm4, %xmm8
2800; SSE-NEXT:    pand %xmm8, %xmm0
2801; SSE-NEXT:    packuswb %xmm5, %xmm0
2802; SSE-NEXT:    packuswb %xmm6, %xmm0
2803; SSE-NEXT:    retq
2804;
2805; AVX1-LABEL: trunc_and_v16i32_v16i8:
2806; AVX1:       # %bb.0:
2807; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2808; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
2809; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2810; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2811; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2812; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
2813; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2814; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2815; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2816; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2817; AVX1-NEXT:    vzeroupper
2818; AVX1-NEXT:    retq
2819;
2820; AVX2-LABEL: trunc_and_v16i32_v16i8:
2821; AVX2:       # %bb.0:
2822; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
2823; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
2824; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
2825; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
2826; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
2827; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2828; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2829; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2830; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2831; AVX2-NEXT:    vzeroupper
2832; AVX2-NEXT:    retq
2833;
2834; AVX512-LABEL: trunc_and_v16i32_v16i8:
2835; AVX512:       # %bb.0:
2836; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
2837; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2838; AVX512-NEXT:    vzeroupper
2839; AVX512-NEXT:    retq
2840  %1 = and <16 x i32> %a0, %a1
2841  %2 = trunc <16 x i32> %1 to <16 x i8>
2842  ret <16 x i8> %2
2843}
2844
2845define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2846; SSE-LABEL: trunc_and_v16i16_v16i8:
2847; SSE:       # %bb.0:
2848; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2849; SSE-NEXT:    pand %xmm4, %xmm3
2850; SSE-NEXT:    pand %xmm1, %xmm3
2851; SSE-NEXT:    pand %xmm2, %xmm4
2852; SSE-NEXT:    pand %xmm4, %xmm0
2853; SSE-NEXT:    packuswb %xmm3, %xmm0
2854; SSE-NEXT:    retq
2855;
2856; AVX1-LABEL: trunc_and_v16i16_v16i8:
2857; AVX1:       # %bb.0:
2858; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2859; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2860; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2861; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2862; AVX1-NEXT:    vzeroupper
2863; AVX1-NEXT:    retq
2864;
2865; AVX2-LABEL: trunc_and_v16i16_v16i8:
2866; AVX2:       # %bb.0:
2867; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2868; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2869; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2870; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2871; AVX2-NEXT:    vzeroupper
2872; AVX2-NEXT:    retq
2873;
2874; AVX512F-LABEL: trunc_and_v16i16_v16i8:
2875; AVX512F:       # %bb.0:
2876; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
2877; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2878; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2879; AVX512F-NEXT:    vzeroupper
2880; AVX512F-NEXT:    retq
2881;
2882; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
2883; AVX512BW:       # %bb.0:
2884; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
2885; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2886; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2887; AVX512BW-NEXT:    vzeroupper
2888; AVX512BW-NEXT:    retq
2889;
2890; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
2891; AVX512DQ:       # %bb.0:
2892; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
2893; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2894; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2895; AVX512DQ-NEXT:    vzeroupper
2896; AVX512DQ-NEXT:    retq
2897  %1 = and <16 x i16> %a0, %a1
2898  %2 = trunc <16 x i16> %1 to <16 x i8>
2899  ret <16 x i8> %2
2900}
2901
2902;
2903; and to constant
2904;
2905
2906define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2907; SSE-LABEL: trunc_and_const_v4i64_v4i32:
2908; SSE:       # %bb.0:
2909; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2910; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2911; SSE-NEXT:    retq
2912;
2913; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
2914; AVX1:       # %bb.0:
2915; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2916; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2917; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2918; AVX1-NEXT:    vzeroupper
2919; AVX1-NEXT:    retq
2920;
2921; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
2922; AVX2-SLOW:       # %bb.0:
2923; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2924; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2925; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2926; AVX2-SLOW-NEXT:    vzeroupper
2927; AVX2-SLOW-NEXT:    retq
2928;
2929; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
2930; AVX2-FAST-ALL:       # %bb.0:
2931; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <u,2,4,6,u,u,u,u>
2932; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2933; AVX2-FAST-ALL-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2934; AVX2-FAST-ALL-NEXT:    vzeroupper
2935; AVX2-FAST-ALL-NEXT:    retq
2936;
2937; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
2938; AVX2-FAST-PERLANE:       # %bb.0:
2939; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
2940; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2941; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2942; AVX2-FAST-PERLANE-NEXT:    vzeroupper
2943; AVX2-FAST-PERLANE-NEXT:    retq
2944;
2945; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
2946; AVX512:       # %bb.0:
2947; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2948; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2949; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2950; AVX512-NEXT:    vzeroupper
2951; AVX512-NEXT:    retq
2952  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2953  %2 = trunc <4 x i64> %1 to <4 x i32>
2954  ret <4 x i32> %2
2955}
2956
2957define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2958; SSE-LABEL: trunc_and_const_v8i64_v8i16:
2959; SSE:       # %bb.0:
2960; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2961; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2962; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2963; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2964; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2965; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2966; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2967; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2968; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2969; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2970; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2971; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2972; SSE-NEXT:    retq
2973;
2974; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
2975; AVX1:       # %bb.0:
2976; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2977; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
2978; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2979; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2980; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2981; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2982; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2983; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2984; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2985; AVX1-NEXT:    vzeroupper
2986; AVX1-NEXT:    retq
2987;
2988; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
2989; AVX2:       # %bb.0:
2990; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2991; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
2992; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
2993; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2994; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2995; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2996; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2997; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2998; AVX2-NEXT:    vzeroupper
2999; AVX2-NEXT:    retq
3000;
3001; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3002; AVX512:       # %bb.0:
3003; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3004; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3005; AVX512-NEXT:    vzeroupper
3006; AVX512-NEXT:    retq
3007  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3008  %2 = trunc <8 x i64> %1 to <8 x i16>
3009  ret <8 x i16> %2
3010}
3011
3012define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3013; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3014; SSE:       # %bb.0:
3015; SSE-NEXT:    pslld $16, %xmm1
3016; SSE-NEXT:    psrad $16, %xmm1
3017; SSE-NEXT:    pslld $16, %xmm0
3018; SSE-NEXT:    psrad $16, %xmm0
3019; SSE-NEXT:    packssdw %xmm1, %xmm0
3020; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3021; SSE-NEXT:    retq
3022;
3023; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3024; AVX1:       # %bb.0:
3025; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3026; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3027; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3028; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3029; AVX1-NEXT:    vzeroupper
3030; AVX1-NEXT:    retq
3031;
3032; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3033; AVX2:       # %bb.0:
3034; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3035; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3036; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3037; AVX2-NEXT:    vzeroupper
3038; AVX2-NEXT:    retq
3039;
3040; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3041; AVX512:       # %bb.0:
3042; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3043; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3044; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3045; AVX512-NEXT:    vzeroupper
3046; AVX512-NEXT:    retq
3047  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3048  %2 = trunc <8 x i32> %1 to <8 x i16>
3049  ret <8 x i16> %2
3050}
3051
3052define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3053; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3054; SSE:       # %bb.0:
3055; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3056; SSE-NEXT:    pand %xmm8, %xmm7
3057; SSE-NEXT:    pand %xmm8, %xmm6
3058; SSE-NEXT:    packuswb %xmm7, %xmm6
3059; SSE-NEXT:    pand %xmm8, %xmm5
3060; SSE-NEXT:    pand %xmm8, %xmm4
3061; SSE-NEXT:    packuswb %xmm5, %xmm4
3062; SSE-NEXT:    packuswb %xmm6, %xmm4
3063; SSE-NEXT:    pand %xmm8, %xmm3
3064; SSE-NEXT:    pand %xmm8, %xmm2
3065; SSE-NEXT:    packuswb %xmm3, %xmm2
3066; SSE-NEXT:    pand %xmm8, %xmm1
3067; SSE-NEXT:    pand %xmm8, %xmm0
3068; SSE-NEXT:    packuswb %xmm1, %xmm0
3069; SSE-NEXT:    packuswb %xmm2, %xmm0
3070; SSE-NEXT:    packuswb %xmm4, %xmm0
3071; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3072; SSE-NEXT:    retq
3073;
3074; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3075; AVX1:       # %bb.0:
3076; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3077; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3078; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3079; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3080; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3081; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3082; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3083; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3084; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3085; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3086; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3087; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3088; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3089; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3090; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3091; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3092; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3093; AVX1-NEXT:    vzeroupper
3094; AVX1-NEXT:    retq
3095;
3096; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3097; AVX2:       # %bb.0:
3098; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3099; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3100; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3101; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3102; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3103; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3104; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3105; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3106; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3107; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3108; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3109; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3110; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3111; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3112; AVX2-NEXT:    vzeroupper
3113; AVX2-NEXT:    retq
3114;
3115; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3116; AVX512:       # %bb.0:
3117; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3118; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3119; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3120; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3121; AVX512-NEXT:    vzeroupper
3122; AVX512-NEXT:    retq
3123  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3124  %2 = trunc <16 x i64> %1 to <16 x i8>
3125  ret <16 x i8> %2
3126}
3127
3128define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3129; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3130; SSE:       # %bb.0:
3131; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3132; SSE-NEXT:    pand %xmm4, %xmm3
3133; SSE-NEXT:    pand %xmm4, %xmm2
3134; SSE-NEXT:    packuswb %xmm3, %xmm2
3135; SSE-NEXT:    pand %xmm4, %xmm1
3136; SSE-NEXT:    pand %xmm4, %xmm0
3137; SSE-NEXT:    packuswb %xmm1, %xmm0
3138; SSE-NEXT:    packuswb %xmm2, %xmm0
3139; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3140; SSE-NEXT:    retq
3141;
3142; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3143; AVX1:       # %bb.0:
3144; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3145; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3146; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3147; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3148; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3149; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3150; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3151; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3152; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3153; AVX1-NEXT:    vzeroupper
3154; AVX1-NEXT:    retq
3155;
3156; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3157; AVX2:       # %bb.0:
3158; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3159; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3160; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3161; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3162; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3163; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3164; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3165; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3166; AVX2-NEXT:    vzeroupper
3167; AVX2-NEXT:    retq
3168;
3169; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3170; AVX512:       # %bb.0:
3171; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3172; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3173; AVX512-NEXT:    vzeroupper
3174; AVX512-NEXT:    retq
3175  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3176  %2 = trunc <16 x i32> %1 to <16 x i8>
3177  ret <16 x i8> %2
3178}
3179
3180define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3181; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3182; SSE:       # %bb.0:
3183; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3184; SSE-NEXT:    pand %xmm2, %xmm1
3185; SSE-NEXT:    pand %xmm2, %xmm0
3186; SSE-NEXT:    packuswb %xmm1, %xmm0
3187; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3188; SSE-NEXT:    retq
3189;
3190; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3191; AVX1:       # %bb.0:
3192; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3193; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3194; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3195; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3196; AVX1-NEXT:    vzeroupper
3197; AVX1-NEXT:    retq
3198;
3199; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3200; AVX2:       # %bb.0:
3201; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3202; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3203; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3204; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3205; AVX2-NEXT:    vzeroupper
3206; AVX2-NEXT:    retq
3207;
3208; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3209; AVX512F:       # %bb.0:
3210; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3211; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3212; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3213; AVX512F-NEXT:    vzeroupper
3214; AVX512F-NEXT:    retq
3215;
3216; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3217; AVX512BW:       # %bb.0:
3218; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3219; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3220; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3221; AVX512BW-NEXT:    vzeroupper
3222; AVX512BW-NEXT:    retq
3223;
3224; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3225; AVX512DQ:       # %bb.0:
3226; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3227; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3228; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3229; AVX512DQ-NEXT:    vzeroupper
3230; AVX512DQ-NEXT:    retq
3231  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3232  %2 = trunc <16 x i16> %1 to <16 x i8>
3233  ret <16 x i8> %2
3234}
3235
3236;
3237; xor
3238;
3239
3240define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3241; SSE-LABEL: trunc_xor_v4i64_v4i32:
3242; SSE:       # %bb.0:
3243; SSE-NEXT:    xorps %xmm3, %xmm1
3244; SSE-NEXT:    xorps %xmm2, %xmm0
3245; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3246; SSE-NEXT:    retq
3247;
3248; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3249; AVX1:       # %bb.0:
3250; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3251; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3252; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3253; AVX1-NEXT:    vzeroupper
3254; AVX1-NEXT:    retq
3255;
3256; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3257; AVX2-SLOW:       # %bb.0:
3258; AVX2-SLOW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3259; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3260; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3261; AVX2-SLOW-NEXT:    vzeroupper
3262; AVX2-SLOW-NEXT:    retq
3263;
3264; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
3265; AVX2-FAST-ALL:       # %bb.0:
3266; AVX2-FAST-ALL-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3267; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3268; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3269; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3270; AVX2-FAST-ALL-NEXT:    vzeroupper
3271; AVX2-FAST-ALL-NEXT:    retq
3272;
3273; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
3274; AVX2-FAST-PERLANE:       # %bb.0:
3275; AVX2-FAST-PERLANE-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3276; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3277; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3278; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3279; AVX2-FAST-PERLANE-NEXT:    retq
3280;
3281; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3282; AVX512:       # %bb.0:
3283; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3284; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3285; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3286; AVX512-NEXT:    vzeroupper
3287; AVX512-NEXT:    retq
3288  %1 = xor <4 x i64> %a0, %a1
3289  %2 = trunc <4 x i64> %1 to <4 x i32>
3290  ret <4 x i32> %2
3291}
3292
3293define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3294; SSE-LABEL: trunc_xor_v8i64_v8i16:
3295; SSE:       # %bb.0:
3296; SSE-NEXT:    pxor %xmm6, %xmm2
3297; SSE-NEXT:    pxor %xmm7, %xmm3
3298; SSE-NEXT:    pxor %xmm4, %xmm0
3299; SSE-NEXT:    pxor %xmm5, %xmm1
3300; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3301; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3302; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3303; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3304; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3305; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3306; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3307; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3308; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3309; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3310; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3311; SSE-NEXT:    retq
3312;
3313; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3314; AVX1:       # %bb.0:
3315; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3316; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3317; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3318; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3319; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3320; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3321; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3322; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3323; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3324; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3325; AVX1-NEXT:    vzeroupper
3326; AVX1-NEXT:    retq
3327;
3328; AVX2-LABEL: trunc_xor_v8i64_v8i16:
3329; AVX2:       # %bb.0:
3330; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3331; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3332; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3333; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3334; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3335; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3336; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3337; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3338; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3339; AVX2-NEXT:    vzeroupper
3340; AVX2-NEXT:    retq
3341;
3342; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3343; AVX512:       # %bb.0:
3344; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
3345; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3346; AVX512-NEXT:    vzeroupper
3347; AVX512-NEXT:    retq
3348  %1 = xor <8 x i64> %a0, %a1
3349  %2 = trunc <8 x i64> %1 to <8 x i16>
3350  ret <8 x i16> %2
3351}
3352
3353define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3354; SSE-LABEL: trunc_xor_v8i32_v8i16:
3355; SSE:       # %bb.0:
3356; SSE-NEXT:    pxor %xmm2, %xmm0
3357; SSE-NEXT:    pxor %xmm3, %xmm1
3358; SSE-NEXT:    pslld $16, %xmm1
3359; SSE-NEXT:    psrad $16, %xmm1
3360; SSE-NEXT:    pslld $16, %xmm0
3361; SSE-NEXT:    psrad $16, %xmm0
3362; SSE-NEXT:    packssdw %xmm1, %xmm0
3363; SSE-NEXT:    retq
3364;
3365; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3366; AVX1:       # %bb.0:
3367; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3368; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3369; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3370; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3371; AVX1-NEXT:    vzeroupper
3372; AVX1-NEXT:    retq
3373;
3374; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3375; AVX2:       # %bb.0:
3376; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3377; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3378; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3379; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3380; AVX2-NEXT:    vzeroupper
3381; AVX2-NEXT:    retq
3382;
3383; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3384; AVX512:       # %bb.0:
3385; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3386; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3387; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3388; AVX512-NEXT:    vzeroupper
3389; AVX512-NEXT:    retq
3390  %1 = xor <8 x i32> %a0, %a1
3391  %2 = trunc <8 x i32> %1 to <8 x i16>
3392  ret <8 x i16> %2
3393}
3394
3395define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3396; SSE-LABEL: trunc_xor_v16i64_v16i8:
3397; SSE:       # %bb.0:
3398; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
3399; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
3400; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
3401; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
3402; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
3403; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
3404; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
3405; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
3406; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3407; SSE-NEXT:    pand %xmm8, %xmm7
3408; SSE-NEXT:    pand %xmm8, %xmm6
3409; SSE-NEXT:    packuswb %xmm7, %xmm6
3410; SSE-NEXT:    pand %xmm8, %xmm5
3411; SSE-NEXT:    pand %xmm8, %xmm4
3412; SSE-NEXT:    packuswb %xmm5, %xmm4
3413; SSE-NEXT:    packuswb %xmm6, %xmm4
3414; SSE-NEXT:    pand %xmm8, %xmm3
3415; SSE-NEXT:    pand %xmm8, %xmm2
3416; SSE-NEXT:    packuswb %xmm3, %xmm2
3417; SSE-NEXT:    pand %xmm8, %xmm1
3418; SSE-NEXT:    pand %xmm8, %xmm0
3419; SSE-NEXT:    packuswb %xmm1, %xmm0
3420; SSE-NEXT:    packuswb %xmm2, %xmm0
3421; SSE-NEXT:    packuswb %xmm4, %xmm0
3422; SSE-NEXT:    retq
3423;
3424; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3425; AVX1:       # %bb.0:
3426; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
3427; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
3428; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
3429; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
3430; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3431; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3432; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3433; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3434; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3435; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3436; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3437; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3438; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3439; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3440; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3441; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3442; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3443; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3444; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3445; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3446; AVX1-NEXT:    vzeroupper
3447; AVX1-NEXT:    retq
3448;
3449; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3450; AVX2:       # %bb.0:
3451; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
3452; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm1
3453; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
3454; AVX2-NEXT:    vpxor %ymm7, %ymm3, %ymm3
3455; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3456; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3457; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3458; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3459; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3460; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3461; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3462; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3463; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3464; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3465; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3466; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3467; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3468; AVX2-NEXT:    vzeroupper
3469; AVX2-NEXT:    retq
3470;
3471; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3472; AVX512:       # %bb.0:
3473; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3474; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3475; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3476; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3477; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3478; AVX512-NEXT:    vzeroupper
3479; AVX512-NEXT:    retq
3480  %1 = xor <16 x i64> %a0, %a1
3481  %2 = trunc <16 x i64> %1 to <16 x i8>
3482  ret <16 x i8> %2
3483}
3484
3485define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3486; SSE-LABEL: trunc_xor_v16i32_v16i8:
3487; SSE:       # %bb.0:
3488; SSE-NEXT:    pxor %xmm4, %xmm0
3489; SSE-NEXT:    pxor %xmm5, %xmm1
3490; SSE-NEXT:    pxor %xmm6, %xmm2
3491; SSE-NEXT:    pxor %xmm7, %xmm3
3492; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3493; SSE-NEXT:    pand %xmm4, %xmm3
3494; SSE-NEXT:    pand %xmm4, %xmm2
3495; SSE-NEXT:    packuswb %xmm3, %xmm2
3496; SSE-NEXT:    pand %xmm4, %xmm1
3497; SSE-NEXT:    pand %xmm4, %xmm0
3498; SSE-NEXT:    packuswb %xmm1, %xmm0
3499; SSE-NEXT:    packuswb %xmm2, %xmm0
3500; SSE-NEXT:    retq
3501;
3502; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3503; AVX1:       # %bb.0:
3504; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3505; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3506; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3507; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3508; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3509; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3510; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3511; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3512; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3513; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3514; AVX1-NEXT:    vzeroupper
3515; AVX1-NEXT:    retq
3516;
3517; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3518; AVX2:       # %bb.0:
3519; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3520; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3521; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3522; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3523; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3524; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3525; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3526; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3527; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3528; AVX2-NEXT:    vzeroupper
3529; AVX2-NEXT:    retq
3530;
3531; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3532; AVX512:       # %bb.0:
3533; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
3534; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3535; AVX512-NEXT:    vzeroupper
3536; AVX512-NEXT:    retq
3537  %1 = xor <16 x i32> %a0, %a1
3538  %2 = trunc <16 x i32> %1 to <16 x i8>
3539  ret <16 x i8> %2
3540}
3541
3542define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3543; SSE-LABEL: trunc_xor_v16i16_v16i8:
3544; SSE:       # %bb.0:
3545; SSE-NEXT:    pxor %xmm2, %xmm0
3546; SSE-NEXT:    pxor %xmm3, %xmm1
3547; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3548; SSE-NEXT:    pand %xmm2, %xmm1
3549; SSE-NEXT:    pand %xmm2, %xmm0
3550; SSE-NEXT:    packuswb %xmm1, %xmm0
3551; SSE-NEXT:    retq
3552;
3553; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3554; AVX1:       # %bb.0:
3555; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3556; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3557; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3558; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3559; AVX1-NEXT:    vzeroupper
3560; AVX1-NEXT:    retq
3561;
3562; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3563; AVX2:       # %bb.0:
3564; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3565; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3566; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3567; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3568; AVX2-NEXT:    vzeroupper
3569; AVX2-NEXT:    retq
3570;
3571; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3572; AVX512F:       # %bb.0:
3573; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3574; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3575; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3576; AVX512F-NEXT:    vzeroupper
3577; AVX512F-NEXT:    retq
3578;
3579; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3580; AVX512BW:       # %bb.0:
3581; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3582; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3583; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3584; AVX512BW-NEXT:    vzeroupper
3585; AVX512BW-NEXT:    retq
3586;
3587; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3588; AVX512DQ:       # %bb.0:
3589; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3590; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3591; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3592; AVX512DQ-NEXT:    vzeroupper
3593; AVX512DQ-NEXT:    retq
3594  %1 = xor <16 x i16> %a0, %a1
3595  %2 = trunc <16 x i16> %1 to <16 x i8>
3596  ret <16 x i8> %2
3597}
3598
3599;
3600; xor to constant
3601;
3602
3603define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3604; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3605; SSE:       # %bb.0:
3606; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3607; SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3608; SSE-NEXT:    retq
3609;
3610; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3611; AVX1:       # %bb.0:
3612; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3613; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3614; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3615; AVX1-NEXT:    vzeroupper
3616; AVX1-NEXT:    retq
3617;
3618; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3619; AVX2-SLOW:       # %bb.0:
3620; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3621; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3622; AVX2-SLOW-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3623; AVX2-SLOW-NEXT:    vzeroupper
3624; AVX2-SLOW-NEXT:    retq
3625;
3626; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
3627; AVX2-FAST-ALL:       # %bb.0:
3628; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3629; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3630; AVX2-FAST-ALL-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3631; AVX2-FAST-ALL-NEXT:    vzeroupper
3632; AVX2-FAST-ALL-NEXT:    retq
3633;
3634; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
3635; AVX2-FAST-PERLANE:       # %bb.0:
3636; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3637; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3638; AVX2-FAST-PERLANE-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3639; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3640; AVX2-FAST-PERLANE-NEXT:    retq
3641;
3642; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3643; AVX512:       # %bb.0:
3644; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3645; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3646; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3647; AVX512-NEXT:    vzeroupper
3648; AVX512-NEXT:    retq
3649  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3650  %2 = trunc <4 x i64> %1 to <4 x i32>
3651  ret <4 x i32> %2
3652}
3653
3654define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3655; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3656; SSE:       # %bb.0:
3657; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3658; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3659; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3660; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3661; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3662; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3663; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3664; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3665; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3666; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3667; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3668; SSE-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3669; SSE-NEXT:    retq
3670;
3671; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3672; AVX1:       # %bb.0:
3673; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3674; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3675; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3676; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3677; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3678; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3679; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3680; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3681; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3682; AVX1-NEXT:    vzeroupper
3683; AVX1-NEXT:    retq
3684;
3685; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
3686; AVX2:       # %bb.0:
3687; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3688; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
3689; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
3690; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3691; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3692; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3693; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3694; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3695; AVX2-NEXT:    vzeroupper
3696; AVX2-NEXT:    retq
3697;
3698; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
3699; AVX512:       # %bb.0:
3700; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3701; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3702; AVX512-NEXT:    vzeroupper
3703; AVX512-NEXT:    retq
3704  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3705  %2 = trunc <8 x i64> %1 to <8 x i16>
3706  ret <8 x i16> %2
3707}
3708
3709define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3710; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
3711; SSE:       # %bb.0:
3712; SSE-NEXT:    pslld $16, %xmm1
3713; SSE-NEXT:    psrad $16, %xmm1
3714; SSE-NEXT:    pslld $16, %xmm0
3715; SSE-NEXT:    psrad $16, %xmm0
3716; SSE-NEXT:    packssdw %xmm1, %xmm0
3717; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3718; SSE-NEXT:    retq
3719;
3720; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
3721; AVX1:       # %bb.0:
3722; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3723; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3724; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3725; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3726; AVX1-NEXT:    vzeroupper
3727; AVX1-NEXT:    retq
3728;
3729; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
3730; AVX2:       # %bb.0:
3731; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3732; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3733; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3734; AVX2-NEXT:    vzeroupper
3735; AVX2-NEXT:    retq
3736;
3737; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
3738; AVX512:       # %bb.0:
3739; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3740; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3741; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3742; AVX512-NEXT:    vzeroupper
3743; AVX512-NEXT:    retq
3744  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3745  %2 = trunc <8 x i32> %1 to <8 x i16>
3746  ret <8 x i16> %2
3747}
3748
3749define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3750; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
3751; SSE:       # %bb.0:
3752; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3753; SSE-NEXT:    pand %xmm8, %xmm7
3754; SSE-NEXT:    pand %xmm8, %xmm6
3755; SSE-NEXT:    packuswb %xmm7, %xmm6
3756; SSE-NEXT:    pand %xmm8, %xmm5
3757; SSE-NEXT:    pand %xmm8, %xmm4
3758; SSE-NEXT:    packuswb %xmm5, %xmm4
3759; SSE-NEXT:    packuswb %xmm6, %xmm4
3760; SSE-NEXT:    pand %xmm8, %xmm3
3761; SSE-NEXT:    pand %xmm8, %xmm2
3762; SSE-NEXT:    packuswb %xmm3, %xmm2
3763; SSE-NEXT:    pand %xmm8, %xmm1
3764; SSE-NEXT:    pand %xmm8, %xmm0
3765; SSE-NEXT:    packuswb %xmm1, %xmm0
3766; SSE-NEXT:    packuswb %xmm2, %xmm0
3767; SSE-NEXT:    packuswb %xmm4, %xmm0
3768; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3769; SSE-NEXT:    retq
3770;
3771; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
3772; AVX1:       # %bb.0:
3773; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3774; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3775; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3776; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3777; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3778; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3779; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3780; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3781; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3782; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3783; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3784; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3785; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3786; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3787; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3788; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3789; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3790; AVX1-NEXT:    vzeroupper
3791; AVX1-NEXT:    retq
3792;
3793; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
3794; AVX2:       # %bb.0:
3795; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
3796; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
3797; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
3798; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
3799; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
3800; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
3801; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3802; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3803; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3804; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
3805; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3806; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3807; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3808; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3809; AVX2-NEXT:    vzeroupper
3810; AVX2-NEXT:    retq
3811;
3812; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
3813; AVX512:       # %bb.0:
3814; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3815; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3816; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3817; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3818; AVX512-NEXT:    vzeroupper
3819; AVX512-NEXT:    retq
3820  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3821  %2 = trunc <16 x i64> %1 to <16 x i8>
3822  ret <16 x i8> %2
3823}
3824
3825define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3826; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
3827; SSE:       # %bb.0:
3828; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3829; SSE-NEXT:    pand %xmm4, %xmm3
3830; SSE-NEXT:    pand %xmm4, %xmm2
3831; SSE-NEXT:    packuswb %xmm3, %xmm2
3832; SSE-NEXT:    pand %xmm4, %xmm1
3833; SSE-NEXT:    pand %xmm4, %xmm0
3834; SSE-NEXT:    packuswb %xmm1, %xmm0
3835; SSE-NEXT:    packuswb %xmm2, %xmm0
3836; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3837; SSE-NEXT:    retq
3838;
3839; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
3840; AVX1:       # %bb.0:
3841; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3842; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3843; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3844; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3845; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3846; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3847; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3848; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3849; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3850; AVX1-NEXT:    vzeroupper
3851; AVX1-NEXT:    retq
3852;
3853; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
3854; AVX2:       # %bb.0:
3855; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3856; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
3857; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3858; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3859; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3860; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3861; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3862; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3863; AVX2-NEXT:    vzeroupper
3864; AVX2-NEXT:    retq
3865;
3866; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
3867; AVX512:       # %bb.0:
3868; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3869; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3870; AVX512-NEXT:    vzeroupper
3871; AVX512-NEXT:    retq
3872  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3873  %2 = trunc <16 x i32> %1 to <16 x i8>
3874  ret <16 x i8> %2
3875}
3876
3877define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3878; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
3879; SSE:       # %bb.0:
3880; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3881; SSE-NEXT:    pand %xmm2, %xmm1
3882; SSE-NEXT:    pand %xmm2, %xmm0
3883; SSE-NEXT:    packuswb %xmm1, %xmm0
3884; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3885; SSE-NEXT:    retq
3886;
3887; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
3888; AVX1:       # %bb.0:
3889; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3890; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3891; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3892; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3893; AVX1-NEXT:    vzeroupper
3894; AVX1-NEXT:    retq
3895;
3896; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
3897; AVX2:       # %bb.0:
3898; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3899; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3900; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3901; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3902; AVX2-NEXT:    vzeroupper
3903; AVX2-NEXT:    retq
3904;
3905; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
3906; AVX512F:       # %bb.0:
3907; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3908; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3909; AVX512F-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3910; AVX512F-NEXT:    vzeroupper
3911; AVX512F-NEXT:    retq
3912;
3913; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
3914; AVX512BW:       # %bb.0:
3915; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3916; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3917; AVX512BW-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3918; AVX512BW-NEXT:    vzeroupper
3919; AVX512BW-NEXT:    retq
3920;
3921; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
3922; AVX512DQ:       # %bb.0:
3923; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3924; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3925; AVX512DQ-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3926; AVX512DQ-NEXT:    vzeroupper
3927; AVX512DQ-NEXT:    retq
3928  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3929  %2 = trunc <16 x i16> %1 to <16 x i8>
3930  ret <16 x i8> %2
3931}
3932
3933;
3934; or
3935;
3936
3937define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3938; SSE-LABEL: trunc_or_v4i64_v4i32:
3939; SSE:       # %bb.0:
3940; SSE-NEXT:    orps %xmm3, %xmm1
3941; SSE-NEXT:    orps %xmm2, %xmm0
3942; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3943; SSE-NEXT:    retq
3944;
3945; AVX1-LABEL: trunc_or_v4i64_v4i32:
3946; AVX1:       # %bb.0:
3947; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
3948; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3949; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3950; AVX1-NEXT:    vzeroupper
3951; AVX1-NEXT:    retq
3952;
3953; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
3954; AVX2-SLOW:       # %bb.0:
3955; AVX2-SLOW-NEXT:    vorps %ymm1, %ymm0, %ymm0
3956; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3957; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3958; AVX2-SLOW-NEXT:    vzeroupper
3959; AVX2-SLOW-NEXT:    retq
3960;
3961; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
3962; AVX2-FAST-ALL:       # %bb.0:
3963; AVX2-FAST-ALL-NEXT:    vorps %ymm1, %ymm0, %ymm0
3964; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3965; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3966; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3967; AVX2-FAST-ALL-NEXT:    vzeroupper
3968; AVX2-FAST-ALL-NEXT:    retq
3969;
3970; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
3971; AVX2-FAST-PERLANE:       # %bb.0:
3972; AVX2-FAST-PERLANE-NEXT:    vorps %ymm1, %ymm0, %ymm0
3973; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
3974; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3975; AVX2-FAST-PERLANE-NEXT:    vzeroupper
3976; AVX2-FAST-PERLANE-NEXT:    retq
3977;
3978; AVX512-LABEL: trunc_or_v4i64_v4i32:
3979; AVX512:       # %bb.0:
3980; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
3981; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3982; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3983; AVX512-NEXT:    vzeroupper
3984; AVX512-NEXT:    retq
3985  %1 = or <4 x i64> %a0, %a1
3986  %2 = trunc <4 x i64> %1 to <4 x i32>
3987  ret <4 x i32> %2
3988}
3989
3990define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3991; SSE-LABEL: trunc_or_v8i64_v8i16:
3992; SSE:       # %bb.0:
3993; SSE-NEXT:    por %xmm6, %xmm2
3994; SSE-NEXT:    por %xmm7, %xmm3
3995; SSE-NEXT:    por %xmm4, %xmm0
3996; SSE-NEXT:    por %xmm5, %xmm1
3997; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3998; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3999; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4000; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4001; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4002; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4003; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4004; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4005; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4006; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4007; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4008; SSE-NEXT:    retq
4009;
4010; AVX1-LABEL: trunc_or_v8i64_v8i16:
4011; AVX1:       # %bb.0:
4012; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4013; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4014; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4015; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4016; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4017; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4018; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4019; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4020; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4021; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4022; AVX1-NEXT:    vzeroupper
4023; AVX1-NEXT:    retq
4024;
4025; AVX2-LABEL: trunc_or_v8i64_v8i16:
4026; AVX2:       # %bb.0:
4027; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4028; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4029; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4030; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4031; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4032; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4033; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4034; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4035; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4036; AVX2-NEXT:    vzeroupper
4037; AVX2-NEXT:    retq
4038;
4039; AVX512-LABEL: trunc_or_v8i64_v8i16:
4040; AVX512:       # %bb.0:
4041; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
4042; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4043; AVX512-NEXT:    vzeroupper
4044; AVX512-NEXT:    retq
4045  %1 = or <8 x i64> %a0, %a1
4046  %2 = trunc <8 x i64> %1 to <8 x i16>
4047  ret <8 x i16> %2
4048}
4049
4050define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4051; SSE-LABEL: trunc_or_v8i32_v8i16:
4052; SSE:       # %bb.0:
4053; SSE-NEXT:    por %xmm2, %xmm0
4054; SSE-NEXT:    por %xmm3, %xmm1
4055; SSE-NEXT:    pslld $16, %xmm1
4056; SSE-NEXT:    psrad $16, %xmm1
4057; SSE-NEXT:    pslld $16, %xmm0
4058; SSE-NEXT:    psrad $16, %xmm0
4059; SSE-NEXT:    packssdw %xmm1, %xmm0
4060; SSE-NEXT:    retq
4061;
4062; AVX1-LABEL: trunc_or_v8i32_v8i16:
4063; AVX1:       # %bb.0:
4064; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4065; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4066; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4067; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4068; AVX1-NEXT:    vzeroupper
4069; AVX1-NEXT:    retq
4070;
4071; AVX2-LABEL: trunc_or_v8i32_v8i16:
4072; AVX2:       # %bb.0:
4073; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4074; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4075; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4076; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4077; AVX2-NEXT:    vzeroupper
4078; AVX2-NEXT:    retq
4079;
4080; AVX512-LABEL: trunc_or_v8i32_v8i16:
4081; AVX512:       # %bb.0:
4082; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
4083; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4084; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4085; AVX512-NEXT:    vzeroupper
4086; AVX512-NEXT:    retq
4087  %1 = or <8 x i32> %a0, %a1
4088  %2 = trunc <8 x i32> %1 to <8 x i16>
4089  ret <8 x i16> %2
4090}
4091
4092define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4093; SSE-LABEL: trunc_or_v16i64_v16i8:
4094; SSE:       # %bb.0:
4095; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
4096; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
4097; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
4098; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
4099; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
4100; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
4101; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
4102; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
4103; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4104; SSE-NEXT:    pand %xmm8, %xmm7
4105; SSE-NEXT:    pand %xmm8, %xmm6
4106; SSE-NEXT:    packuswb %xmm7, %xmm6
4107; SSE-NEXT:    pand %xmm8, %xmm5
4108; SSE-NEXT:    pand %xmm8, %xmm4
4109; SSE-NEXT:    packuswb %xmm5, %xmm4
4110; SSE-NEXT:    packuswb %xmm6, %xmm4
4111; SSE-NEXT:    pand %xmm8, %xmm3
4112; SSE-NEXT:    pand %xmm8, %xmm2
4113; SSE-NEXT:    packuswb %xmm3, %xmm2
4114; SSE-NEXT:    pand %xmm8, %xmm1
4115; SSE-NEXT:    pand %xmm8, %xmm0
4116; SSE-NEXT:    packuswb %xmm1, %xmm0
4117; SSE-NEXT:    packuswb %xmm2, %xmm0
4118; SSE-NEXT:    packuswb %xmm4, %xmm0
4119; SSE-NEXT:    retq
4120;
4121; AVX1-LABEL: trunc_or_v16i64_v16i8:
4122; AVX1:       # %bb.0:
4123; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
4124; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
4125; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
4126; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
4127; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4128; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
4129; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
4130; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
4131; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
4132; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
4133; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
4134; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
4135; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
4136; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4137; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4138; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
4139; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4140; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
4141; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4142; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4143; AVX1-NEXT:    vzeroupper
4144; AVX1-NEXT:    retq
4145;
4146; AVX2-LABEL: trunc_or_v16i64_v16i8:
4147; AVX2:       # %bb.0:
4148; AVX2-NEXT:    vpor %ymm4, %ymm0, %ymm0
4149; AVX2-NEXT:    vpor %ymm5, %ymm1, %ymm1
4150; AVX2-NEXT:    vpor %ymm6, %ymm2, %ymm2
4151; AVX2-NEXT:    vpor %ymm7, %ymm3, %ymm3
4152; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
4153; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
4154; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
4155; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
4156; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4157; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
4158; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
4159; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4160; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4161; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
4162; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4163; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4164; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4165; AVX2-NEXT:    vzeroupper
4166; AVX2-NEXT:    retq
4167;
4168; AVX512-LABEL: trunc_or_v16i64_v16i8:
4169; AVX512:       # %bb.0:
4170; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
4171; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
4172; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
4173; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
4174; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4175; AVX512-NEXT:    vzeroupper
4176; AVX512-NEXT:    retq
4177  %1 = or <16 x i64> %a0, %a1
4178  %2 = trunc <16 x i64> %1 to <16 x i8>
4179  ret <16 x i8> %2
4180}
4181
4182define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4183; SSE-LABEL: trunc_or_v16i32_v16i8:
4184; SSE:       # %bb.0:
4185; SSE-NEXT:    por %xmm4, %xmm0
4186; SSE-NEXT:    por %xmm5, %xmm1
4187; SSE-NEXT:    por %xmm6, %xmm2
4188; SSE-NEXT:    por %xmm7, %xmm3
4189; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4190; SSE-NEXT:    pand %xmm4, %xmm3
4191; SSE-NEXT:    pand %xmm4, %xmm2
4192; SSE-NEXT:    packuswb %xmm3, %xmm2
4193; SSE-NEXT:    pand %xmm4, %xmm1
4194; SSE-NEXT:    pand %xmm4, %xmm0
4195; SSE-NEXT:    packuswb %xmm1, %xmm0
4196; SSE-NEXT:    packuswb %xmm2, %xmm0
4197; SSE-NEXT:    retq
4198;
4199; AVX1-LABEL: trunc_or_v16i32_v16i8:
4200; AVX1:       # %bb.0:
4201; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4202; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4203; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4204; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4205; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4206; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4207; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4208; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4209; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4210; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4211; AVX1-NEXT:    vzeroupper
4212; AVX1-NEXT:    retq
4213;
4214; AVX2-LABEL: trunc_or_v16i32_v16i8:
4215; AVX2:       # %bb.0:
4216; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4217; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4218; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4219; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
4220; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
4221; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4222; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4223; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4224; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4225; AVX2-NEXT:    vzeroupper
4226; AVX2-NEXT:    retq
4227;
4228; AVX512-LABEL: trunc_or_v16i32_v16i8:
4229; AVX512:       # %bb.0:
4230; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
4231; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4232; AVX512-NEXT:    vzeroupper
4233; AVX512-NEXT:    retq
4234  %1 = or <16 x i32> %a0, %a1
4235  %2 = trunc <16 x i32> %1 to <16 x i8>
4236  ret <16 x i8> %2
4237}
4238
4239define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4240; SSE-LABEL: trunc_or_v16i16_v16i8:
4241; SSE:       # %bb.0:
4242; SSE-NEXT:    por %xmm2, %xmm0
4243; SSE-NEXT:    por %xmm3, %xmm1
4244; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4245; SSE-NEXT:    pand %xmm2, %xmm1
4246; SSE-NEXT:    pand %xmm2, %xmm0
4247; SSE-NEXT:    packuswb %xmm1, %xmm0
4248; SSE-NEXT:    retq
4249;
4250; AVX1-LABEL: trunc_or_v16i16_v16i8:
4251; AVX1:       # %bb.0:
4252; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4253; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4254; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4255; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4256; AVX1-NEXT:    vzeroupper
4257; AVX1-NEXT:    retq
4258;
4259; AVX2-LABEL: trunc_or_v16i16_v16i8:
4260; AVX2:       # %bb.0:
4261; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4262; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4263; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4264; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4265; AVX2-NEXT:    vzeroupper
4266; AVX2-NEXT:    retq
4267;
4268; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4269; AVX512F:       # %bb.0:
4270; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
4271; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4272; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4273; AVX512F-NEXT:    vzeroupper
4274; AVX512F-NEXT:    retq
4275;
4276; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4277; AVX512BW:       # %bb.0:
4278; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
4279; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4280; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4281; AVX512BW-NEXT:    vzeroupper
4282; AVX512BW-NEXT:    retq
4283;
4284; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4285; AVX512DQ:       # %bb.0:
4286; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
4287; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4288; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4289; AVX512DQ-NEXT:    vzeroupper
4290; AVX512DQ-NEXT:    retq
4291  %1 = or <16 x i16> %a0, %a1
4292  %2 = trunc <16 x i16> %1 to <16 x i8>
4293  ret <16 x i8> %2
4294}
4295
4296;
4297; or to constant
4298;
4299
4300define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4301; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4302; SSE:       # %bb.0:
4303; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4304; SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4305; SSE-NEXT:    retq
4306;
4307; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4308; AVX1:       # %bb.0:
4309; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4310; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4311; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4312; AVX1-NEXT:    vzeroupper
4313; AVX1-NEXT:    retq
4314;
4315; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4316; AVX2-SLOW:       # %bb.0:
4317; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
4318; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4319; AVX2-SLOW-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4320; AVX2-SLOW-NEXT:    vzeroupper
4321; AVX2-SLOW-NEXT:    retq
4322;
4323; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
4324; AVX2-FAST-ALL:       # %bb.0:
4325; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
4326; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
4327; AVX2-FAST-ALL-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4328; AVX2-FAST-ALL-NEXT:    vzeroupper
4329; AVX2-FAST-ALL-NEXT:    retq
4330;
4331; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
4332; AVX2-FAST-PERLANE:       # %bb.0:
4333; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
4334; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4335; AVX2-FAST-PERLANE-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4336; AVX2-FAST-PERLANE-NEXT:    vzeroupper
4337; AVX2-FAST-PERLANE-NEXT:    retq
4338;
4339; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4340; AVX512:       # %bb.0:
4341; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4342; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4343; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4344; AVX512-NEXT:    vzeroupper
4345; AVX512-NEXT:    retq
4346  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4347  %2 = trunc <4 x i64> %1 to <4 x i32>
4348  ret <4 x i32> %2
4349}
4350
4351define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4352; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4353; SSE:       # %bb.0:
4354; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4355; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4356; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4357; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4358; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4359; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4360; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4361; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4362; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4363; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4364; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4365; SSE-NEXT:    orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4366; SSE-NEXT:    retq
4367;
4368; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4369; AVX1:       # %bb.0:
4370; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4371; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4372; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4373; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4374; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4375; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4376; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4377; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4378; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4379; AVX1-NEXT:    vzeroupper
4380; AVX1-NEXT:    retq
4381;
4382; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
4383; AVX2:       # %bb.0:
4384; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4385; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
4386; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
4387; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4388; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4389; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4390; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4391; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4392; AVX2-NEXT:    vzeroupper
4393; AVX2-NEXT:    retq
4394;
4395; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4396; AVX512:       # %bb.0:
4397; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4398; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4399; AVX512-NEXT:    vzeroupper
4400; AVX512-NEXT:    retq
4401  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4402  %2 = trunc <8 x i64> %1 to <8 x i16>
4403  ret <8 x i16> %2
4404}
4405
4406define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4407; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4408; SSE:       # %bb.0:
4409; SSE-NEXT:    pslld $16, %xmm1
4410; SSE-NEXT:    psrad $16, %xmm1
4411; SSE-NEXT:    pslld $16, %xmm0
4412; SSE-NEXT:    psrad $16, %xmm0
4413; SSE-NEXT:    packssdw %xmm1, %xmm0
4414; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4415; SSE-NEXT:    retq
4416;
4417; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4418; AVX1:       # %bb.0:
4419; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4420; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4421; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4422; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4423; AVX1-NEXT:    vzeroupper
4424; AVX1-NEXT:    retq
4425;
4426; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4427; AVX2:       # %bb.0:
4428; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4429; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4430; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4431; AVX2-NEXT:    vzeroupper
4432; AVX2-NEXT:    retq
4433;
4434; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4435; AVX512:       # %bb.0:
4436; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4437; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4438; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4439; AVX512-NEXT:    vzeroupper
4440; AVX512-NEXT:    retq
4441  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4442  %2 = trunc <8 x i32> %1 to <8 x i16>
4443  ret <8 x i16> %2
4444}
4445
4446define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4447; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4448; SSE:       # %bb.0:
4449; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4450; SSE-NEXT:    pand %xmm8, %xmm7
4451; SSE-NEXT:    pand %xmm8, %xmm6
4452; SSE-NEXT:    packuswb %xmm7, %xmm6
4453; SSE-NEXT:    pand %xmm8, %xmm5
4454; SSE-NEXT:    pand %xmm8, %xmm4
4455; SSE-NEXT:    packuswb %xmm5, %xmm4
4456; SSE-NEXT:    packuswb %xmm6, %xmm4
4457; SSE-NEXT:    pand %xmm8, %xmm3
4458; SSE-NEXT:    pand %xmm8, %xmm2
4459; SSE-NEXT:    packuswb %xmm3, %xmm2
4460; SSE-NEXT:    pand %xmm8, %xmm1
4461; SSE-NEXT:    pand %xmm8, %xmm0
4462; SSE-NEXT:    packuswb %xmm1, %xmm0
4463; SSE-NEXT:    packuswb %xmm2, %xmm0
4464; SSE-NEXT:    packuswb %xmm4, %xmm0
4465; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4466; SSE-NEXT:    retq
4467;
4468; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
4469; AVX1:       # %bb.0:
4470; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4471; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
4472; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
4473; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
4474; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
4475; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
4476; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
4477; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
4478; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
4479; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4480; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4481; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
4482; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4483; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
4484; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4485; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4486; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4487; AVX1-NEXT:    vzeroupper
4488; AVX1-NEXT:    retq
4489;
4490; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
4491; AVX2:       # %bb.0:
4492; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
4493; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
4494; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
4495; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
4496; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
4497; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
4498; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
4499; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4500; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4501; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
4502; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4503; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4504; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4505; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4506; AVX2-NEXT:    vzeroupper
4507; AVX2-NEXT:    retq
4508;
4509; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
4510; AVX512:       # %bb.0:
4511; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
4512; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
4513; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4514; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4515; AVX512-NEXT:    vzeroupper
4516; AVX512-NEXT:    retq
4517  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4518  %2 = trunc <16 x i64> %1 to <16 x i8>
4519  ret <16 x i8> %2
4520}
4521
4522define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4523; SSE-LABEL: trunc_or_const_v16i32_v16i8:
4524; SSE:       # %bb.0:
4525; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4526; SSE-NEXT:    pand %xmm4, %xmm3
4527; SSE-NEXT:    pand %xmm4, %xmm2
4528; SSE-NEXT:    packuswb %xmm3, %xmm2
4529; SSE-NEXT:    pand %xmm4, %xmm1
4530; SSE-NEXT:    pand %xmm4, %xmm0
4531; SSE-NEXT:    packuswb %xmm1, %xmm0
4532; SSE-NEXT:    packuswb %xmm2, %xmm0
4533; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4534; SSE-NEXT:    retq
4535;
4536; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
4537; AVX1:       # %bb.0:
4538; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4539; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4540; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4541; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4542; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4543; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4544; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4545; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4546; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4547; AVX1-NEXT:    vzeroupper
4548; AVX1-NEXT:    retq
4549;
4550; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
4551; AVX2:       # %bb.0:
4552; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4553; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
4554; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
4555; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4556; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4557; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4558; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4559; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4560; AVX2-NEXT:    vzeroupper
4561; AVX2-NEXT:    retq
4562;
4563; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
4564; AVX512:       # %bb.0:
4565; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4566; AVX512-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4567; AVX512-NEXT:    vzeroupper
4568; AVX512-NEXT:    retq
4569  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4570  %2 = trunc <16 x i32> %1 to <16 x i8>
4571  ret <16 x i8> %2
4572}
4573
4574define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4575; SSE-LABEL: trunc_or_const_v16i16_v16i8:
4576; SSE:       # %bb.0:
4577; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4578; SSE-NEXT:    pand %xmm2, %xmm1
4579; SSE-NEXT:    pand %xmm2, %xmm0
4580; SSE-NEXT:    packuswb %xmm1, %xmm0
4581; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4582; SSE-NEXT:    retq
4583;
4584; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
4585; AVX1:       # %bb.0:
4586; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4587; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4588; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4589; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4590; AVX1-NEXT:    vzeroupper
4591; AVX1-NEXT:    retq
4592;
4593; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
4594; AVX2:       # %bb.0:
4595; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4596; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4597; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4598; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4599; AVX2-NEXT:    vzeroupper
4600; AVX2-NEXT:    retq
4601;
4602; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
4603; AVX512F:       # %bb.0:
4604; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4605; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4606; AVX512F-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4607; AVX512F-NEXT:    vzeroupper
4608; AVX512F-NEXT:    retq
4609;
4610; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
4611; AVX512BW:       # %bb.0:
4612; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4613; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4614; AVX512BW-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4615; AVX512BW-NEXT:    vzeroupper
4616; AVX512BW-NEXT:    retq
4617;
4618; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
4619; AVX512DQ:       # %bb.0:
4620; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4621; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4622; AVX512DQ-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4623; AVX512DQ-NEXT:    vzeroupper
4624; AVX512DQ-NEXT:    retq
4625  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4626  %2 = trunc <16 x i16> %1 to <16 x i8>
4627  ret <16 x i8> %2
4628}
4629
4630;
4631; complex patterns - often created by vectorizer
4632;
4633
4634define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4635; SSE-LABEL: mul_add_const_v4i64_v4i32:
4636; SSE:       # %bb.0:
4637; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4638; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
4639; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4640; SSE-NEXT:    pmuludq %xmm2, %xmm0
4641; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
4642; SSE-NEXT:    pmuludq %xmm3, %xmm1
4643; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4644; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4645; SSE-NEXT:    retq
4646;
4647; AVX-LABEL: mul_add_const_v4i64_v4i32:
4648; AVX:       # %bb.0:
4649; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
4650; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4651; AVX-NEXT:    retq
4652  %1 = sext <4 x i32> %a0 to <4 x i64>
4653  %2 = sext <4 x i32> %a1 to <4 x i64>
4654  %3 = mul <4 x i64> %1, %2
4655  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
4656  %5 = trunc <4 x i64> %4 to <4 x i32>
4657  ret <4 x i32> %5
4658}
4659
4660define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4661; SSE-LABEL: mul_add_self_v4i64_v4i32:
4662; SSE:       # %bb.0:
4663; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4664; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
4665; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
4666; SSE-NEXT:    pmuludq %xmm2, %xmm0
4667; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
4668; SSE-NEXT:    pmuludq %xmm3, %xmm1
4669; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4670; SSE-NEXT:    paddd %xmm0, %xmm0
4671; SSE-NEXT:    retq
4672;
4673; AVX-LABEL: mul_add_self_v4i64_v4i32:
4674; AVX:       # %bb.0:
4675; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
4676; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
4677; AVX-NEXT:    retq
4678  %1 = sext <4 x i32> %a0 to <4 x i64>
4679  %2 = sext <4 x i32> %a1 to <4 x i64>
4680  %3 = mul <4 x i64> %1, %2
4681  %4 = add <4 x i64> %3, %3
4682  %5 = trunc <4 x i64> %4 to <4 x i32>
4683  ret <4 x i32> %5
4684}
4685
4686define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
4687; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
4688; SSE:       # %bb.0:
4689; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4690; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
4691; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
4692; SSE-NEXT:    pmuludq %xmm2, %xmm4
4693; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
4694; SSE-NEXT:    pmuludq %xmm3, %xmm1
4695; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
4696; SSE-NEXT:    paddd %xmm4, %xmm0
4697; SSE-NEXT:    retq
4698;
4699; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
4700; AVX:       # %bb.0:
4701; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
4702; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
4703; AVX-NEXT:    retq
4704  %1 = sext <4 x i32> %a0 to <4 x i64>
4705  %2 = sext <4 x i32> %a1 to <4 x i64>
4706  %3 = mul <4 x i64> %1, %2
4707  %4 = add <4 x i64> %1, %3
4708  %5 = trunc <4 x i64> %4 to <4 x i32>
4709  ret <4 x i32> %5
4710}
4711