; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
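; Vector truncation lowering tests: plain trunc, and trunc of an ashr/lshr by
; the element half-width, from i64/i32/i16 element vectors. The AVX-512 targets
; can select the dedicated vpmovq*/vpmovd*/vpmovw* truncating moves, while the
; SSE/AVX targets build the narrowed result from shuffle and pack sequences.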

define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i32>
  ret <8 x i32> %0
}

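; Shifting each i64 right by 32 before truncating to i32 just selects the upper
; half of every element, so the pre-AVX512 lowerings reduce to odd-element
; shuffles, while AVX-512 uses vpsrlq followed by vpmovqd.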
define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i64_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}

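; The truncated <8 x i8> result is stored through an undef pointer to keep the
; value live; the (%rax) address in the checks is simply the register assigned
; to that undef pointer operand.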
define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    packuswb %xmm0, %xmm0
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %0
}

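; Only the low 16 bits of each shifted element survive the truncation to i16,
; so an arithmetic and a logical shift right by 16 may lower identically; the
; ashr and lshr variants below share the same per-target check lines.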
define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_ashr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16_ashr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16_ashr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pslld $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    pslld $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    pslld $16, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %0, <16 x i16>* undef, align 4
  ret void
}

define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_ashr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16_ashr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16_ashr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_ashr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $24, %xmm1
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    psrld $24, %xmm3
; SSE2-NEXT:    psrld $24, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8_ashr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $24, %xmm1
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    psrld $24, %xmm3
; SSSE3-NEXT:    psrld $24, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8_ashr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $24, %xmm1
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm3
; SSE41-NEXT:    psrld $24, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $24, %xmm1
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    psrld $24, %xmm3
; SSE2-NEXT:    psrld $24, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $24, %xmm1
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    psrld $24, %xmm3
; SSSE3-NEXT:    psrld $24, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $24, %xmm1
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm3
; SSE41-NEXT:    psrld $24, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

;PR25684
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE-LABEL: trunc32i16_32i8:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    movdqu %xmm2, (%rax)
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc32i16_32i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc32i16_32i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, (%rax)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, (%rax)
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, <32 x i8>* undef, align 4
  ret void
}

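; The trunc2x tests truncate two source vectors independently and concatenate
; the truncated halves with a shufflevector, so the lowering has to combine the
; per-vector truncations with the final concatenation.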
define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: trunc2x4i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i32:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i32>
  %1 = trunc <4 x i64> %b to <4 x i32>
  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %2
}

define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1454; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1455; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1456; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1457; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1458; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1459; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1460; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1461; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1462; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1463; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1464; SSSE3-NEXT:    retq
1465;
1466; SSE41-LABEL: trunc2x4i64_8i16:
1467; SSE41:       # %bb.0: # %entry
1468; SSE41-NEXT:    pxor %xmm4, %xmm4
1469; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
1470; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
1471; SSE41-NEXT:    packusdw %xmm3, %xmm2
1472; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
1473; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
1474; SSE41-NEXT:    packusdw %xmm1, %xmm0
1475; SSE41-NEXT:    packusdw %xmm2, %xmm0
1476; SSE41-NEXT:    retq
1477;
1478; AVX1-LABEL: trunc2x4i64_8i16:
1479; AVX1:       # %bb.0: # %entry
1480; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1481; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1482; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
1483; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
1484; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
1485; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1486; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
1487; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
1488; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1489; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1490; AVX1-NEXT:    vzeroupper
1491; AVX1-NEXT:    retq
1492;
1493; AVX2-LABEL: trunc2x4i64_8i16:
1494; AVX2:       # %bb.0: # %entry
1495; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1496; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
1497; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
1498; AVX2-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1499; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
1500; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1501; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1502; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1503; AVX2-NEXT:    vzeroupper
1504; AVX2-NEXT:    retq
1505;
1506; AVX512F-LABEL: trunc2x4i64_8i16:
1507; AVX512F:       # %bb.0: # %entry
1508; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1509; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1510; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
1511; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
1512; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1513; AVX512F-NEXT:    vzeroupper
1514; AVX512F-NEXT:    retq
1515;
1516; AVX512VL-LABEL: trunc2x4i64_8i16:
1517; AVX512VL:       # %bb.0: # %entry
1518; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
1519; AVX512VL-NEXT:    vpmovqw %ymm1, %xmm1
1520; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1521; AVX512VL-NEXT:    vzeroupper
1522; AVX512VL-NEXT:    retq
1523;
1524; AVX512BW-LABEL: trunc2x4i64_8i16:
1525; AVX512BW:       # %bb.0: # %entry
1526; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1527; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1528; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
1529; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
1530; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1531; AVX512BW-NEXT:    vzeroupper
1532; AVX512BW-NEXT:    retq
1533;
1534; AVX512BWVL-LABEL: trunc2x4i64_8i16:
1535; AVX512BWVL:       # %bb.0: # %entry
1536; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
1537; AVX512BWVL-NEXT:    vpmovqw %ymm1, %xmm1
1538; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1539; AVX512BWVL-NEXT:    vzeroupper
1540; AVX512BWVL-NEXT:    retq
1541entry:
1542  %0 = trunc <4 x i64> %a to <4 x i16>
1543  %1 = trunc <4 x i64> %b to <4 x i16>
1544  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1545  ret <8 x i16> %2
1546}
1547
1548define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
1549; SSE-LABEL: trunc2x2i64_4i32:
1550; SSE:       # %bb.0: # %entry
1551; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1552; SSE-NEXT:    retq
1553;
1554; AVX-LABEL: trunc2x2i64_4i32:
1555; AVX:       # %bb.0: # %entry
1556; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1557; AVX-NEXT:    retq
1558;
1559; AVX512F-LABEL: trunc2x2i64_4i32:
1560; AVX512F:       # %bb.0: # %entry
1561; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1562; AVX512F-NEXT:    retq
1563;
1564; AVX512VL-LABEL: trunc2x2i64_4i32:
1565; AVX512VL:       # %bb.0: # %entry
1566; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1567; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1568; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
1569; AVX512VL-NEXT:    vzeroupper
1570; AVX512VL-NEXT:    retq
1571;
1572; AVX512BW-LABEL: trunc2x2i64_4i32:
1573; AVX512BW:       # %bb.0: # %entry
1574; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1575; AVX512BW-NEXT:    retq
1576;
1577; AVX512BWVL-LABEL: trunc2x2i64_4i32:
1578; AVX512BWVL:       # %bb.0: # %entry
1579; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1580; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1581; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
1582; AVX512BWVL-NEXT:    vzeroupper
1583; AVX512BWVL-NEXT:    retq
1584entry:
1585  %0 = trunc <2 x i64> %a to <2 x i32>
1586  %1 = trunc <2 x i64> %b to <2 x i32>
1587  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1588  ret <4 x i32> %2
1589}
1590
1591define i64 @trunc2i64_i64(<2 x i64> %inval) {
1592; SSE-LABEL: trunc2i64_i64:
1593; SSE:       # %bb.0: # %entry
1594; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1595; SSE-NEXT:    movq %xmm0, %rax
1596; SSE-NEXT:    retq
1597;
1598; AVX-LABEL: trunc2i64_i64:
1599; AVX:       # %bb.0: # %entry
1600; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1601; AVX-NEXT:    vmovq %xmm0, %rax
1602; AVX-NEXT:    retq
1603;
1604; AVX512-LABEL: trunc2i64_i64:
1605; AVX512:       # %bb.0: # %entry
1606; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1607; AVX512-NEXT:    vmovq %xmm0, %rax
1608; AVX512-NEXT:    retq
1609entry:
1610  %0 = trunc <2 x i64> %inval to <2 x i32>
1611  %1 = bitcast <2 x i32> %0 to i64
1612  ret i64 %1
1613}
1614
1615define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
1616; SSE2-LABEL: trunc2x4i32_8i16:
1617; SSE2:       # %bb.0: # %entry
1618; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1619; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1620; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1621; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1622; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
1623; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1624; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1625; SSE2-NEXT:    retq
1626;
1627; SSSE3-LABEL: trunc2x4i32_8i16:
1628; SSSE3:       # %bb.0: # %entry
1629; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1630; SSSE3-NEXT:    pshufb %xmm2, %xmm1
1631; SSSE3-NEXT:    pshufb %xmm2, %xmm0
1632; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1633; SSSE3-NEXT:    retq
1634;
1635; SSE41-LABEL: trunc2x4i32_8i16:
1636; SSE41:       # %bb.0: # %entry
1637; SSE41-NEXT:    pxor %xmm2, %xmm2
1638; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
1639; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1640; SSE41-NEXT:    packusdw %xmm1, %xmm0
1641; SSE41-NEXT:    retq
1642;
1643; AVX-LABEL: trunc2x4i32_8i16:
1644; AVX:       # %bb.0: # %entry
1645; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1646; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
1647; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1648; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1649; AVX-NEXT:    retq
1650;
1651; AVX512F-LABEL: trunc2x4i32_8i16:
1652; AVX512F:       # %bb.0: # %entry
1653; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1654; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1655; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
1656; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1657; AVX512F-NEXT:    vzeroupper
1658; AVX512F-NEXT:    retq
1659;
1660; AVX512VL-LABEL: trunc2x4i32_8i16:
1661; AVX512VL:       # %bb.0: # %entry
1662; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1663; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1664; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
1665; AVX512VL-NEXT:    vzeroupper
1666; AVX512VL-NEXT:    retq
1667;
1668; AVX512BW-LABEL: trunc2x4i32_8i16:
1669; AVX512BW:       # %bb.0: # %entry
1670; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1671; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1672; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
1673; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1674; AVX512BW-NEXT:    vzeroupper
1675; AVX512BW-NEXT:    retq
1676;
1677; AVX512BWVL-LABEL: trunc2x4i32_8i16:
1678; AVX512BWVL:       # %bb.0: # %entry
1679; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1680; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1681; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
1682; AVX512BWVL-NEXT:    vzeroupper
1683; AVX512BWVL-NEXT:    retq
1684entry:
1685  %0 = trunc <4 x i32> %a to <4 x i16>
1686  %1 = trunc <4 x i32> %b to <4 x i16>
1687  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1688  ret <8 x i16> %2
1689}
1690
1691; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
1692define i64 @trunc4i32_i64(<4 x i32> %inval) {
1693; SSE2-LABEL: trunc4i32_i64:
1694; SSE2:       # %bb.0: # %entry
1695; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1696; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
1697; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1698; SSE2-NEXT:    movq %xmm0, %rax
1699; SSE2-NEXT:    retq
1700;
1701; SSSE3-LABEL: trunc4i32_i64:
1702; SSSE3:       # %bb.0: # %entry
1703; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
1704; SSSE3-NEXT:    movq %xmm0, %rax
1705; SSSE3-NEXT:    retq
1706;
1707; SSE41-LABEL: trunc4i32_i64:
1708; SSE41:       # %bb.0: # %entry
1709; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
1710; SSE41-NEXT:    movq %xmm0, %rax
1711; SSE41-NEXT:    retq
1712;
1713; AVX-LABEL: trunc4i32_i64:
1714; AVX:       # %bb.0: # %entry
1715; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
1716; AVX-NEXT:    vmovq %xmm0, %rax
1717; AVX-NEXT:    retq
1718;
1719; AVX512F-LABEL: trunc4i32_i64:
1720; AVX512F:       # %bb.0: # %entry
1721; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
1722; AVX512F-NEXT:    vmovq %xmm0, %rax
1723; AVX512F-NEXT:    retq
1724;
1725; AVX512VL-LABEL: trunc4i32_i64:
1726; AVX512VL:       # %bb.0: # %entry
1727; AVX512VL-NEXT:    vpmovdw %xmm0, %xmm0
1728; AVX512VL-NEXT:    vmovq %xmm0, %rax
1729; AVX512VL-NEXT:    retq
1730;
1731; AVX512BW-LABEL: trunc4i32_i64:
1732; AVX512BW:       # %bb.0: # %entry
1733; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
1734; AVX512BW-NEXT:    vmovq %xmm0, %rax
1735; AVX512BW-NEXT:    retq
1736;
1737; AVX512BWVL-LABEL: trunc4i32_i64:
1738; AVX512BWVL:       # %bb.0: # %entry
1739; AVX512BWVL-NEXT:    vpmovdw %xmm0, %xmm0
1740; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
1741; AVX512BWVL-NEXT:    retq
1742entry:
1743  %0 = trunc <4 x i32> %inval to <4 x i16>
1744  %1 = bitcast <4 x i16> %0 to i64
1745  ret i64 %1
1746}
1747
1748define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
1749; SSE2-LABEL: trunc2x8i16_16i8:
1750; SSE2:       # %bb.0: # %entry
1751; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1752; SSE2-NEXT:    pand %xmm2, %xmm0
1753; SSE2-NEXT:    pand %xmm2, %xmm1
1754; SSE2-NEXT:    packuswb %xmm1, %xmm0
1755; SSE2-NEXT:    retq
1756;
1757; SSSE3-LABEL: trunc2x8i16_16i8:
1758; SSSE3:       # %bb.0: # %entry
1759; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1760; SSSE3-NEXT:    pand %xmm2, %xmm1
1761; SSSE3-NEXT:    pand %xmm2, %xmm0
1762; SSSE3-NEXT:    packuswb %xmm1, %xmm0
1763; SSSE3-NEXT:    retq
1764;
1765; SSE41-LABEL: trunc2x8i16_16i8:
1766; SSE41:       # %bb.0: # %entry
1767; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1768; SSE41-NEXT:    pand %xmm2, %xmm1
1769; SSE41-NEXT:    pand %xmm2, %xmm0
1770; SSE41-NEXT:    packuswb %xmm1, %xmm0
1771; SSE41-NEXT:    retq
1772;
1773; AVX-LABEL: trunc2x8i16_16i8:
1774; AVX:       # %bb.0: # %entry
1775; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1776; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
1777; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
1778; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1779; AVX-NEXT:    retq
1780;
1781; AVX512F-LABEL: trunc2x8i16_16i8:
1782; AVX512F:       # %bb.0: # %entry
1783; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1784; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
1785; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
1786; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1787; AVX512F-NEXT:    retq
1788;
1789; AVX512VL-LABEL: trunc2x8i16_16i8:
1790; AVX512VL:       # %bb.0: # %entry
1791; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1792; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
1793; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
1794; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1795; AVX512VL-NEXT:    retq
1796;
1797; AVX512BW-LABEL: trunc2x8i16_16i8:
1798; AVX512BW:       # %bb.0: # %entry
1799; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1800; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1801; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1802; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1803; AVX512BW-NEXT:    vzeroupper
1804; AVX512BW-NEXT:    retq
1805;
1806; AVX512BWVL-LABEL: trunc2x8i16_16i8:
1807; AVX512BWVL:       # %bb.0: # %entry
1808; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1809; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1810; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1811; AVX512BWVL-NEXT:    vzeroupper
1812; AVX512BWVL-NEXT:    retq
1813entry:
1814  %0 = trunc <8 x i16> %a to <8 x i8>
1815  %1 = trunc <8 x i16> %b to <8 x i8>
1816  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1817  ret <16 x i8> %2
1818}
1819
1820; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
1821define i64 @trunc8i16_i64(<8 x i16> %inval) {
1822; SSE2-LABEL: trunc8i16_i64:
1823; SSE2:       # %bb.0: # %entry
1824; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1825; SSE2-NEXT:    packuswb %xmm0, %xmm0
1826; SSE2-NEXT:    movq %xmm0, %rax
1827; SSE2-NEXT:    retq
1828;
1829; SSSE3-LABEL: trunc8i16_i64:
1830; SSSE3:       # %bb.0: # %entry
1831; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1832; SSSE3-NEXT:    movq %xmm0, %rax
1833; SSSE3-NEXT:    retq
1834;
1835; SSE41-LABEL: trunc8i16_i64:
1836; SSE41:       # %bb.0: # %entry
1837; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1838; SSE41-NEXT:    movq %xmm0, %rax
1839; SSE41-NEXT:    retq
1840;
1841; AVX-LABEL: trunc8i16_i64:
1842; AVX:       # %bb.0: # %entry
1843; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1844; AVX-NEXT:    vmovq %xmm0, %rax
1845; AVX-NEXT:    retq
1846;
1847; AVX512F-LABEL: trunc8i16_i64:
1848; AVX512F:       # %bb.0: # %entry
1849; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1850; AVX512F-NEXT:    vmovq %xmm0, %rax
1851; AVX512F-NEXT:    retq
1852;
1853; AVX512VL-LABEL: trunc8i16_i64:
1854; AVX512VL:       # %bb.0: # %entry
1855; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1856; AVX512VL-NEXT:    vmovq %xmm0, %rax
1857; AVX512VL-NEXT:    retq
1858;
1859; AVX512BW-LABEL: trunc8i16_i64:
1860; AVX512BW:       # %bb.0: # %entry
1861; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
1862; AVX512BW-NEXT:    vmovq %xmm0, %rax
1863; AVX512BW-NEXT:    retq
1864;
1865; AVX512BWVL-LABEL: trunc8i16_i64:
1866; AVX512BWVL:       # %bb.0: # %entry
1867; AVX512BWVL-NEXT:    vpmovwb %xmm0, %xmm0
1868; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
1869; AVX512BWVL-NEXT:    retq
1870entry:
1871  %0 = trunc <8 x i16> %inval to <8 x i8>
1872  %1 = bitcast <8 x i8> %0 to i64
1873  ret i64 %1
1874}
1875
1876define <16 x i8> @trunc16i64_16i8_const() {
1877; SSE-LABEL: trunc16i64_16i8_const:
1878; SSE:       # %bb.0: # %entry
1879; SSE-NEXT:    xorps %xmm0, %xmm0
1880; SSE-NEXT:    retq
1881;
1882; AVX-LABEL: trunc16i64_16i8_const:
1883; AVX:       # %bb.0: # %entry
1884; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1885; AVX-NEXT:    retq
1886;
1887; AVX512-LABEL: trunc16i64_16i8_const:
1888; AVX512:       # %bb.0: # %entry
1889; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1890; AVX512-NEXT:    retq
1891
1892entry:
1893  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
1894  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
1895  ret <16 x i8> %1
1896}
1897
1898define <8 x i16> @PR32160(<8 x i32> %x) {
1899; SSE-LABEL: PR32160:
1900; SSE:       # %bb.0:
1901; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1902; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1903; SSE-NEXT:    retq
1904;
1905; AVX-LABEL: PR32160:
1906; AVX:       # %bb.0:
1907; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
1908; AVX-NEXT:    vzeroupper
1909; AVX-NEXT:    retq
1910;
1911; AVX512F-LABEL: PR32160:
1912; AVX512F:       # %bb.0:
1913; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1914; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
1915; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1916; AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
1917; AVX512F-NEXT:    vzeroupper
1918; AVX512F-NEXT:    retq
1919;
1920; AVX512VL-LABEL: PR32160:
1921; AVX512VL:       # %bb.0:
1922; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
1923; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
1924; AVX512VL-NEXT:    vzeroupper
1925; AVX512VL-NEXT:    retq
1926;
1927; AVX512BW-LABEL: PR32160:
1928; AVX512BW:       # %bb.0:
1929; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1930; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
1931; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
1932; AVX512BW-NEXT:    vzeroupper
1933; AVX512BW-NEXT:    retq
1934;
1935; AVX512BWVL-LABEL: PR32160:
1936; AVX512BWVL:       # %bb.0:
1937; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
1938; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
1939; AVX512BWVL-NEXT:    vzeroupper
1940; AVX512BWVL-NEXT:    retq
1941  %shuf = trunc <8 x i32> %x to <8 x i16>
1942  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1943  ret <8 x i16> %trunc
1944}
1945
1946define void @PR34773(i16* %a0, i8* %a1) {
1947; SSE-LABEL: PR34773:
1948; SSE:       # %bb.0:
1949; SSE-NEXT:    movdqu (%rdi), %xmm0
1950; SSE-NEXT:    movdqu 16(%rdi), %xmm1
1951; SSE-NEXT:    movdqu 32(%rdi), %xmm2
1952; SSE-NEXT:    movdqu 48(%rdi), %xmm3
1953; SSE-NEXT:    psrlw $8, %xmm1
1954; SSE-NEXT:    psrlw $8, %xmm0
1955; SSE-NEXT:    packuswb %xmm1, %xmm0
1956; SSE-NEXT:    psrlw $8, %xmm3
1957; SSE-NEXT:    psrlw $8, %xmm2
1958; SSE-NEXT:    packuswb %xmm3, %xmm2
1959; SSE-NEXT:    movdqu %xmm0, (%rsi)
1960; SSE-NEXT:    movdqu %xmm2, 16(%rsi)
1961; SSE-NEXT:    retq
1962;
1963; AVX1-LABEL: PR34773:
1964; AVX1:       # %bb.0:
1965; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
1966; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
1967; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
1968; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
1969; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1970; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1971; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1972; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
1973; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
1974; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
1975; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
1976; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
1977; AVX1-NEXT:    retq
1978;
1979; AVX2-LABEL: PR34773:
1980; AVX2:       # %bb.0:
1981; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
1982; AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
1983; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1984; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
1985; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1986; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1987; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1988; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
1989; AVX2-NEXT:    vmovdqu %xmm0, (%rsi)
1990; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi)
1991; AVX2-NEXT:    vzeroupper
1992; AVX2-NEXT:    retq
1993;
1994; AVX512F-LABEL: PR34773:
1995; AVX512F:       # %bb.0:
1996; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
1997; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
1998; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
1999; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
2000; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2001; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
2002; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
2003; AVX512F-NEXT:    vpmovdb %zmm0, 16(%rsi)
2004; AVX512F-NEXT:    vzeroupper
2005; AVX512F-NEXT:    retq
2006;
2007; AVX512VL-LABEL: PR34773:
2008; AVX512VL:       # %bb.0:
2009; AVX512VL-NEXT:    vmovdqu (%rdi), %ymm0
2010; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
2011; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
2012; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
2013; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2014; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
2015; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
2016; AVX512VL-NEXT:    vpmovdb %zmm0, 16(%rsi)
2017; AVX512VL-NEXT:    vzeroupper
2018; AVX512VL-NEXT:    retq
2019;
2020; AVX512BW-LABEL: PR34773:
2021; AVX512BW:       # %bb.0:
2022; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
2023; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm1
2024; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
2025; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
2026; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2027; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
2028; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
2029; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi)
2030; AVX512BW-NEXT:    vzeroupper
2031; AVX512BW-NEXT:    retq
2032;
2033; AVX512BWVL-LABEL: PR34773:
2034; AVX512BWVL:       # %bb.0:
2035; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
2036; AVX512BWVL-NEXT:    vpsrlw $8, 32(%rdi), %ymm1
2037; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
2038; AVX512BWVL-NEXT:    vpmovwb %ymm1, 16(%rsi)
2039; AVX512BWVL-NEXT:    vzeroupper
2040; AVX512BWVL-NEXT:    retq
2041  %1  = getelementptr i16, i16* %a0, i64 16
2042  %2  = getelementptr i8, i8* %a1, i64 16
2043  %3  = bitcast i16* %a0 to <16 x i16>*
2044  %4  = bitcast i16* %1 to <16 x i16>*
2045  %5  = bitcast i8* %a1 to <16 x i8>*
2046  %6  = bitcast i8* %2 to <16 x i8>*
2047  %7  = load <16 x i16>, <16 x i16>* %3, align 2
2048  %8  = load <16 x i16>, <16 x i16>* %4, align 2
2049  %9  = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2050  %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2051  %11 = trunc <16 x i16> %9  to <16 x i8>
2052  %12 = trunc <16 x i16> %10 to <16 x i8>
2053  store <16 x i8> %11, <16 x i8>* %5, align 1
2054  store <16 x i8> %12, <16 x i8>* %6, align 1
2055  ret void
2056}
2057
2058; Store merging must not infinitely fight store splitting.
2059
2060define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 {
2061; SSE2-LABEL: store_merge_split:
2062; SSE2:       # %bb.0:
2063; SSE2-NEXT:    pslld $16, %xmm1
2064; SSE2-NEXT:    psrad $16, %xmm1
2065; SSE2-NEXT:    pslld $16, %xmm0
2066; SSE2-NEXT:    psrad $16, %xmm0
2067; SSE2-NEXT:    packssdw %xmm1, %xmm0
2068; SSE2-NEXT:    pslld $16, %xmm3
2069; SSE2-NEXT:    psrad $16, %xmm3
2070; SSE2-NEXT:    pslld $16, %xmm2
2071; SSE2-NEXT:    psrad $16, %xmm2
2072; SSE2-NEXT:    packssdw %xmm3, %xmm2
2073; SSE2-NEXT:    shlq $4, %rdi
2074; SSE2-NEXT:    movdqu %xmm0, (%rsi,%rdi)
2075; SSE2-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
2076; SSE2-NEXT:    retq
2077;
2078; SSSE3-LABEL: store_merge_split:
2079; SSSE3:       # %bb.0:
2080; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2081; SSSE3-NEXT:    pshufb %xmm4, %xmm1
2082; SSSE3-NEXT:    pshufb %xmm4, %xmm0
2083; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2084; SSSE3-NEXT:    pshufb %xmm4, %xmm3
2085; SSSE3-NEXT:    pshufb %xmm4, %xmm2
2086; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2087; SSSE3-NEXT:    shlq $4, %rdi
2088; SSSE3-NEXT:    movdqu %xmm0, (%rsi,%rdi)
2089; SSSE3-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
2090; SSSE3-NEXT:    retq
2091;
2092; SSE41-LABEL: store_merge_split:
2093; SSE41:       # %bb.0:
2094; SSE41-NEXT:    pxor %xmm4, %xmm4
2095; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
2096; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
2097; SSE41-NEXT:    packusdw %xmm1, %xmm0
2098; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
2099; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
2100; SSE41-NEXT:    packusdw %xmm3, %xmm2
2101; SSE41-NEXT:    shlq $4, %rdi
2102; SSE41-NEXT:    movdqu %xmm0, (%rsi,%rdi)
2103; SSE41-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
2104; SSE41-NEXT:    retq
2105;
2106; AVX1-LABEL: store_merge_split:
2107; AVX1:       # %bb.0:
2108; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2109; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2110; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2111; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2112; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
2113; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2114; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2115; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2116; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
2117; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2118; AVX1-NEXT:    shlq $4, %rdi
2119; AVX1-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
2120; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
2121; AVX1-NEXT:    vzeroupper
2122; AVX1-NEXT:    retq
2123;
2124; AVX2-LABEL: store_merge_split:
2125; AVX2:       # %bb.0:
2126; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2127; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2128; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2129; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2130; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2131; AVX2-NEXT:    shlq $4, %rdi
2132; AVX2-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
2133; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
2134; AVX2-NEXT:    vzeroupper
2135; AVX2-NEXT:    retq
2136;
2137; AVX512F-LABEL: store_merge_split:
2138; AVX512F:       # %bb.0:
2139; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2140; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2141; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
2142; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
2143; AVX512F-NEXT:    shlq $4, %rdi
2144; AVX512F-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
2145; AVX512F-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
2146; AVX512F-NEXT:    vzeroupper
2147; AVX512F-NEXT:    retq
2148;
2149; AVX512VL-LABEL: store_merge_split:
2150; AVX512VL:       # %bb.0:
2151; AVX512VL-NEXT:    shlq $4, %rdi
2152; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi,%rdi)
2153; AVX512VL-NEXT:    vpmovdw %ymm1, 16(%rsi,%rdi)
2154; AVX512VL-NEXT:    vzeroupper
2155; AVX512VL-NEXT:    retq
2156;
2157; AVX512BW-LABEL: store_merge_split:
2158; AVX512BW:       # %bb.0:
2159; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2160; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2161; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
2162; AVX512BW-NEXT:    vpmovdw %zmm1, %ymm1
2163; AVX512BW-NEXT:    shlq $4, %rdi
2164; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
2165; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
2166; AVX512BW-NEXT:    vzeroupper
2167; AVX512BW-NEXT:    retq
2168;
2169; AVX512BWVL-LABEL: store_merge_split:
2170; AVX512BWVL:       # %bb.0:
2171; AVX512BWVL-NEXT:    shlq $4, %rdi
2172; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi,%rdi)
2173; AVX512BWVL-NEXT:    vpmovdw %ymm1, 16(%rsi,%rdi)
2174; AVX512BWVL-NEXT:    vzeroupper
2175; AVX512BWVL-NEXT:    retq
2176  %t1 = trunc <8 x i32> %w1 to <8 x i16>
2177  %t2 = trunc <8 x i32> %w2 to <8 x i16>
2178  %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx
2179  %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1
2180  store <8 x i16> %t1, <8 x i16>* %g1, align 2
2181  store <8 x i16> %t2, <8 x i16>* %g2, align 2
2182  ret void
2183}
2184