; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i32>
  ret <8 x i32> %0
}

define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i64_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}

define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    packuswb %xmm0, %xmm0
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, ptr undef, align 4
  ret void
}

define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %0
}

define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_ashr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16_ashr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16_ashr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pslld $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    pslld $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    pslld $16, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %0, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_ashr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16_ashr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16_ashr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %0, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_ashr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $24, %xmm1
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    psrld $24, %xmm3
; SSE2-NEXT:    psrld $24, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8_ashr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $24, %xmm1
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    psrld $24, %xmm3
; SSSE3-NEXT:    psrld $24, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8_ashr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $24, %xmm1
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm3
; SSE41-NEXT:    psrld $24, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $24, %xmm1
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    psrld $24, %xmm3
; SSE2-NEXT:    psrld $24, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $24, %xmm1
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    psrld $24, %xmm3
; SSSE3-NEXT:    psrld $24, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $24, %xmm1
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm3
; SSE41-NEXT:    psrld $24, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

;PR25684
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  store <16 x i8> %0, ptr undef, align 4
  ret void
}

define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE-LABEL: trunc32i16_32i8:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    movdqu %xmm2, (%rax)
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc32i16_32i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc32i16_32i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, (%rax)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, (%rax)
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, ptr undef, align 4
  ret void
}

define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: trunc2x4i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-ALL:       # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-PERLANE:       # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i32:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i32>
  %1 = trunc <4 x i64> %b to <4 x i32>
  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %2
}

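; Truncate two <4 x i64> inputs to <4 x i16> and concatenate the results into one <8 x i16>.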
define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc2x4i64_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqw %ymm1, %xmm1
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqw %ymm1, %xmm1
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i16>
  %1 = trunc <4 x i64> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

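; Truncate two <2 x i64> inputs to <2 x i32> and concatenate the results into one <4 x i32>.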
define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: trunc2x2i64_4i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2x2i64_4i32:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc2x2i64_4i32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x2i64_4i32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x2i64_4i32:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x2i64_4i32:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %a to <2 x i32>
  %1 = trunc <2 x i64> %b to <2 x i32>
  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

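; Truncate <2 x i64> to <2 x i32> and bitcast the result to a scalar i64.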
define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2i64_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2i64_i64:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %inval to <2 x i32>
  %1 = bitcast <2 x i32> %0 to i64
  ret i64 %1
}

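; Truncate two <4 x i32> inputs to <4 x i16> and concatenate the results into one <8 x i16>.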
define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc2x4i32_8i16:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i32_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %a to <4 x i16>
  %1 = trunc <4 x i32> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
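; Truncate <4 x i32> to <4 x i16> and bitcast the result to a scalar i64.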
define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-LABEL: trunc4i32_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc4i32_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc4i32_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc4i32_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc4i32_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc4i32_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %xmm0, %xmm0
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc4i32_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc4i32_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %inval to <4 x i16>
  %1 = bitcast <4 x i16> %0 to i64
  ret i64 %1
}

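; Truncate two <8 x i16> inputs to <8 x i8> and concatenate the results into one <16 x i8>.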
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: trunc2x8i16_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x8i16_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT:    pand %xmm2, %xmm1
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x8i16_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    packuswb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc2x8i16_16i8:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc2x8i16_16i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x8i16_16i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x8i16_16i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x8i16_16i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %a to <8 x i8>
  %1 = trunc <8 x i16> %b to <8 x i8>
  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
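; Truncate <8 x i16> to <8 x i8> and bitcast the result to a scalar i64.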
define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2-LABEL: trunc8i16_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i16_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i16_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc8i16_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc8i16_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i16_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i16_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i16_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %inval to <8 x i8>
  %1 = bitcast <8 x i8> %0 to i64
  ret i64 %1
}

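; Truncating and shuffling a zero vector should constant-fold to an all-zeros result.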
define <16 x i8> @trunc16i64_16i8_const() {
; SSE-LABEL: trunc16i64_16i8_const:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc16i64_16i8_const:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc16i64_16i8_const:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
  ret <16 x i8> %1
}

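; PR32160: truncate <8 x i32> to <8 x i16>, then splat element 2 of the result.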
define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-LABEL: PR32160:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR32160:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512F-LABEL: PR32160:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR32160:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR32160:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %shuf = trunc <8 x i32> %x to <8 x i16>
  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %trunc
}

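; PR34773: shift each i16 lane of two <16 x i16> loads right by 8 (keeping the high byte), truncate to <16 x i8>, and store the results.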
define void @PR34773(ptr %a0, ptr %a1) {
; SSE-LABEL: PR34773:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqu 32(%rdi), %xmm2
; SSE-NEXT:    movdqu 48(%rdi), %xmm3
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    movdqu %xmm0, (%rsi)
; SSE-NEXT:    movdqu %xmm2, 16(%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR34773:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR34773:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: PR34773:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, 16(%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34773:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, 16(%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34773:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34773:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpsrlw $8, 32(%rdi), %ymm1
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vpmovwb %ymm1, 16(%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = getelementptr i16, ptr %a0, i64 16
  %2 = getelementptr i8, ptr %a1, i64 16
  %3 = load <16 x i16>, ptr %a0, align 2
  %4 = load <16 x i16>, ptr %1, align 2
  %5 = lshr <16 x i16> %3, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %6 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %7 = trunc <16 x i16> %5 to <16 x i8>
  %8 = trunc <16 x i16> %6 to <16 x i8>
  store <16 x i8> %7, ptr %a1, align 1
  store <16 x i8> %8, ptr %2, align 1
  ret void
}

; Store merging must not infinitely fight store splitting.
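; The two adjacent <8 x i16> stores below are candidates for merging into one 32-byte store, which store splitting may then break back up; the combine must converge rather than loop.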

define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, ptr %p) align 2 {
; SSE2-LABEL: store_merge_split:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    shlq $4, %rdi
; SSE2-NEXT:    movdqu %xmm0, (%rsi,%rdi)
; SSE2-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: store_merge_split:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pshufb %xmm4, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSSE3-NEXT:    shlq $4, %rdi
; SSSE3-NEXT:    movdqu %xmm0, (%rsi,%rdi)
; SSSE3-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: store_merge_split:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    shlq $4, %rdi
; SSE41-NEXT:    movdqu %xmm0, (%rsi,%rdi)
; SSE41-NEXT:    movdqu %xmm2, 16(%rsi,%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: store_merge_split:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    shlq $4, %rdi
; AVX1-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_merge_split:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    shlq $4, %rdi
; AVX2-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: store_merge_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    shlq $4, %rdi
; AVX512F-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX512F-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: store_merge_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    shlq $4, %rdi
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi,%rdi)
; AVX512VL-NEXT:    vpmovdw %ymm1, 16(%rsi,%rdi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: store_merge_split:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT:    shlq $4, %rdi
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi,%rdi)
; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: store_merge_split:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    shlq $4, %rdi
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi,%rdi)
; AVX512BWVL-NEXT:    vpmovdw %ymm1, 16(%rsi,%rdi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %t1 = trunc <8 x i32> %w1 to <8 x i16>
  %t2 = trunc <8 x i32> %w2 to <8 x i16>
  %g1 = getelementptr inbounds <8 x i16>, ptr %p, i64 %idx
  %g2 = getelementptr inbounds <8 x i16>, ptr %g1, i64 1
  store <8 x i16> %t1, ptr %g1, align 2
  store <8 x i16> %t2, ptr %g2, align 2
  ret void
}