; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f  | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW

; trunc(concat(x,y)) -> pack

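; If both operands are already known to fit in the destination scalar type
; (guaranteed below by the ashr/lshr/and), the trunc of the concat can be
; lowered to a single pack instruction. The shuffle masks interleave the
; 128-bit halves of the two sources (<0..3,8..11,4..7,12..15>) to match the
; per-128-bit-lane behavior of the X86 256-bit packs, e.g.:
;   trunc(concat(ashr(x,17), ashr(y,23))) --> PACKSSDW(ashr(x,17), ashr(y,23))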
define <16 x i16> @trunc_concat_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packssdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packssdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packssdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX512-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i32> %3 to <16 x i16>
  ret <16 x i16> %4
}

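; For the unsigned pack the operands must also be known non-negative: lshr
; by 17 leaves a 15-bit value and the and-mask leaves 4 bits, so vpackusdw's
; unsigned saturation is a no-op.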
define <16 x i16> @trunc_concat_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packusdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packusdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packusdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and  <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i32> %3 to <16 x i16>
  ret <16 x i16> %4
}

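; i16 -> i8 variant: ashr by 15 yields 0/-1 and the and-mask yields 0/1, both
; in signed i8 range. AVX512F (without BWI) has no vpmovwb, so its truncation
; is expected to go via vpmovzxwd + vpmovdb instead.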
define <32 x i8> @trunc_concat_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packsswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packsswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_concat_packsswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packsswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <16 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}

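; Unsigned i16 -> i8 variant: lshr by 15 yields 0/1, so vpackuswb's unsigned
; saturation is a no-op as well.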
define <32 x i8> @trunc_concat_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packuswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packuswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_concat_packuswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packuswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <16 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}

; concat(trunc(x),trunc(y)) -> pack

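; Here the sources are truncated before the shuffle, so the masks interleave
; 64-bit (quarter-vector) chunks of the narrow halves, which is exactly the
; layout a 256-bit pack of the original wide sources would produce.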
define <16 x i16> @concat_trunc_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packssdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packssdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packssdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX512-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = trunc <8 x i32> %1 to <8 x i16>
  %4 = trunc <8 x i32> %2 to <8 x i16>
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %5
}

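; Unsigned variant: note that in the current codegen below the truncated %a1
; is masked again (the extra xmm vpand) after the narrowing.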
define <16 x i16> @concat_trunc_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packusdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $17, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packusdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packusdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and  <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = trunc <8 x i32> %1 to <8 x i16>
  %4 = trunc <8 x i32> %2 to <8 x i16>
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %5
}

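; Signed i16 -> i8 variant: because the and-masked operand is known
; non-negative it can legally be packed with vpackuswb instead, as in the
; AVX1/AVX2 codegen below.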
define <32 x i8> @concat_trunc_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packsswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packsswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_trunc_packsswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packsswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <16 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = trunc <16 x i16> %1 to <16 x i8>
  %4 = trunc <16 x i16> %2 to <16 x i8>
  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %5
}

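; Unsigned i16 -> i8 variant: lshr by 15 and the and-mask leave only 0/1
; values, so every pack here is lossless.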
define <32 x i8> @concat_trunc_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packuswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packuswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_trunc_packuswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packuswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <16 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = trunc <16 x i16> %1 to <16 x i8>
  %4 = trunc <16 x i16> %2 to <16 x i8>
  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %5
}