1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2   | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx    | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2   | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
8
9; trunc(concat(x,y)) -> pack
10
; ashr by 17 leaves each i32 lane of %a0 in [-16384,16383] and the and-mask
; keeps each lane of %a1 in [0,15], so every lane of the concatenated vector
; fits a signed i16 and trunc(concat(x,y)) can lower to a single packssdw
; on every subtarget.
define <8 x i16> @trunc_concat_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: trunc_concat_packssdw_128:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $17, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_concat_packssdw_128:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packssdw_128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packssdw_128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}
46
; lshr by 17 leaves each i32 lane of %a0 in [0,32767] and the and-mask keeps
; %a1 in [0,15], so every lane fits an unsigned (and also signed) i16.
; SSE4.1+ targets use packusdw; the SSE2 target, which lacks packusdw, can
; still use packssdw because the lanes also fit the signed range.
define <8 x i16> @trunc_concat_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: trunc_concat_packusdw_128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $17, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE4-LABEL: trunc_concat_packusdw_128:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrld $17, %xmm0
; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    retq
;
; AVX1-LABEL: trunc_concat_packusdw_128:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packusdw_128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packusdw_128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}
89
; ashr by 15 turns each i16 lane of %a0 into 0 or -1 and the and-mask keeps
; %a1 lanes in {0,1}, so every lane fits a signed i8 and the i16->i8
; trunc(concat(x,y)) can lower to a single packsswb.
define <16 x i8> @trunc_concat_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: trunc_concat_packsswb_128:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $15, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    packsswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_concat_packsswb_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i16> %3 to <16 x i8>
  ret <16 x i8> %4
}
110
; lshr by 15 turns each i16 lane of %a0 into 0 or 1 and the and-mask keeps
; %a1 lanes in {0,1}, so every lane fits an unsigned i8 and the i16->i8
; trunc(concat(x,y)) can lower to a single packuswb.
define <16 x i8> @trunc_concat_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: trunc_concat_packuswb_128:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $15, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_concat_packuswb_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i16> %3 to <16 x i8>
  ret <16 x i8> %4
}
131
132; concat(trunc(x),trunc(y)) -> pack
133
; Same value ranges as the trunc(concat) variant, but expressed as
; concat(trunc(x),trunc(y)). SSE4.1+/AVX targets still form a single
; packssdw; the CHECK lines show the SSE2 target currently misses the fold
; and emits separate per-half packs recombined with punpcklqdq.
define <8 x i16> @concat_trunc_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: concat_trunc_packssdw_128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $17, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    packssdw %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE4-LABEL: concat_trunc_packssdw_128:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrad $17, %xmm0
; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE4-NEXT:    packssdw %xmm1, %xmm0
; SSE4-NEXT:    retq
;
; AVX1-LABEL: concat_trunc_packssdw_128:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packssdw_128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packssdw_128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
  %3 = trunc <4 x i32> %1 to <4 x i16>
  %4 = trunc <4 x i32> %2 to <4 x i16>
  %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %5
}
179
; concat(trunc(x),trunc(y)) form of the packusdw test. Lanes are in
; [0,32767] / [0,15] after the lshr/and, so SSE4.1+ targets fold to a single
; packusdw; the CHECK lines show the SSE2 target (no packusdw) currently
; emits per-half packs recombined with punpcklqdq instead of a single pack.
define <8 x i16> @concat_trunc_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: concat_trunc_packusdw_128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $17, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    packssdw %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE4-LABEL: concat_trunc_packusdw_128:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrld $17, %xmm0
; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    retq
;
; AVX1-LABEL: concat_trunc_packusdw_128:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packusdw_128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packusdw_128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
  %2 = and  <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
  %3 = trunc <4 x i32> %1 to <4 x i16>
  %4 = trunc <4 x i32> %2 to <4 x i16>
  %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %5
}
225
; concat(trunc(x),trunc(y)) form of the packsswb test. Lanes are {-1,0} /
; {0,1} after the ashr/and, so all targets fold to a single packsswb.
define <16 x i8> @concat_trunc_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: concat_trunc_packsswb_128:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $15, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    packsswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_trunc_packsswb_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = trunc <8 x i16> %1 to <8 x i8>
  %4 = trunc <8 x i16> %2 to <8 x i8>
  %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %5
}
247
; concat(trunc(x),trunc(y)) form of the packuswb test. Lanes are {0,1} after
; the lshr/and, so all targets fold to a single packuswb.
define <16 x i8> @concat_trunc_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: concat_trunc_packuswb_128:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $15, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_trunc_packuswb_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and  <8 x i16> %a1, <i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1, i16  1>
  %3 = trunc <8 x i16> %1 to <8 x i8>
  %4 = trunc <8 x i16> %2 to <8 x i8>
  %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %5
}
269