1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
15; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
16; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=SKX
17
18;
19; PACKUS saturation truncation to vXi32
20;
21
22define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
23; SSE2-LABEL: trunc_packus_v2i64_v2i32:
24; SSE2:       # %bb.0:
25; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
26; SSE2-NEXT:    movdqa %xmm0, %xmm2
27; SSE2-NEXT:    pxor %xmm1, %xmm2
28; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
29; SSE2-NEXT:    pxor %xmm4, %xmm4
30; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
31; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
32; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
33; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
34; SSE2-NEXT:    pand %xmm4, %xmm2
35; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
36; SSE2-NEXT:    por %xmm2, %xmm3
37; SSE2-NEXT:    pand %xmm3, %xmm0
38; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
39; SSE2-NEXT:    por %xmm0, %xmm3
40; SSE2-NEXT:    movdqa %xmm3, %xmm0
41; SSE2-NEXT:    pxor %xmm1, %xmm0
42; SSE2-NEXT:    movdqa %xmm0, %xmm2
43; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
44; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
45; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
46; SSE2-NEXT:    pand %xmm2, %xmm0
47; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
48; SSE2-NEXT:    por %xmm0, %xmm1
49; SSE2-NEXT:    pand %xmm3, %xmm1
50; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
51; SSE2-NEXT:    retq
52;
53; SSSE3-LABEL: trunc_packus_v2i64_v2i32:
54; SSSE3:       # %bb.0:
55; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
56; SSSE3-NEXT:    movdqa %xmm0, %xmm2
57; SSSE3-NEXT:    pxor %xmm1, %xmm2
58; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
59; SSSE3-NEXT:    pxor %xmm4, %xmm4
60; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
61; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
62; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
63; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
64; SSSE3-NEXT:    pand %xmm4, %xmm2
65; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
66; SSSE3-NEXT:    por %xmm2, %xmm3
67; SSSE3-NEXT:    pand %xmm3, %xmm0
68; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
69; SSSE3-NEXT:    por %xmm0, %xmm3
70; SSSE3-NEXT:    movdqa %xmm3, %xmm0
71; SSSE3-NEXT:    pxor %xmm1, %xmm0
72; SSSE3-NEXT:    movdqa %xmm0, %xmm2
73; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
74; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
75; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
76; SSSE3-NEXT:    pand %xmm2, %xmm0
77; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
78; SSSE3-NEXT:    por %xmm0, %xmm1
79; SSSE3-NEXT:    pand %xmm3, %xmm1
80; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
81; SSSE3-NEXT:    retq
82;
83; SSE41-LABEL: trunc_packus_v2i64_v2i32:
84; SSE41:       # %bb.0:
85; SSE41-NEXT:    movdqa %xmm0, %xmm1
86; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [4294967295,4294967295]
87; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
88; SSE41-NEXT:    pxor %xmm3, %xmm0
89; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
90; SSE41-NEXT:    movdqa %xmm4, %xmm5
91; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
92; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
93; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
94; SSE41-NEXT:    pand %xmm5, %xmm0
95; SSE41-NEXT:    por %xmm4, %xmm0
96; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
97; SSE41-NEXT:    xorpd %xmm1, %xmm1
98; SSE41-NEXT:    movapd %xmm2, %xmm4
99; SSE41-NEXT:    xorpd %xmm3, %xmm4
100; SSE41-NEXT:    movapd %xmm4, %xmm5
101; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
102; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
103; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
104; SSE41-NEXT:    pand %xmm5, %xmm0
105; SSE41-NEXT:    por %xmm4, %xmm0
106; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
107; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
108; SSE41-NEXT:    retq
109;
110; AVX-LABEL: trunc_packus_v2i64_v2i32:
111; AVX:       # %bb.0:
112; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
113; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
114; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
115; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
116; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
117; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
118; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
119; AVX-NEXT:    retq
120;
121; AVX512F-LABEL: trunc_packus_v2i64_v2i32:
122; AVX512F:       # %bb.0:
123; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
124; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
125; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
126; AVX512F-NEXT:    vpmovusqd %zmm0, %ymm0
127; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
128; AVX512F-NEXT:    vzeroupper
129; AVX512F-NEXT:    retq
130;
131; AVX512VL-LABEL: trunc_packus_v2i64_v2i32:
132; AVX512VL:       # %bb.0:
133; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
134; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
135; AVX512VL-NEXT:    vpmovusqd %xmm0, %xmm0
136; AVX512VL-NEXT:    retq
137;
138; AVX512BW-LABEL: trunc_packus_v2i64_v2i32:
139; AVX512BW:       # %bb.0:
140; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
141; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
142; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
143; AVX512BW-NEXT:    vpmovusqd %zmm0, %ymm0
144; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
145; AVX512BW-NEXT:    vzeroupper
146; AVX512BW-NEXT:    retq
147;
148; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i32:
149; AVX512BWVL:       # %bb.0:
150; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
151; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
152; AVX512BWVL-NEXT:    vpmovusqd %xmm0, %xmm0
153; AVX512BWVL-NEXT:    retq
154;
155; SKX-LABEL: trunc_packus_v2i64_v2i32:
156; SKX:       # %bb.0:
157; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
158; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
159; SKX-NEXT:    vpmovusqd %xmm0, %xmm0
160; SKX-NEXT:    retq
161  %1 = icmp slt <2 x i64> %a0, <i64 4294967295, i64 4294967295>
162  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>
163  %3 = icmp sgt <2 x i64> %2, zeroinitializer
164  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
165  %5 = trunc <2 x i64> %4 to <2 x i32>
166  ret <2 x i32> %5
167}
168
169define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
170; SSE2-LABEL: trunc_packus_v2i64_v2i32_store:
171; SSE2:       # %bb.0:
172; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
173; SSE2-NEXT:    movdqa %xmm0, %xmm2
174; SSE2-NEXT:    pxor %xmm1, %xmm2
175; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
176; SSE2-NEXT:    pxor %xmm4, %xmm4
177; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
178; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
179; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
180; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
181; SSE2-NEXT:    pand %xmm4, %xmm2
182; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
183; SSE2-NEXT:    por %xmm2, %xmm3
184; SSE2-NEXT:    pand %xmm3, %xmm0
185; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
186; SSE2-NEXT:    por %xmm0, %xmm3
187; SSE2-NEXT:    movdqa %xmm3, %xmm0
188; SSE2-NEXT:    pxor %xmm1, %xmm0
189; SSE2-NEXT:    movdqa %xmm0, %xmm2
190; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
191; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
192; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
193; SSE2-NEXT:    pand %xmm2, %xmm0
194; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
195; SSE2-NEXT:    por %xmm0, %xmm1
196; SSE2-NEXT:    pand %xmm3, %xmm1
197; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
198; SSE2-NEXT:    movq %xmm0, (%rdi)
199; SSE2-NEXT:    retq
200;
201; SSSE3-LABEL: trunc_packus_v2i64_v2i32_store:
202; SSSE3:       # %bb.0:
203; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
204; SSSE3-NEXT:    movdqa %xmm0, %xmm2
205; SSSE3-NEXT:    pxor %xmm1, %xmm2
206; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
207; SSSE3-NEXT:    pxor %xmm4, %xmm4
208; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
209; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
210; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
211; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
212; SSSE3-NEXT:    pand %xmm4, %xmm2
213; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
214; SSSE3-NEXT:    por %xmm2, %xmm3
215; SSSE3-NEXT:    pand %xmm3, %xmm0
216; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
217; SSSE3-NEXT:    por %xmm0, %xmm3
218; SSSE3-NEXT:    movdqa %xmm3, %xmm0
219; SSSE3-NEXT:    pxor %xmm1, %xmm0
220; SSSE3-NEXT:    movdqa %xmm0, %xmm2
221; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
222; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
223; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
224; SSSE3-NEXT:    pand %xmm2, %xmm0
225; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
226; SSSE3-NEXT:    por %xmm0, %xmm1
227; SSSE3-NEXT:    pand %xmm3, %xmm1
228; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
229; SSSE3-NEXT:    movq %xmm0, (%rdi)
230; SSSE3-NEXT:    retq
231;
232; SSE41-LABEL: trunc_packus_v2i64_v2i32_store:
233; SSE41:       # %bb.0:
234; SSE41-NEXT:    movdqa %xmm0, %xmm1
235; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [4294967295,4294967295]
236; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
237; SSE41-NEXT:    pxor %xmm3, %xmm0
238; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
239; SSE41-NEXT:    movdqa %xmm4, %xmm5
240; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
241; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
242; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
243; SSE41-NEXT:    pand %xmm5, %xmm0
244; SSE41-NEXT:    por %xmm4, %xmm0
245; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
246; SSE41-NEXT:    xorpd %xmm1, %xmm1
247; SSE41-NEXT:    movapd %xmm2, %xmm4
248; SSE41-NEXT:    xorpd %xmm3, %xmm4
249; SSE41-NEXT:    movapd %xmm4, %xmm5
250; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
251; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
252; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
253; SSE41-NEXT:    pand %xmm5, %xmm0
254; SSE41-NEXT:    por %xmm4, %xmm0
255; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
256; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
257; SSE41-NEXT:    movq %xmm0, (%rdi)
258; SSE41-NEXT:    retq
259;
260; AVX-LABEL: trunc_packus_v2i64_v2i32_store:
261; AVX:       # %bb.0:
262; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
263; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
264; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
265; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
266; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
267; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
268; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
269; AVX-NEXT:    vmovq %xmm0, (%rdi)
270; AVX-NEXT:    retq
271;
272; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store:
273; AVX512F:       # %bb.0:
274; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
275; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
276; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
277; AVX512F-NEXT:    vpmovusqd %zmm0, %ymm0
278; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
279; AVX512F-NEXT:    vzeroupper
280; AVX512F-NEXT:    retq
281;
282; AVX512VL-LABEL: trunc_packus_v2i64_v2i32_store:
283; AVX512VL:       # %bb.0:
284; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
285; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
286; AVX512VL-NEXT:    vpmovusqd %xmm0, (%rdi)
287; AVX512VL-NEXT:    retq
288;
289; AVX512BW-LABEL: trunc_packus_v2i64_v2i32_store:
290; AVX512BW:       # %bb.0:
291; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
292; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
293; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
294; AVX512BW-NEXT:    vpmovusqd %zmm0, %ymm0
295; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
296; AVX512BW-NEXT:    vzeroupper
297; AVX512BW-NEXT:    retq
298;
299; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i32_store:
300; AVX512BWVL:       # %bb.0:
301; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
302; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
303; AVX512BWVL-NEXT:    vpmovusqd %xmm0, (%rdi)
304; AVX512BWVL-NEXT:    retq
305;
306; SKX-LABEL: trunc_packus_v2i64_v2i32_store:
307; SKX:       # %bb.0:
308; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
309; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
310; SKX-NEXT:    vpmovusqd %xmm0, (%rdi)
311; SKX-NEXT:    retq
312  %1 = icmp slt <2 x i64> %a0, <i64 4294967295, i64 4294967295>
313  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>
314  %3 = icmp sgt <2 x i64> %2, zeroinitializer
315  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
316  %5 = trunc <2 x i64> %4 to <2 x i32>
317  store <2 x i32> %5, ptr %p1
318  ret void
319}
320
321define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
322; SSE2-LABEL: trunc_packus_v4i64_v4i32:
323; SSE2:       # %bb.0:
324; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
325; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
326; SSE2-NEXT:    movdqa %xmm0, %xmm4
327; SSE2-NEXT:    pxor %xmm2, %xmm4
328; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
329; SSE2-NEXT:    pxor %xmm6, %xmm6
330; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
331; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
332; SSE2-NEXT:    movdqa %xmm3, %xmm7
333; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
334; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
335; SSE2-NEXT:    pand %xmm5, %xmm4
336; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
337; SSE2-NEXT:    por %xmm4, %xmm5
338; SSE2-NEXT:    pand %xmm5, %xmm0
339; SSE2-NEXT:    pandn %xmm8, %xmm5
340; SSE2-NEXT:    por %xmm5, %xmm0
341; SSE2-NEXT:    movdqa %xmm1, %xmm4
342; SSE2-NEXT:    pxor %xmm2, %xmm4
343; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
344; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
345; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
346; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
347; SSE2-NEXT:    pand %xmm5, %xmm4
348; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
349; SSE2-NEXT:    por %xmm4, %xmm3
350; SSE2-NEXT:    pand %xmm3, %xmm1
351; SSE2-NEXT:    pandn %xmm8, %xmm3
352; SSE2-NEXT:    por %xmm1, %xmm3
353; SSE2-NEXT:    movdqa %xmm3, %xmm1
354; SSE2-NEXT:    pxor %xmm2, %xmm1
355; SSE2-NEXT:    movdqa %xmm1, %xmm4
356; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
357; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
358; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
359; SSE2-NEXT:    pand %xmm4, %xmm1
360; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
361; SSE2-NEXT:    por %xmm1, %xmm4
362; SSE2-NEXT:    pand %xmm3, %xmm4
363; SSE2-NEXT:    movdqa %xmm0, %xmm1
364; SSE2-NEXT:    pxor %xmm2, %xmm1
365; SSE2-NEXT:    movdqa %xmm1, %xmm3
366; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
367; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
368; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
369; SSE2-NEXT:    pand %xmm3, %xmm1
370; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
371; SSE2-NEXT:    por %xmm1, %xmm2
372; SSE2-NEXT:    pand %xmm2, %xmm0
373; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
374; SSE2-NEXT:    retq
375;
376; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
377; SSSE3:       # %bb.0:
378; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
379; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
380; SSSE3-NEXT:    movdqa %xmm0, %xmm4
381; SSSE3-NEXT:    pxor %xmm2, %xmm4
382; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
383; SSSE3-NEXT:    pxor %xmm6, %xmm6
384; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm5
385; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
386; SSSE3-NEXT:    movdqa %xmm3, %xmm7
387; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm7
388; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
389; SSSE3-NEXT:    pand %xmm5, %xmm4
390; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
391; SSSE3-NEXT:    por %xmm4, %xmm5
392; SSSE3-NEXT:    pand %xmm5, %xmm0
393; SSSE3-NEXT:    pandn %xmm8, %xmm5
394; SSSE3-NEXT:    por %xmm5, %xmm0
395; SSSE3-NEXT:    movdqa %xmm1, %xmm4
396; SSSE3-NEXT:    pxor %xmm2, %xmm4
397; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
398; SSSE3-NEXT:    pcmpeqd %xmm6, %xmm5
399; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
400; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
401; SSSE3-NEXT:    pand %xmm5, %xmm4
402; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
403; SSSE3-NEXT:    por %xmm4, %xmm3
404; SSSE3-NEXT:    pand %xmm3, %xmm1
405; SSSE3-NEXT:    pandn %xmm8, %xmm3
406; SSSE3-NEXT:    por %xmm1, %xmm3
407; SSSE3-NEXT:    movdqa %xmm3, %xmm1
408; SSSE3-NEXT:    pxor %xmm2, %xmm1
409; SSSE3-NEXT:    movdqa %xmm1, %xmm4
410; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
411; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
412; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
413; SSSE3-NEXT:    pand %xmm4, %xmm1
414; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
415; SSSE3-NEXT:    por %xmm1, %xmm4
416; SSSE3-NEXT:    pand %xmm3, %xmm4
417; SSSE3-NEXT:    movdqa %xmm0, %xmm1
418; SSSE3-NEXT:    pxor %xmm2, %xmm1
419; SSSE3-NEXT:    movdqa %xmm1, %xmm3
420; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
421; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
422; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
423; SSSE3-NEXT:    pand %xmm3, %xmm1
424; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
425; SSSE3-NEXT:    por %xmm1, %xmm2
426; SSSE3-NEXT:    pand %xmm2, %xmm0
427; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
428; SSSE3-NEXT:    retq
429;
430; SSE41-LABEL: trunc_packus_v4i64_v4i32:
431; SSE41:       # %bb.0:
432; SSE41-NEXT:    movdqa %xmm0, %xmm2
433; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4294967295,4294967295]
434; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
435; SSE41-NEXT:    pxor %xmm3, %xmm0
436; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
437; SSE41-NEXT:    movdqa %xmm6, %xmm5
438; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
439; SSE41-NEXT:    movdqa %xmm6, %xmm7
440; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
441; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
442; SSE41-NEXT:    pand %xmm5, %xmm0
443; SSE41-NEXT:    por %xmm7, %xmm0
444; SSE41-NEXT:    movapd %xmm4, %xmm5
445; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
446; SSE41-NEXT:    movdqa %xmm1, %xmm0
447; SSE41-NEXT:    pxor %xmm3, %xmm0
448; SSE41-NEXT:    movdqa %xmm6, %xmm2
449; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
450; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
451; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
452; SSE41-NEXT:    pand %xmm2, %xmm0
453; SSE41-NEXT:    por %xmm6, %xmm0
454; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
455; SSE41-NEXT:    xorpd %xmm1, %xmm1
456; SSE41-NEXT:    movapd %xmm4, %xmm2
457; SSE41-NEXT:    xorpd %xmm3, %xmm2
458; SSE41-NEXT:    movapd %xmm2, %xmm6
459; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
460; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
461; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
462; SSE41-NEXT:    pand %xmm6, %xmm0
463; SSE41-NEXT:    por %xmm2, %xmm0
464; SSE41-NEXT:    pxor %xmm2, %xmm2
465; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
466; SSE41-NEXT:    movapd %xmm5, %xmm4
467; SSE41-NEXT:    xorpd %xmm3, %xmm4
468; SSE41-NEXT:    movapd %xmm4, %xmm6
469; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
470; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
471; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
472; SSE41-NEXT:    pand %xmm6, %xmm0
473; SSE41-NEXT:    por %xmm4, %xmm0
474; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
475; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
476; SSE41-NEXT:    movaps %xmm1, %xmm0
477; SSE41-NEXT:    retq
478;
479; AVX1-LABEL: trunc_packus_v4i64_v4i32:
480; AVX1:       # %bb.0:
481; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
482; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
483; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
484; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
485; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
486; AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
487; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
488; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
489; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
490; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
491; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
492; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
493; AVX1-NEXT:    vzeroupper
494; AVX1-NEXT:    retq
495;
496; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
497; AVX2-SLOW:       # %bb.0:
498; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
499; AVX2-SLOW-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
500; AVX2-SLOW-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
501; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
502; AVX2-SLOW-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
503; AVX2-SLOW-NEXT:    vpand %ymm0, %ymm1, %ymm0
504; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
505; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
506; AVX2-SLOW-NEXT:    vzeroupper
507; AVX2-SLOW-NEXT:    retq
508;
509; AVX2-FAST-ALL-LABEL: trunc_packus_v4i64_v4i32:
510; AVX2-FAST-ALL:       # %bb.0:
511; AVX2-FAST-ALL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
512; AVX2-FAST-ALL-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
513; AVX2-FAST-ALL-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
514; AVX2-FAST-ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
515; AVX2-FAST-ALL-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
516; AVX2-FAST-ALL-NEXT:    vpand %ymm0, %ymm1, %ymm0
517; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
518; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
519; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
520; AVX2-FAST-ALL-NEXT:    vzeroupper
521; AVX2-FAST-ALL-NEXT:    retq
522;
523; AVX2-FAST-PERLANE-LABEL: trunc_packus_v4i64_v4i32:
524; AVX2-FAST-PERLANE:       # %bb.0:
525; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
526; AVX2-FAST-PERLANE-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
527; AVX2-FAST-PERLANE-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
528; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
529; AVX2-FAST-PERLANE-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
530; AVX2-FAST-PERLANE-NEXT:    vpand %ymm0, %ymm1, %ymm0
531; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
532; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
533; AVX2-FAST-PERLANE-NEXT:    vzeroupper
534; AVX2-FAST-PERLANE-NEXT:    retq
535;
536; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
537; AVX512F:       # %bb.0:
538; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
539; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
540; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
541; AVX512F-NEXT:    vpmovusqd %zmm0, %ymm0
542; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
543; AVX512F-NEXT:    vzeroupper
544; AVX512F-NEXT:    retq
545;
546; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
547; AVX512VL:       # %bb.0:
548; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
549; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
550; AVX512VL-NEXT:    vpmovusqd %ymm0, %xmm0
551; AVX512VL-NEXT:    vzeroupper
552; AVX512VL-NEXT:    retq
553;
554; AVX512BW-LABEL: trunc_packus_v4i64_v4i32:
555; AVX512BW:       # %bb.0:
556; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
557; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
558; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
559; AVX512BW-NEXT:    vpmovusqd %zmm0, %ymm0
560; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
561; AVX512BW-NEXT:    vzeroupper
562; AVX512BW-NEXT:    retq
563;
564; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
565; AVX512BWVL:       # %bb.0:
566; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
567; AVX512BWVL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
568; AVX512BWVL-NEXT:    vpmovusqd %ymm0, %xmm0
569; AVX512BWVL-NEXT:    vzeroupper
570; AVX512BWVL-NEXT:    retq
571;
572; SKX-LABEL: trunc_packus_v4i64_v4i32:
573; SKX:       # %bb.0:
574; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
575; SKX-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
576; SKX-NEXT:    vpmovusqd %ymm0, %xmm0
577; SKX-NEXT:    vzeroupper
578; SKX-NEXT:    retq
579  %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
580  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
581  %3 = icmp sgt <4 x i64> %2, zeroinitializer
582  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
583  %5 = trunc <4 x i64> %4 to <4 x i32>
584  ret <4 x i32> %5
585}
586
587
588define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" {
589; SSE2-LABEL: trunc_packus_v8i64_v8i32:
590; SSE2:       # %bb.0:
591; SSE2-NEXT:    movdqa (%rdi), %xmm3
592; SSE2-NEXT:    movdqa 16(%rdi), %xmm4
593; SSE2-NEXT:    movdqa 32(%rdi), %xmm6
594; SSE2-NEXT:    movdqa 48(%rdi), %xmm10
595; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
596; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
597; SSE2-NEXT:    movdqa %xmm3, %xmm2
598; SSE2-NEXT:    pxor %xmm11, %xmm2
599; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
600; SSE2-NEXT:    pxor %xmm9, %xmm9
601; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
602; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483647,2147483647]
603; SSE2-NEXT:    movdqa %xmm1, %xmm5
604; SSE2-NEXT:    pcmpgtd %xmm2, %xmm5
605; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
606; SSE2-NEXT:    pand %xmm7, %xmm0
607; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
608; SSE2-NEXT:    por %xmm0, %xmm2
609; SSE2-NEXT:    pand %xmm2, %xmm3
610; SSE2-NEXT:    pandn %xmm8, %xmm2
611; SSE2-NEXT:    por %xmm3, %xmm2
612; SSE2-NEXT:    movdqa %xmm4, %xmm0
613; SSE2-NEXT:    pxor %xmm11, %xmm0
614; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
615; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
616; SSE2-NEXT:    movdqa %xmm1, %xmm5
617; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
618; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
619; SSE2-NEXT:    pand %xmm3, %xmm0
620; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
621; SSE2-NEXT:    por %xmm0, %xmm3
622; SSE2-NEXT:    pand %xmm3, %xmm4
623; SSE2-NEXT:    pandn %xmm8, %xmm3
624; SSE2-NEXT:    por %xmm4, %xmm3
625; SSE2-NEXT:    movdqa %xmm6, %xmm0
626; SSE2-NEXT:    pxor %xmm11, %xmm0
627; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
628; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
629; SSE2-NEXT:    movdqa %xmm1, %xmm5
630; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
631; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
632; SSE2-NEXT:    pand %xmm4, %xmm0
633; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
634; SSE2-NEXT:    por %xmm0, %xmm4
635; SSE2-NEXT:    pand %xmm4, %xmm6
636; SSE2-NEXT:    pandn %xmm8, %xmm4
637; SSE2-NEXT:    por %xmm6, %xmm4
638; SSE2-NEXT:    movdqa %xmm10, %xmm0
639; SSE2-NEXT:    pxor %xmm11, %xmm0
640; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
641; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
642; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
643; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
644; SSE2-NEXT:    pand %xmm5, %xmm0
645; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
646; SSE2-NEXT:    por %xmm0, %xmm1
647; SSE2-NEXT:    pand %xmm1, %xmm10
648; SSE2-NEXT:    pandn %xmm8, %xmm1
649; SSE2-NEXT:    por %xmm10, %xmm1
650; SSE2-NEXT:    movdqa %xmm1, %xmm0
651; SSE2-NEXT:    pxor %xmm11, %xmm0
652; SSE2-NEXT:    movdqa %xmm0, %xmm5
653; SSE2-NEXT:    pcmpgtd %xmm11, %xmm5
654; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
655; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
656; SSE2-NEXT:    pand %xmm5, %xmm0
657; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
658; SSE2-NEXT:    por %xmm0, %xmm5
659; SSE2-NEXT:    pand %xmm1, %xmm5
660; SSE2-NEXT:    movdqa %xmm4, %xmm0
661; SSE2-NEXT:    pxor %xmm11, %xmm0
662; SSE2-NEXT:    movdqa %xmm0, %xmm1
663; SSE2-NEXT:    pcmpgtd %xmm11, %xmm1
664; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
665; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
666; SSE2-NEXT:    pand %xmm1, %xmm0
667; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
668; SSE2-NEXT:    por %xmm0, %xmm1
669; SSE2-NEXT:    pand %xmm4, %xmm1
670; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
671; SSE2-NEXT:    movdqa %xmm3, %xmm0
672; SSE2-NEXT:    pxor %xmm11, %xmm0
673; SSE2-NEXT:    movdqa %xmm0, %xmm4
674; SSE2-NEXT:    pcmpgtd %xmm11, %xmm4
675; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
676; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
677; SSE2-NEXT:    pand %xmm4, %xmm0
678; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
679; SSE2-NEXT:    por %xmm0, %xmm4
680; SSE2-NEXT:    pand %xmm3, %xmm4
681; SSE2-NEXT:    movdqa %xmm2, %xmm0
682; SSE2-NEXT:    pxor %xmm11, %xmm0
683; SSE2-NEXT:    movdqa %xmm0, %xmm3
684; SSE2-NEXT:    pcmpgtd %xmm11, %xmm3
685; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
686; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
687; SSE2-NEXT:    pand %xmm3, %xmm5
688; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
689; SSE2-NEXT:    por %xmm5, %xmm0
690; SSE2-NEXT:    pand %xmm2, %xmm0
691; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
692; SSE2-NEXT:    retq
693;
694; SSSE3-LABEL: trunc_packus_v8i64_v8i32:
695; SSSE3:       # %bb.0:
696; SSSE3-NEXT:    movdqa (%rdi), %xmm3
697; SSSE3-NEXT:    movdqa 16(%rdi), %xmm4
698; SSSE3-NEXT:    movdqa 32(%rdi), %xmm6
699; SSSE3-NEXT:    movdqa 48(%rdi), %xmm10
700; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
701; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
702; SSSE3-NEXT:    movdqa %xmm3, %xmm2
703; SSSE3-NEXT:    pxor %xmm11, %xmm2
704; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
705; SSSE3-NEXT:    pxor %xmm9, %xmm9
706; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
707; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483647,2147483647]
708; SSSE3-NEXT:    movdqa %xmm1, %xmm5
709; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm5
710; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
711; SSSE3-NEXT:    pand %xmm7, %xmm0
712; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
713; SSSE3-NEXT:    por %xmm0, %xmm2
714; SSSE3-NEXT:    pand %xmm2, %xmm3
715; SSSE3-NEXT:    pandn %xmm8, %xmm2
716; SSSE3-NEXT:    por %xmm3, %xmm2
717; SSSE3-NEXT:    movdqa %xmm4, %xmm0
718; SSSE3-NEXT:    pxor %xmm11, %xmm0
719; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
720; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
721; SSSE3-NEXT:    movdqa %xmm1, %xmm5
722; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
723; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
724; SSSE3-NEXT:    pand %xmm3, %xmm0
725; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
726; SSSE3-NEXT:    por %xmm0, %xmm3
727; SSSE3-NEXT:    pand %xmm3, %xmm4
728; SSSE3-NEXT:    pandn %xmm8, %xmm3
729; SSSE3-NEXT:    por %xmm4, %xmm3
730; SSSE3-NEXT:    movdqa %xmm6, %xmm0
731; SSSE3-NEXT:    pxor %xmm11, %xmm0
732; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
733; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
734; SSSE3-NEXT:    movdqa %xmm1, %xmm5
735; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
736; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
737; SSSE3-NEXT:    pand %xmm4, %xmm0
738; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
739; SSSE3-NEXT:    por %xmm0, %xmm4
740; SSSE3-NEXT:    pand %xmm4, %xmm6
741; SSSE3-NEXT:    pandn %xmm8, %xmm4
742; SSSE3-NEXT:    por %xmm6, %xmm4
743; SSSE3-NEXT:    movdqa %xmm10, %xmm0
744; SSSE3-NEXT:    pxor %xmm11, %xmm0
745; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
746; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
747; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
748; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
749; SSSE3-NEXT:    pand %xmm5, %xmm0
750; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
751; SSSE3-NEXT:    por %xmm0, %xmm1
752; SSSE3-NEXT:    pand %xmm1, %xmm10
753; SSSE3-NEXT:    pandn %xmm8, %xmm1
754; SSSE3-NEXT:    por %xmm10, %xmm1
755; SSSE3-NEXT:    movdqa %xmm1, %xmm0
756; SSSE3-NEXT:    pxor %xmm11, %xmm0
757; SSSE3-NEXT:    movdqa %xmm0, %xmm5
758; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm5
759; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
760; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
761; SSSE3-NEXT:    pand %xmm5, %xmm0
762; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
763; SSSE3-NEXT:    por %xmm0, %xmm5
764; SSSE3-NEXT:    pand %xmm1, %xmm5
765; SSSE3-NEXT:    movdqa %xmm4, %xmm0
766; SSSE3-NEXT:    pxor %xmm11, %xmm0
767; SSSE3-NEXT:    movdqa %xmm0, %xmm1
768; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm1
769; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
770; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
771; SSSE3-NEXT:    pand %xmm1, %xmm0
772; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
773; SSSE3-NEXT:    por %xmm0, %xmm1
774; SSSE3-NEXT:    pand %xmm4, %xmm1
775; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
776; SSSE3-NEXT:    movdqa %xmm3, %xmm0
777; SSSE3-NEXT:    pxor %xmm11, %xmm0
778; SSSE3-NEXT:    movdqa %xmm0, %xmm4
779; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm4
780; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
781; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
782; SSSE3-NEXT:    pand %xmm4, %xmm0
783; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
784; SSSE3-NEXT:    por %xmm0, %xmm4
785; SSSE3-NEXT:    pand %xmm3, %xmm4
786; SSSE3-NEXT:    movdqa %xmm2, %xmm0
787; SSSE3-NEXT:    pxor %xmm11, %xmm0
788; SSSE3-NEXT:    movdqa %xmm0, %xmm3
789; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm3
790; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
791; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
792; SSSE3-NEXT:    pand %xmm3, %xmm5
793; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
794; SSSE3-NEXT:    por %xmm5, %xmm0
795; SSSE3-NEXT:    pand %xmm2, %xmm0
796; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
797; SSSE3-NEXT:    retq
798;
799; SSE41-LABEL: trunc_packus_v8i64_v8i32:
800; SSE41:       # %bb.0:
801; SSE41-NEXT:    movdqa (%rdi), %xmm5
802; SSE41-NEXT:    movdqa 16(%rdi), %xmm4
803; SSE41-NEXT:    movdqa 32(%rdi), %xmm10
804; SSE41-NEXT:    movdqa 48(%rdi), %xmm9
805; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [4294967295,4294967295]
806; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
807; SSE41-NEXT:    movdqa %xmm5, %xmm0
808; SSE41-NEXT:    pxor %xmm3, %xmm0
809; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483647,2147483647]
810; SSE41-NEXT:    movdqa %xmm2, %xmm7
811; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
812; SSE41-NEXT:    movdqa %xmm2, %xmm6
813; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
814; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
815; SSE41-NEXT:    pand %xmm7, %xmm0
816; SSE41-NEXT:    por %xmm6, %xmm0
817; SSE41-NEXT:    movapd %xmm1, %xmm8
818; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm8
819; SSE41-NEXT:    movdqa %xmm4, %xmm0
820; SSE41-NEXT:    pxor %xmm3, %xmm0
821; SSE41-NEXT:    movdqa %xmm2, %xmm5
822; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
823; SSE41-NEXT:    movdqa %xmm2, %xmm6
824; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
825; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
826; SSE41-NEXT:    pand %xmm5, %xmm0
827; SSE41-NEXT:    por %xmm6, %xmm0
828; SSE41-NEXT:    movapd %xmm1, %xmm5
829; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm5
830; SSE41-NEXT:    movdqa %xmm10, %xmm0
831; SSE41-NEXT:    pxor %xmm3, %xmm0
832; SSE41-NEXT:    movdqa %xmm2, %xmm4
833; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
834; SSE41-NEXT:    movdqa %xmm2, %xmm6
835; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
836; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
837; SSE41-NEXT:    pand %xmm4, %xmm0
838; SSE41-NEXT:    por %xmm6, %xmm0
839; SSE41-NEXT:    movapd %xmm1, %xmm4
840; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm4
841; SSE41-NEXT:    movdqa %xmm9, %xmm0
842; SSE41-NEXT:    pxor %xmm3, %xmm0
843; SSE41-NEXT:    movdqa %xmm2, %xmm6
844; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
845; SSE41-NEXT:    pcmpgtd %xmm0, %xmm2
846; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
847; SSE41-NEXT:    pand %xmm6, %xmm0
848; SSE41-NEXT:    por %xmm2, %xmm0
849; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm1
850; SSE41-NEXT:    pxor %xmm2, %xmm2
851; SSE41-NEXT:    movapd %xmm1, %xmm6
852; SSE41-NEXT:    xorpd %xmm3, %xmm6
853; SSE41-NEXT:    movapd %xmm6, %xmm7
854; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
855; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
856; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
857; SSE41-NEXT:    pand %xmm7, %xmm0
858; SSE41-NEXT:    por %xmm6, %xmm0
859; SSE41-NEXT:    pxor %xmm6, %xmm6
860; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm6
861; SSE41-NEXT:    movapd %xmm4, %xmm1
862; SSE41-NEXT:    xorpd %xmm3, %xmm1
863; SSE41-NEXT:    movapd %xmm1, %xmm7
864; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
865; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
866; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
867; SSE41-NEXT:    pand %xmm7, %xmm0
868; SSE41-NEXT:    por %xmm1, %xmm0
869; SSE41-NEXT:    pxor %xmm1, %xmm1
870; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
871; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
872; SSE41-NEXT:    movapd %xmm5, %xmm4
873; SSE41-NEXT:    xorpd %xmm3, %xmm4
874; SSE41-NEXT:    movapd %xmm4, %xmm6
875; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
876; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
877; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
878; SSE41-NEXT:    pand %xmm6, %xmm0
879; SSE41-NEXT:    por %xmm4, %xmm0
880; SSE41-NEXT:    pxor %xmm4, %xmm4
881; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
882; SSE41-NEXT:    movapd %xmm8, %xmm5
883; SSE41-NEXT:    xorpd %xmm3, %xmm5
884; SSE41-NEXT:    movapd %xmm5, %xmm6
885; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
886; SSE41-NEXT:    pcmpgtd %xmm3, %xmm5
887; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
888; SSE41-NEXT:    pand %xmm6, %xmm0
889; SSE41-NEXT:    por %xmm5, %xmm0
890; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
891; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
892; SSE41-NEXT:    movaps %xmm2, %xmm0
893; SSE41-NEXT:    retq
894;
895; AVX1-LABEL: trunc_packus_v8i64_v8i32:
896; AVX1:       # %bb.0:
897; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
898; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
899; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
900; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
901; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295]
902; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
903; AVX1-NEXT:    vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
904; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm5
905; AVX1-NEXT:    vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
906; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm5
907; AVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
908; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
909; AVX1-NEXT:    vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
910; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
911; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm5
912; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
913; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
914; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm0
915; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm5
916; AVX1-NEXT:    vpand %xmm3, %xmm5, %xmm3
917; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm4
918; AVX1-NEXT:    vpand %xmm1, %xmm4, %xmm1
919; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
920; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
921; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
922; AVX1-NEXT:    retq
923;
924; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
925; AVX2-SLOW:       # %bb.0:
926; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
927; AVX2-SLOW-NEXT:    vmovdqa 32(%rdi), %ymm1
928; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
929; AVX2-SLOW-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
930; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
931; AVX2-SLOW-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
932; AVX2-SLOW-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
933; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
934; AVX2-SLOW-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
935; AVX2-SLOW-NEXT:    vpand %ymm1, %ymm3, %ymm1
936; AVX2-SLOW-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
937; AVX2-SLOW-NEXT:    vpand %ymm0, %ymm2, %ymm0
938; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
939; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
940; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
941; AVX2-SLOW-NEXT:    retq
942;
943; AVX2-FAST-ALL-LABEL: trunc_packus_v8i64_v8i32:
944; AVX2-FAST-ALL:       # %bb.0:
945; AVX2-FAST-ALL-NEXT:    vmovdqa (%rdi), %ymm0
946; AVX2-FAST-ALL-NEXT:    vmovdqa 32(%rdi), %ymm1
947; AVX2-FAST-ALL-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
948; AVX2-FAST-ALL-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
949; AVX2-FAST-ALL-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
950; AVX2-FAST-ALL-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
951; AVX2-FAST-ALL-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
952; AVX2-FAST-ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
953; AVX2-FAST-ALL-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
954; AVX2-FAST-ALL-NEXT:    vpand %ymm1, %ymm3, %ymm1
955; AVX2-FAST-ALL-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
956; AVX2-FAST-ALL-NEXT:    vpand %ymm0, %ymm2, %ymm0
957; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
958; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm2, %ymm0
959; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
960; AVX2-FAST-ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
961; AVX2-FAST-ALL-NEXT:    retq
962;
963; AVX2-FAST-PERLANE-LABEL: trunc_packus_v8i64_v8i32:
964; AVX2-FAST-PERLANE:       # %bb.0:
965; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
966; AVX2-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
967; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
968; AVX2-FAST-PERLANE-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
969; AVX2-FAST-PERLANE-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
970; AVX2-FAST-PERLANE-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
971; AVX2-FAST-PERLANE-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
972; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
973; AVX2-FAST-PERLANE-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
974; AVX2-FAST-PERLANE-NEXT:    vpand %ymm1, %ymm3, %ymm1
975; AVX2-FAST-PERLANE-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
976; AVX2-FAST-PERLANE-NEXT:    vpand %ymm0, %ymm2, %ymm0
977; AVX2-FAST-PERLANE-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
978; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
979; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
980; AVX2-FAST-PERLANE-NEXT:    retq
981;
982; AVX512-LABEL: trunc_packus_v8i64_v8i32:
983; AVX512:       # %bb.0:
984; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
985; AVX512-NEXT:    vpmaxsq (%rdi), %zmm0, %zmm0
986; AVX512-NEXT:    vpmovusqd %zmm0, %ymm0
987; AVX512-NEXT:    retq
988;
989; SKX-LABEL: trunc_packus_v8i64_v8i32:
990; SKX:       # %bb.0:
991; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
992; SKX-NEXT:    vpmaxsq (%rdi), %ymm0, %ymm1
993; SKX-NEXT:    vpmovusqd %ymm1, %xmm1
994; SKX-NEXT:    vpmaxsq 32(%rdi), %ymm0, %ymm0
995; SKX-NEXT:    vpmovusqd %ymm0, %xmm0
996; SKX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
997; SKX-NEXT:    retq
998  %a0 = load <8 x i64>, ptr %p0
999  %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1000  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1001  %3 = icmp sgt <8 x i64> %2, zeroinitializer
1002  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1003  %5 = trunc <8 x i64> %4 to <8 x i32>
1004  ret <8 x i32> %5
1005}
1006
1007;
1008; PACKUS saturation truncation to vXi16
1009;
1010
1011define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) {
1012; SSE2-LABEL: trunc_packus_v2i64_v2i16:
1013; SSE2:       # %bb.0:
1014; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
1015; SSE2-NEXT:    movdqa %xmm0, %xmm2
1016; SSE2-NEXT:    pxor %xmm1, %xmm2
1017; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1018; SSE2-NEXT:    pxor %xmm4, %xmm4
1019; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
1020; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
1021; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
1022; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1023; SSE2-NEXT:    pand %xmm4, %xmm2
1024; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1025; SSE2-NEXT:    por %xmm2, %xmm3
1026; SSE2-NEXT:    pand %xmm3, %xmm0
1027; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1028; SSE2-NEXT:    por %xmm0, %xmm3
1029; SSE2-NEXT:    movdqa %xmm3, %xmm0
1030; SSE2-NEXT:    pxor %xmm1, %xmm0
1031; SSE2-NEXT:    movdqa %xmm0, %xmm2
1032; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1033; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1034; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
1035; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1036; SSE2-NEXT:    pand %xmm4, %xmm0
1037; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
1038; SSE2-NEXT:    por %xmm0, %xmm1
1039; SSE2-NEXT:    pand %xmm3, %xmm1
1040; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1041; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1042; SSE2-NEXT:    retq
1043;
1044; SSSE3-LABEL: trunc_packus_v2i64_v2i16:
1045; SSSE3:       # %bb.0:
1046; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
1047; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1048; SSSE3-NEXT:    pxor %xmm1, %xmm2
1049; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1050; SSSE3-NEXT:    pxor %xmm4, %xmm4
1051; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
1052; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
1053; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
1054; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1055; SSSE3-NEXT:    pand %xmm4, %xmm2
1056; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1057; SSSE3-NEXT:    por %xmm2, %xmm3
1058; SSSE3-NEXT:    pand %xmm3, %xmm0
1059; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1060; SSSE3-NEXT:    por %xmm0, %xmm3
1061; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1062; SSSE3-NEXT:    pxor %xmm1, %xmm0
1063; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1064; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1065; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1066; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
1067; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1068; SSSE3-NEXT:    pand %xmm4, %xmm0
1069; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
1070; SSSE3-NEXT:    por %xmm0, %xmm1
1071; SSSE3-NEXT:    pand %xmm3, %xmm1
1072; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1073; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1074; SSSE3-NEXT:    retq
1075;
1076; SSE41-LABEL: trunc_packus_v2i64_v2i16:
1077; SSE41:       # %bb.0:
1078; SSE41-NEXT:    movdqa %xmm0, %xmm1
1079; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65535,65535]
1080; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
1081; SSE41-NEXT:    pxor %xmm3, %xmm0
1082; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
1083; SSE41-NEXT:    movdqa %xmm4, %xmm5
1084; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
1085; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
1086; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
1087; SSE41-NEXT:    pand %xmm5, %xmm0
1088; SSE41-NEXT:    por %xmm4, %xmm0
1089; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
1090; SSE41-NEXT:    xorpd %xmm1, %xmm1
1091; SSE41-NEXT:    movapd %xmm2, %xmm4
1092; SSE41-NEXT:    xorpd %xmm3, %xmm4
1093; SSE41-NEXT:    movapd %xmm4, %xmm5
1094; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
1095; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
1096; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
1097; SSE41-NEXT:    pand %xmm5, %xmm0
1098; SSE41-NEXT:    por %xmm4, %xmm0
1099; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
1100; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1101; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1102; SSE41-NEXT:    retq
1103;
1104; AVX1-LABEL: trunc_packus_v2i64_v2i16:
1105; AVX1:       # %bb.0:
1106; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1107; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1108; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1109; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1110; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
1111; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
1112; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1113; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1114; AVX1-NEXT:    retq
1115;
1116; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16:
1117; AVX2-SLOW:       # %bb.0:
1118; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1119; AVX2-SLOW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1120; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1121; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1122; AVX2-SLOW-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
1123; AVX2-SLOW-NEXT:    vpand %xmm0, %xmm1, %xmm0
1124; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1125; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1126; AVX2-SLOW-NEXT:    retq
1127;
1128; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16:
1129; AVX2-FAST:       # %bb.0:
1130; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1131; AVX2-FAST-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1132; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1133; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1134; AVX2-FAST-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
1135; AVX2-FAST-NEXT:    vpand %xmm0, %xmm1, %xmm0
1136; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
1137; AVX2-FAST-NEXT:    retq
1138;
1139; AVX512F-LABEL: trunc_packus_v2i64_v2i16:
1140; AVX512F:       # %bb.0:
1141; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1142; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1143; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1144; AVX512F-NEXT:    vpmovusqw %zmm0, %xmm0
1145; AVX512F-NEXT:    vzeroupper
1146; AVX512F-NEXT:    retq
1147;
1148; AVX512VL-LABEL: trunc_packus_v2i64_v2i16:
1149; AVX512VL:       # %bb.0:
1150; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1151; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
1152; AVX512VL-NEXT:    vpmovusqw %xmm0, %xmm0
1153; AVX512VL-NEXT:    retq
1154;
1155; AVX512BW-LABEL: trunc_packus_v2i64_v2i16:
1156; AVX512BW:       # %bb.0:
1157; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1158; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1159; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1160; AVX512BW-NEXT:    vpmovusqw %zmm0, %xmm0
1161; AVX512BW-NEXT:    vzeroupper
1162; AVX512BW-NEXT:    retq
1163;
1164; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i16:
1165; AVX512BWVL:       # %bb.0:
1166; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1167; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
1168; AVX512BWVL-NEXT:    vpmovusqw %xmm0, %xmm0
1169; AVX512BWVL-NEXT:    retq
1170;
1171; SKX-LABEL: trunc_packus_v2i64_v2i16:
1172; SKX:       # %bb.0:
1173; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1174; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
1175; SKX-NEXT:    vpmovusqw %xmm0, %xmm0
1176; SKX-NEXT:    retq
1177  %1 = icmp slt <2 x i64> %a0, <i64 65535, i64 65535>
1178  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 65535, i64 65535>
1179  %3 = icmp sgt <2 x i64> %2, zeroinitializer
1180  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
1181  %5 = trunc <2 x i64> %4 to <2 x i16>
1182  ret <2 x i16> %5
1183}
1184
1185define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
1186; SSE2-LABEL: trunc_packus_v2i64_v2i16_store:
1187; SSE2:       # %bb.0:
1188; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
1189; SSE2-NEXT:    movdqa %xmm0, %xmm2
1190; SSE2-NEXT:    pxor %xmm1, %xmm2
1191; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1192; SSE2-NEXT:    pxor %xmm4, %xmm4
1193; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
1194; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
1195; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
1196; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1197; SSE2-NEXT:    pand %xmm4, %xmm2
1198; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1199; SSE2-NEXT:    por %xmm2, %xmm3
1200; SSE2-NEXT:    pand %xmm3, %xmm0
1201; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1202; SSE2-NEXT:    por %xmm0, %xmm3
1203; SSE2-NEXT:    movdqa %xmm3, %xmm0
1204; SSE2-NEXT:    pxor %xmm1, %xmm0
1205; SSE2-NEXT:    movdqa %xmm0, %xmm2
1206; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1207; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1208; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
1209; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1210; SSE2-NEXT:    pand %xmm4, %xmm0
1211; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
1212; SSE2-NEXT:    por %xmm0, %xmm1
1213; SSE2-NEXT:    pand %xmm3, %xmm1
1214; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1215; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1216; SSE2-NEXT:    movd %xmm0, (%rdi)
1217; SSE2-NEXT:    retq
1218;
1219; SSSE3-LABEL: trunc_packus_v2i64_v2i16_store:
1220; SSSE3:       # %bb.0:
1221; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
1222; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1223; SSSE3-NEXT:    pxor %xmm1, %xmm2
1224; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1225; SSSE3-NEXT:    pxor %xmm4, %xmm4
1226; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
1227; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
1228; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
1229; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1230; SSSE3-NEXT:    pand %xmm4, %xmm2
1231; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1232; SSSE3-NEXT:    por %xmm2, %xmm3
1233; SSSE3-NEXT:    pand %xmm3, %xmm0
1234; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1235; SSSE3-NEXT:    por %xmm0, %xmm3
1236; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1237; SSSE3-NEXT:    pxor %xmm1, %xmm0
1238; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1239; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1240; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1241; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
1242; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1243; SSSE3-NEXT:    pand %xmm4, %xmm0
1244; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
1245; SSSE3-NEXT:    por %xmm0, %xmm1
1246; SSSE3-NEXT:    pand %xmm3, %xmm1
1247; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1248; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1249; SSSE3-NEXT:    movd %xmm0, (%rdi)
1250; SSSE3-NEXT:    retq
1251;
1252; SSE41-LABEL: trunc_packus_v2i64_v2i16_store:
1253; SSE41:       # %bb.0:
1254; SSE41-NEXT:    movdqa %xmm0, %xmm1
1255; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65535,65535]
1256; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
1257; SSE41-NEXT:    pxor %xmm3, %xmm0
1258; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
1259; SSE41-NEXT:    movdqa %xmm4, %xmm5
1260; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
1261; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
1262; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
1263; SSE41-NEXT:    pand %xmm5, %xmm0
1264; SSE41-NEXT:    por %xmm4, %xmm0
1265; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
1266; SSE41-NEXT:    xorpd %xmm1, %xmm1
1267; SSE41-NEXT:    movapd %xmm2, %xmm4
1268; SSE41-NEXT:    xorpd %xmm3, %xmm4
1269; SSE41-NEXT:    movapd %xmm4, %xmm5
1270; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
1271; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
1272; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
1273; SSE41-NEXT:    pand %xmm5, %xmm0
1274; SSE41-NEXT:    por %xmm4, %xmm0
1275; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
1276; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1277; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1278; SSE41-NEXT:    movd %xmm0, (%rdi)
1279; SSE41-NEXT:    retq
1280;
1281; AVX1-LABEL: trunc_packus_v2i64_v2i16_store:
1282; AVX1:       # %bb.0:
1283; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1284; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1285; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1286; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1287; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
1288; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
1289; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1290; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1291; AVX1-NEXT:    vmovd %xmm0, (%rdi)
1292; AVX1-NEXT:    retq
1293;
1294; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store:
1295; AVX2-SLOW:       # %bb.0:
1296; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1297; AVX2-SLOW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1298; AVX2-SLOW-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1299; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1300; AVX2-SLOW-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
1301; AVX2-SLOW-NEXT:    vpand %xmm0, %xmm1, %xmm0
1302; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1303; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1304; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rdi)
1305; AVX2-SLOW-NEXT:    retq
1306;
1307; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store:
1308; AVX2-FAST:       # %bb.0:
1309; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1310; AVX2-FAST-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1311; AVX2-FAST-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1312; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1313; AVX2-FAST-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
1314; AVX2-FAST-NEXT:    vpand %xmm0, %xmm1, %xmm0
1315; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
1316; AVX2-FAST-NEXT:    vmovd %xmm0, (%rdi)
1317; AVX2-FAST-NEXT:    retq
1318;
1319; AVX512F-LABEL: trunc_packus_v2i64_v2i16_store:
1320; AVX512F:       # %bb.0:
1321; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1322; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1323; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1324; AVX512F-NEXT:    vpmovusqw %zmm0, %xmm0
1325; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
1326; AVX512F-NEXT:    vzeroupper
1327; AVX512F-NEXT:    retq
1328;
1329; AVX512VL-LABEL: trunc_packus_v2i64_v2i16_store:
1330; AVX512VL:       # %bb.0:
1331; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1332; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
1333; AVX512VL-NEXT:    vpmovusqw %xmm0, (%rdi)
1334; AVX512VL-NEXT:    retq
1335;
1336; AVX512BW-LABEL: trunc_packus_v2i64_v2i16_store:
1337; AVX512BW:       # %bb.0:
1338; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1339; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1340; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1341; AVX512BW-NEXT:    vpmovusqw %zmm0, %xmm0
1342; AVX512BW-NEXT:    vmovd %xmm0, (%rdi)
1343; AVX512BW-NEXT:    vzeroupper
1344; AVX512BW-NEXT:    retq
1345;
1346; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i16_store:
1347; AVX512BWVL:       # %bb.0:
1348; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1349; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
1350; AVX512BWVL-NEXT:    vpmovusqw %xmm0, (%rdi)
1351; AVX512BWVL-NEXT:    retq
1352;
1353; SKX-LABEL: trunc_packus_v2i64_v2i16_store:
1354; SKX:       # %bb.0:
1355; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1356; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
1357; SKX-NEXT:    vpmovusqw %xmm0, (%rdi)
1358; SKX-NEXT:    retq
1359  %1 = icmp slt <2 x i64> %a0, <i64 65535, i64 65535>
1360  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 65535, i64 65535>
1361  %3 = icmp sgt <2 x i64> %2, zeroinitializer
1362  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
1363  %5 = trunc <2 x i64> %4 to <2 x i16>
1364  store <2 x i16> %5, ptr%p1
1365  ret void
1366}
1367
1368define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) {
1369; SSE2-LABEL: trunc_packus_v4i64_v4i16:
1370; SSE2:       # %bb.0:
1371; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
1372; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
1373; SSE2-NEXT:    movdqa %xmm1, %xmm3
1374; SSE2-NEXT:    pxor %xmm2, %xmm3
1375; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1376; SSE2-NEXT:    pxor %xmm9, %xmm9
1377; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
1378; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
1379; SSE2-NEXT:    movdqa %xmm4, %xmm7
1380; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
1381; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
1382; SSE2-NEXT:    pand %xmm5, %xmm6
1383; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
1384; SSE2-NEXT:    por %xmm6, %xmm3
1385; SSE2-NEXT:    pand %xmm3, %xmm1
1386; SSE2-NEXT:    pandn %xmm8, %xmm3
1387; SSE2-NEXT:    por %xmm1, %xmm3
1388; SSE2-NEXT:    movdqa %xmm0, %xmm1
1389; SSE2-NEXT:    pxor %xmm2, %xmm1
1390; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
1391; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
1392; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
1393; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
1394; SSE2-NEXT:    pand %xmm5, %xmm1
1395; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1396; SSE2-NEXT:    por %xmm1, %xmm4
1397; SSE2-NEXT:    pand %xmm4, %xmm0
1398; SSE2-NEXT:    pandn %xmm8, %xmm4
1399; SSE2-NEXT:    por %xmm0, %xmm4
1400; SSE2-NEXT:    movdqa %xmm4, %xmm0
1401; SSE2-NEXT:    pxor %xmm2, %xmm0
1402; SSE2-NEXT:    movdqa %xmm0, %xmm1
1403; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
1404; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
1405; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
1406; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1407; SSE2-NEXT:    pand %xmm5, %xmm0
1408; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1409; SSE2-NEXT:    por %xmm0, %xmm1
1410; SSE2-NEXT:    pand %xmm4, %xmm1
1411; SSE2-NEXT:    movdqa %xmm3, %xmm0
1412; SSE2-NEXT:    pxor %xmm2, %xmm0
1413; SSE2-NEXT:    movdqa %xmm0, %xmm4
1414; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
1415; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1416; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
1417; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1418; SSE2-NEXT:    pand %xmm5, %xmm0
1419; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1420; SSE2-NEXT:    por %xmm0, %xmm2
1421; SSE2-NEXT:    pand %xmm3, %xmm2
1422; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1423; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
1424; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1425; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1426; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1427; SSE2-NEXT:    retq
1428;
1429; SSSE3-LABEL: trunc_packus_v4i64_v4i16:
1430; SSSE3:       # %bb.0:
1431; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
1432; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
1433; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1434; SSSE3-NEXT:    pxor %xmm2, %xmm3
1435; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1436; SSSE3-NEXT:    pxor %xmm9, %xmm9
1437; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
1438; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
1439; SSSE3-NEXT:    movdqa %xmm4, %xmm7
1440; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
1441; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
1442; SSSE3-NEXT:    pand %xmm5, %xmm6
1443; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
1444; SSSE3-NEXT:    por %xmm6, %xmm3
1445; SSSE3-NEXT:    pand %xmm3, %xmm1
1446; SSSE3-NEXT:    pandn %xmm8, %xmm3
1447; SSSE3-NEXT:    por %xmm1, %xmm3
1448; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1449; SSSE3-NEXT:    pxor %xmm2, %xmm1
1450; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
1451; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
1452; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
1453; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
1454; SSSE3-NEXT:    pand %xmm5, %xmm1
1455; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1456; SSSE3-NEXT:    por %xmm1, %xmm4
1457; SSSE3-NEXT:    pand %xmm4, %xmm0
1458; SSSE3-NEXT:    pandn %xmm8, %xmm4
1459; SSSE3-NEXT:    por %xmm0, %xmm4
1460; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1461; SSSE3-NEXT:    pxor %xmm2, %xmm0
1462; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1463; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
1464; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
1465; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
1466; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1467; SSSE3-NEXT:    pand %xmm5, %xmm0
1468; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1469; SSSE3-NEXT:    por %xmm0, %xmm1
1470; SSSE3-NEXT:    pand %xmm4, %xmm1
1471; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1472; SSSE3-NEXT:    pxor %xmm2, %xmm0
1473; SSSE3-NEXT:    movdqa %xmm0, %xmm4
1474; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
1475; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1476; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
1477; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1478; SSSE3-NEXT:    pand %xmm5, %xmm0
1479; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1480; SSSE3-NEXT:    por %xmm0, %xmm2
1481; SSSE3-NEXT:    pand %xmm3, %xmm2
1482; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1483; SSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
1484; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1485; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1486; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1487; SSSE3-NEXT:    retq
1488;
1489; SSE41-LABEL: trunc_packus_v4i64_v4i16:
1490; SSE41:       # %bb.0:
1491; SSE41-NEXT:    movdqa %xmm0, %xmm2
1492; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [65535,65535]
1493; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
1494; SSE41-NEXT:    pxor %xmm3, %xmm0
1495; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183]
1496; SSE41-NEXT:    movdqa %xmm6, %xmm5
1497; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
1498; SSE41-NEXT:    movdqa %xmm6, %xmm7
1499; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
1500; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
1501; SSE41-NEXT:    pand %xmm5, %xmm0
1502; SSE41-NEXT:    por %xmm7, %xmm0
1503; SSE41-NEXT:    movapd %xmm4, %xmm5
1504; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
1505; SSE41-NEXT:    movdqa %xmm1, %xmm0
1506; SSE41-NEXT:    pxor %xmm3, %xmm0
1507; SSE41-NEXT:    movdqa %xmm6, %xmm2
1508; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
1509; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
1510; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
1511; SSE41-NEXT:    pand %xmm2, %xmm0
1512; SSE41-NEXT:    por %xmm6, %xmm0
1513; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
1514; SSE41-NEXT:    xorpd %xmm1, %xmm1
1515; SSE41-NEXT:    movapd %xmm4, %xmm2
1516; SSE41-NEXT:    xorpd %xmm3, %xmm2
1517; SSE41-NEXT:    movapd %xmm2, %xmm6
1518; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
1519; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
1520; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
1521; SSE41-NEXT:    pand %xmm6, %xmm0
1522; SSE41-NEXT:    por %xmm2, %xmm0
1523; SSE41-NEXT:    pxor %xmm2, %xmm2
1524; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
1525; SSE41-NEXT:    movapd %xmm5, %xmm4
1526; SSE41-NEXT:    xorpd %xmm3, %xmm4
1527; SSE41-NEXT:    movapd %xmm4, %xmm6
1528; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
1529; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
1530; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
1531; SSE41-NEXT:    pand %xmm6, %xmm0
1532; SSE41-NEXT:    por %xmm4, %xmm0
1533; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
1534; SSE41-NEXT:    packusdw %xmm2, %xmm1
1535; SSE41-NEXT:    packusdw %xmm1, %xmm1
1536; SSE41-NEXT:    movdqa %xmm1, %xmm0
1537; SSE41-NEXT:    retq
1538;
1539; AVX1-LABEL: trunc_packus_v4i64_v4i16:
1540; AVX1:       # %bb.0:
1541; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1542; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1543; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
1544; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1545; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
1546; AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
1547; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1548; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
1549; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
1550; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
1551; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1552; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
1553; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
1554; AVX1-NEXT:    vzeroupper
1555; AVX1-NEXT:    retq
1556;
1557; AVX2-LABEL: trunc_packus_v4i64_v4i16:
1558; AVX2:       # %bb.0:
1559; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535]
1560; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
1561; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1562; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1563; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
1564; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
1565; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1566; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1567; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
1568; AVX2-NEXT:    vzeroupper
1569; AVX2-NEXT:    retq
1570;
1571; AVX512F-LABEL: trunc_packus_v4i64_v4i16:
1572; AVX512F:       # %bb.0:
1573; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1574; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1575; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1576; AVX512F-NEXT:    vpmovusqw %zmm0, %xmm0
1577; AVX512F-NEXT:    vzeroupper
1578; AVX512F-NEXT:    retq
1579;
1580; AVX512VL-LABEL: trunc_packus_v4i64_v4i16:
1581; AVX512VL:       # %bb.0:
1582; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1583; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
1584; AVX512VL-NEXT:    vpmovusqw %ymm0, %xmm0
1585; AVX512VL-NEXT:    vzeroupper
1586; AVX512VL-NEXT:    retq
1587;
1588; AVX512BW-LABEL: trunc_packus_v4i64_v4i16:
1589; AVX512BW:       # %bb.0:
1590; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1591; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1592; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1593; AVX512BW-NEXT:    vpmovusqw %zmm0, %xmm0
1594; AVX512BW-NEXT:    vzeroupper
1595; AVX512BW-NEXT:    retq
1596;
1597; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i16:
1598; AVX512BWVL:       # %bb.0:
1599; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1600; AVX512BWVL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
1601; AVX512BWVL-NEXT:    vpmovusqw %ymm0, %xmm0
1602; AVX512BWVL-NEXT:    vzeroupper
1603; AVX512BWVL-NEXT:    retq
1604;
1605; SKX-LABEL: trunc_packus_v4i64_v4i16:
1606; SKX:       # %bb.0:
1607; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1608; SKX-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
1609; SKX-NEXT:    vpmovusqw %ymm0, %xmm0
1610; SKX-NEXT:    vzeroupper
1611; SKX-NEXT:    retq
1612  %1 = icmp slt <4 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535>
1613  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535>
1614  %3 = icmp sgt <4 x i64> %2, zeroinitializer
1615  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
1616  %5 = trunc <4 x i64> %4 to <4 x i16>
1617  ret <4 x i16> %5
1618}
1619
1620define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
1621; SSE2-LABEL: trunc_packus_v4i64_v4i16_store:
1622; SSE2:       # %bb.0:
1623; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
1624; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
1625; SSE2-NEXT:    movdqa %xmm1, %xmm3
1626; SSE2-NEXT:    pxor %xmm2, %xmm3
1627; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1628; SSE2-NEXT:    pxor %xmm9, %xmm9
1629; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
1630; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
1631; SSE2-NEXT:    movdqa %xmm4, %xmm7
1632; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
1633; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
1634; SSE2-NEXT:    pand %xmm5, %xmm6
1635; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
1636; SSE2-NEXT:    por %xmm6, %xmm3
1637; SSE2-NEXT:    pand %xmm3, %xmm1
1638; SSE2-NEXT:    pandn %xmm8, %xmm3
1639; SSE2-NEXT:    por %xmm1, %xmm3
1640; SSE2-NEXT:    movdqa %xmm0, %xmm1
1641; SSE2-NEXT:    pxor %xmm2, %xmm1
1642; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
1643; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
1644; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
1645; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
1646; SSE2-NEXT:    pand %xmm5, %xmm1
1647; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1648; SSE2-NEXT:    por %xmm1, %xmm4
1649; SSE2-NEXT:    pand %xmm4, %xmm0
1650; SSE2-NEXT:    pandn %xmm8, %xmm4
1651; SSE2-NEXT:    por %xmm0, %xmm4
1652; SSE2-NEXT:    movdqa %xmm4, %xmm0
1653; SSE2-NEXT:    pxor %xmm2, %xmm0
1654; SSE2-NEXT:    movdqa %xmm0, %xmm1
1655; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
1656; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
1657; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
1658; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1659; SSE2-NEXT:    pand %xmm5, %xmm0
1660; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1661; SSE2-NEXT:    por %xmm0, %xmm1
1662; SSE2-NEXT:    pand %xmm4, %xmm1
1663; SSE2-NEXT:    movdqa %xmm3, %xmm0
1664; SSE2-NEXT:    pxor %xmm2, %xmm0
1665; SSE2-NEXT:    movdqa %xmm0, %xmm4
1666; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
1667; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1668; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
1669; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1670; SSE2-NEXT:    pand %xmm5, %xmm0
1671; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1672; SSE2-NEXT:    por %xmm0, %xmm2
1673; SSE2-NEXT:    pand %xmm3, %xmm2
1674; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1675; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1676; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1677; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1678; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1679; SSE2-NEXT:    movq %xmm1, (%rdi)
1680; SSE2-NEXT:    retq
1681;
1682; SSSE3-LABEL: trunc_packus_v4i64_v4i16_store:
1683; SSSE3:       # %bb.0:
1684; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
1685; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
1686; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1687; SSSE3-NEXT:    pxor %xmm2, %xmm3
1688; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1689; SSSE3-NEXT:    pxor %xmm9, %xmm9
1690; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
1691; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
1692; SSSE3-NEXT:    movdqa %xmm4, %xmm7
1693; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
1694; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
1695; SSSE3-NEXT:    pand %xmm5, %xmm6
1696; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
1697; SSSE3-NEXT:    por %xmm6, %xmm3
1698; SSSE3-NEXT:    pand %xmm3, %xmm1
1699; SSSE3-NEXT:    pandn %xmm8, %xmm3
1700; SSSE3-NEXT:    por %xmm1, %xmm3
1701; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1702; SSSE3-NEXT:    pxor %xmm2, %xmm1
1703; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
1704; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
1705; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
1706; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
1707; SSSE3-NEXT:    pand %xmm5, %xmm1
1708; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1709; SSSE3-NEXT:    por %xmm1, %xmm4
1710; SSSE3-NEXT:    pand %xmm4, %xmm0
1711; SSSE3-NEXT:    pandn %xmm8, %xmm4
1712; SSSE3-NEXT:    por %xmm0, %xmm4
1713; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1714; SSSE3-NEXT:    pxor %xmm2, %xmm0
1715; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1716; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
1717; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
1718; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
1719; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1720; SSSE3-NEXT:    pand %xmm5, %xmm0
1721; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1722; SSSE3-NEXT:    por %xmm0, %xmm1
1723; SSSE3-NEXT:    pand %xmm4, %xmm1
1724; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1725; SSSE3-NEXT:    pxor %xmm2, %xmm0
1726; SSSE3-NEXT:    movdqa %xmm0, %xmm4
1727; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
1728; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
1729; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
1730; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1731; SSSE3-NEXT:    pand %xmm5, %xmm0
1732; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
1733; SSSE3-NEXT:    por %xmm0, %xmm2
1734; SSSE3-NEXT:    pand %xmm3, %xmm2
1735; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1736; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1737; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1738; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1739; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1740; SSSE3-NEXT:    movq %xmm1, (%rdi)
1741; SSSE3-NEXT:    retq
1742;
1743; SSE41-LABEL: trunc_packus_v4i64_v4i16_store:
1744; SSE41:       # %bb.0:
1745; SSE41-NEXT:    movdqa %xmm0, %xmm2
1746; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [65535,65535]
1747; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
1748; SSE41-NEXT:    pxor %xmm3, %xmm0
1749; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183]
1750; SSE41-NEXT:    movdqa %xmm6, %xmm5
1751; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
1752; SSE41-NEXT:    movdqa %xmm6, %xmm7
1753; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
1754; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
1755; SSE41-NEXT:    pand %xmm5, %xmm0
1756; SSE41-NEXT:    por %xmm7, %xmm0
1757; SSE41-NEXT:    movapd %xmm4, %xmm5
1758; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
1759; SSE41-NEXT:    movdqa %xmm1, %xmm0
1760; SSE41-NEXT:    pxor %xmm3, %xmm0
1761; SSE41-NEXT:    movdqa %xmm6, %xmm2
1762; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
1763; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
1764; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
1765; SSE41-NEXT:    pand %xmm2, %xmm0
1766; SSE41-NEXT:    por %xmm6, %xmm0
1767; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
1768; SSE41-NEXT:    xorpd %xmm1, %xmm1
1769; SSE41-NEXT:    movapd %xmm4, %xmm2
1770; SSE41-NEXT:    xorpd %xmm3, %xmm2
1771; SSE41-NEXT:    movapd %xmm2, %xmm6
1772; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
1773; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
1774; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
1775; SSE41-NEXT:    pand %xmm6, %xmm0
1776; SSE41-NEXT:    por %xmm2, %xmm0
1777; SSE41-NEXT:    pxor %xmm2, %xmm2
1778; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
1779; SSE41-NEXT:    movapd %xmm5, %xmm4
1780; SSE41-NEXT:    xorpd %xmm3, %xmm4
1781; SSE41-NEXT:    movapd %xmm4, %xmm6
1782; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
1783; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
1784; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
1785; SSE41-NEXT:    pand %xmm6, %xmm0
1786; SSE41-NEXT:    por %xmm4, %xmm0
1787; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
1788; SSE41-NEXT:    packusdw %xmm2, %xmm1
1789; SSE41-NEXT:    packusdw %xmm1, %xmm1
1790; SSE41-NEXT:    movq %xmm1, (%rdi)
1791; SSE41-NEXT:    retq
1792;
1793; AVX1-LABEL: trunc_packus_v4i64_v4i16_store:
1794; AVX1:       # %bb.0:
1795; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535]
1796; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
1797; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
1798; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1799; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
1800; AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
1801; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1802; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
1803; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
1804; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
1805; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1806; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
1807; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
1808; AVX1-NEXT:    vmovq %xmm0, (%rdi)
1809; AVX1-NEXT:    vzeroupper
1810; AVX1-NEXT:    retq
1811;
1812; AVX2-LABEL: trunc_packus_v4i64_v4i16_store:
1813; AVX2:       # %bb.0:
1814; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535]
1815; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
1816; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
1817; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1818; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
1819; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
1820; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1821; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1822; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
1823; AVX2-NEXT:    vmovq %xmm0, (%rdi)
1824; AVX2-NEXT:    vzeroupper
1825; AVX2-NEXT:    retq
1826;
1827; AVX512F-LABEL: trunc_packus_v4i64_v4i16_store:
1828; AVX512F:       # %bb.0:
1829; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1830; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1831; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1832; AVX512F-NEXT:    vpmovusqw %zmm0, %xmm0
1833; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
1834; AVX512F-NEXT:    vzeroupper
1835; AVX512F-NEXT:    retq
1836;
1837; AVX512VL-LABEL: trunc_packus_v4i64_v4i16_store:
1838; AVX512VL:       # %bb.0:
1839; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1840; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
1841; AVX512VL-NEXT:    vpmovusqw %ymm0, (%rdi)
1842; AVX512VL-NEXT:    vzeroupper
1843; AVX512VL-NEXT:    retq
1844;
1845; AVX512BW-LABEL: trunc_packus_v4i64_v4i16_store:
1846; AVX512BW:       # %bb.0:
1847; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1848; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1849; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
1850; AVX512BW-NEXT:    vpmovusqw %zmm0, %xmm0
1851; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
1852; AVX512BW-NEXT:    vzeroupper
1853; AVX512BW-NEXT:    retq
1854;
1855; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i16_store:
1856; AVX512BWVL:       # %bb.0:
1857; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1858; AVX512BWVL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
1859; AVX512BWVL-NEXT:    vpmovusqw %ymm0, (%rdi)
1860; AVX512BWVL-NEXT:    vzeroupper
1861; AVX512BWVL-NEXT:    retq
1862;
1863; SKX-LABEL: trunc_packus_v4i64_v4i16_store:
1864; SKX:       # %bb.0:
1865; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1866; SKX-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
1867; SKX-NEXT:    vpmovusqw %ymm0, (%rdi)
1868; SKX-NEXT:    vzeroupper
1869; SKX-NEXT:    retq
1870  %1 = icmp slt <4 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535>
1871  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535>
1872  %3 = icmp sgt <4 x i64> %2, zeroinitializer
1873  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
1874  %5 = trunc <4 x i64> %4 to <4 x i16>
1875  store <4 x i16> %5, ptr%p1
1876  ret void
1877}
1878
1879define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" {
1880; SSE2-LABEL: trunc_packus_v8i64_v8i16:
1881; SSE2:       # %bb.0:
1882; SSE2-NEXT:    movdqa (%rdi), %xmm4
1883; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
1884; SSE2-NEXT:    movdqa 32(%rdi), %xmm10
1885; SSE2-NEXT:    movdqa 48(%rdi), %xmm6
1886; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
1887; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
1888; SSE2-NEXT:    movdqa %xmm2, %xmm1
1889; SSE2-NEXT:    pxor %xmm11, %xmm1
1890; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
1891; SSE2-NEXT:    pxor %xmm9, %xmm9
1892; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
1893; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
1894; SSE2-NEXT:    movdqa %xmm3, %xmm5
1895; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
1896; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
1897; SSE2-NEXT:    pand %xmm7, %xmm0
1898; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
1899; SSE2-NEXT:    por %xmm0, %xmm1
1900; SSE2-NEXT:    pand %xmm1, %xmm2
1901; SSE2-NEXT:    pandn %xmm8, %xmm1
1902; SSE2-NEXT:    por %xmm2, %xmm1
1903; SSE2-NEXT:    movdqa %xmm4, %xmm0
1904; SSE2-NEXT:    pxor %xmm11, %xmm0
1905; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1906; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
1907; SSE2-NEXT:    movdqa %xmm3, %xmm5
1908; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
1909; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
1910; SSE2-NEXT:    pand %xmm2, %xmm0
1911; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
1912; SSE2-NEXT:    por %xmm0, %xmm2
1913; SSE2-NEXT:    pand %xmm2, %xmm4
1914; SSE2-NEXT:    pandn %xmm8, %xmm2
1915; SSE2-NEXT:    por %xmm4, %xmm2
1916; SSE2-NEXT:    movdqa %xmm6, %xmm0
1917; SSE2-NEXT:    pxor %xmm11, %xmm0
1918; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1919; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
1920; SSE2-NEXT:    movdqa %xmm3, %xmm5
1921; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
1922; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
1923; SSE2-NEXT:    pand %xmm4, %xmm0
1924; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
1925; SSE2-NEXT:    por %xmm0, %xmm4
1926; SSE2-NEXT:    pand %xmm4, %xmm6
1927; SSE2-NEXT:    pandn %xmm8, %xmm4
1928; SSE2-NEXT:    por %xmm6, %xmm4
1929; SSE2-NEXT:    movdqa %xmm10, %xmm0
1930; SSE2-NEXT:    pxor %xmm11, %xmm0
1931; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
1932; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
1933; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1934; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
1935; SSE2-NEXT:    pand %xmm5, %xmm0
1936; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
1937; SSE2-NEXT:    por %xmm0, %xmm5
1938; SSE2-NEXT:    pand %xmm5, %xmm10
1939; SSE2-NEXT:    pandn %xmm8, %xmm5
1940; SSE2-NEXT:    por %xmm10, %xmm5
1941; SSE2-NEXT:    movdqa %xmm5, %xmm0
1942; SSE2-NEXT:    pxor %xmm11, %xmm0
1943; SSE2-NEXT:    movdqa %xmm0, %xmm3
1944; SSE2-NEXT:    pcmpgtd %xmm11, %xmm3
1945; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
1946; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
1947; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1948; SSE2-NEXT:    pand %xmm6, %xmm0
1949; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1950; SSE2-NEXT:    por %xmm0, %xmm3
1951; SSE2-NEXT:    pand %xmm5, %xmm3
1952; SSE2-NEXT:    movdqa %xmm4, %xmm0
1953; SSE2-NEXT:    pxor %xmm11, %xmm0
1954; SSE2-NEXT:    movdqa %xmm0, %xmm5
1955; SSE2-NEXT:    pcmpgtd %xmm11, %xmm5
1956; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
1957; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
1958; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1959; SSE2-NEXT:    pand %xmm6, %xmm0
1960; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1961; SSE2-NEXT:    por %xmm0, %xmm5
1962; SSE2-NEXT:    pand %xmm4, %xmm5
1963; SSE2-NEXT:    movdqa %xmm2, %xmm0
1964; SSE2-NEXT:    pxor %xmm11, %xmm0
1965; SSE2-NEXT:    movdqa %xmm0, %xmm4
1966; SSE2-NEXT:    pcmpgtd %xmm11, %xmm4
1967; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1968; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
1969; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1970; SSE2-NEXT:    pand %xmm6, %xmm0
1971; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1972; SSE2-NEXT:    por %xmm0, %xmm4
1973; SSE2-NEXT:    pand %xmm2, %xmm4
1974; SSE2-NEXT:    movdqa %xmm1, %xmm0
1975; SSE2-NEXT:    pxor %xmm11, %xmm0
1976; SSE2-NEXT:    movdqa %xmm0, %xmm2
1977; SSE2-NEXT:    pcmpgtd %xmm11, %xmm2
1978; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
1979; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
1980; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1981; SSE2-NEXT:    pand %xmm6, %xmm0
1982; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1983; SSE2-NEXT:    por %xmm0, %xmm2
1984; SSE2-NEXT:    pand %xmm1, %xmm2
1985; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1986; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1987; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
1988; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1989; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1990; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
1991; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
1992; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1993; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1994; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1995; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1996; SSE2-NEXT:    retq
1997;
1998; SSSE3-LABEL: trunc_packus_v8i64_v8i16:
1999; SSSE3:       # %bb.0:
2000; SSSE3-NEXT:    movdqa (%rdi), %xmm4
2001; SSSE3-NEXT:    movdqa 16(%rdi), %xmm2
2002; SSSE3-NEXT:    movdqa 32(%rdi), %xmm10
2003; SSSE3-NEXT:    movdqa 48(%rdi), %xmm6
2004; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535]
2005; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
2006; SSSE3-NEXT:    movdqa %xmm2, %xmm1
2007; SSSE3-NEXT:    pxor %xmm11, %xmm1
2008; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
2009; SSSE3-NEXT:    pxor %xmm9, %xmm9
2010; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
2011; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147549183,2147549183]
2012; SSSE3-NEXT:    movdqa %xmm3, %xmm5
2013; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
2014; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
2015; SSSE3-NEXT:    pand %xmm7, %xmm0
2016; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
2017; SSSE3-NEXT:    por %xmm0, %xmm1
2018; SSSE3-NEXT:    pand %xmm1, %xmm2
2019; SSSE3-NEXT:    pandn %xmm8, %xmm1
2020; SSSE3-NEXT:    por %xmm2, %xmm1
2021; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2022; SSSE3-NEXT:    pxor %xmm11, %xmm0
2023; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
2024; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
2025; SSSE3-NEXT:    movdqa %xmm3, %xmm5
2026; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
2027; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
2028; SSSE3-NEXT:    pand %xmm2, %xmm0
2029; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
2030; SSSE3-NEXT:    por %xmm0, %xmm2
2031; SSSE3-NEXT:    pand %xmm2, %xmm4
2032; SSSE3-NEXT:    pandn %xmm8, %xmm2
2033; SSSE3-NEXT:    por %xmm4, %xmm2
2034; SSSE3-NEXT:    movdqa %xmm6, %xmm0
2035; SSSE3-NEXT:    pxor %xmm11, %xmm0
2036; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
2037; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
2038; SSSE3-NEXT:    movdqa %xmm3, %xmm5
2039; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
2040; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
2041; SSSE3-NEXT:    pand %xmm4, %xmm0
2042; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
2043; SSSE3-NEXT:    por %xmm0, %xmm4
2044; SSSE3-NEXT:    pand %xmm4, %xmm6
2045; SSSE3-NEXT:    pandn %xmm8, %xmm4
2046; SSSE3-NEXT:    por %xmm6, %xmm4
2047; SSSE3-NEXT:    movdqa %xmm10, %xmm0
2048; SSSE3-NEXT:    pxor %xmm11, %xmm0
2049; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2050; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
2051; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
2052; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
2053; SSSE3-NEXT:    pand %xmm5, %xmm0
2054; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2055; SSSE3-NEXT:    por %xmm0, %xmm5
2056; SSSE3-NEXT:    pand %xmm5, %xmm10
2057; SSSE3-NEXT:    pandn %xmm8, %xmm5
2058; SSSE3-NEXT:    por %xmm10, %xmm5
2059; SSSE3-NEXT:    movdqa %xmm5, %xmm0
2060; SSSE3-NEXT:    pxor %xmm11, %xmm0
2061; SSSE3-NEXT:    movdqa %xmm0, %xmm3
2062; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm3
2063; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
2064; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
2065; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2066; SSSE3-NEXT:    pand %xmm6, %xmm0
2067; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2068; SSSE3-NEXT:    por %xmm0, %xmm3
2069; SSSE3-NEXT:    pand %xmm5, %xmm3
2070; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2071; SSSE3-NEXT:    pxor %xmm11, %xmm0
2072; SSSE3-NEXT:    movdqa %xmm0, %xmm5
2073; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm5
2074; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
2075; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
2076; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2077; SSSE3-NEXT:    pand %xmm6, %xmm0
2078; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2079; SSSE3-NEXT:    por %xmm0, %xmm5
2080; SSSE3-NEXT:    pand %xmm4, %xmm5
2081; SSSE3-NEXT:    movdqa %xmm2, %xmm0
2082; SSSE3-NEXT:    pxor %xmm11, %xmm0
2083; SSSE3-NEXT:    movdqa %xmm0, %xmm4
2084; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm4
2085; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
2086; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
2087; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2088; SSSE3-NEXT:    pand %xmm6, %xmm0
2089; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2090; SSSE3-NEXT:    por %xmm0, %xmm4
2091; SSSE3-NEXT:    pand %xmm2, %xmm4
2092; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2093; SSSE3-NEXT:    pxor %xmm11, %xmm0
2094; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2095; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm2
2096; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
2097; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
2098; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2099; SSSE3-NEXT:    pand %xmm6, %xmm0
2100; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2101; SSSE3-NEXT:    por %xmm0, %xmm2
2102; SSSE3-NEXT:    pand %xmm1, %xmm2
2103; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2104; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2105; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
2106; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2107; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2108; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
2109; SSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
2110; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2111; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2112; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2113; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2114; SSSE3-NEXT:    retq
2115;
2116; SSE41-LABEL: trunc_packus_v8i64_v8i16:
2117; SSE41:       # %bb.0:
2118; SSE41-NEXT:    movdqa (%rdi), %xmm10
2119; SSE41-NEXT:    movdqa 16(%rdi), %xmm9
2120; SSE41-NEXT:    movdqa 32(%rdi), %xmm3
2121; SSE41-NEXT:    movdqa 48(%rdi), %xmm5
2122; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [65535,65535]
2123; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
2124; SSE41-NEXT:    movdqa %xmm3, %xmm0
2125; SSE41-NEXT:    pxor %xmm2, %xmm0
2126; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
2127; SSE41-NEXT:    movdqa %xmm4, %xmm7
2128; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
2129; SSE41-NEXT:    movdqa %xmm4, %xmm6
2130; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
2131; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
2132; SSE41-NEXT:    pand %xmm7, %xmm0
2133; SSE41-NEXT:    por %xmm6, %xmm0
2134; SSE41-NEXT:    movapd %xmm1, %xmm8
2135; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
2136; SSE41-NEXT:    movdqa %xmm5, %xmm0
2137; SSE41-NEXT:    pxor %xmm2, %xmm0
2138; SSE41-NEXT:    movdqa %xmm4, %xmm3
2139; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
2140; SSE41-NEXT:    movdqa %xmm4, %xmm6
2141; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
2142; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
2143; SSE41-NEXT:    pand %xmm3, %xmm0
2144; SSE41-NEXT:    por %xmm6, %xmm0
2145; SSE41-NEXT:    movapd %xmm1, %xmm6
2146; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm6
2147; SSE41-NEXT:    movdqa %xmm10, %xmm0
2148; SSE41-NEXT:    pxor %xmm2, %xmm0
2149; SSE41-NEXT:    movdqa %xmm4, %xmm3
2150; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
2151; SSE41-NEXT:    movdqa %xmm4, %xmm5
2152; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
2153; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
2154; SSE41-NEXT:    pand %xmm3, %xmm0
2155; SSE41-NEXT:    por %xmm5, %xmm0
2156; SSE41-NEXT:    movapd %xmm1, %xmm3
2157; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
2158; SSE41-NEXT:    movdqa %xmm9, %xmm0
2159; SSE41-NEXT:    pxor %xmm2, %xmm0
2160; SSE41-NEXT:    movdqa %xmm4, %xmm5
2161; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
2162; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
2163; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
2164; SSE41-NEXT:    pand %xmm5, %xmm0
2165; SSE41-NEXT:    por %xmm4, %xmm0
2166; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm1
2167; SSE41-NEXT:    pxor %xmm5, %xmm5
2168; SSE41-NEXT:    movapd %xmm1, %xmm4
2169; SSE41-NEXT:    xorpd %xmm2, %xmm4
2170; SSE41-NEXT:    movapd %xmm4, %xmm7
2171; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
2172; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
2173; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
2174; SSE41-NEXT:    pand %xmm7, %xmm0
2175; SSE41-NEXT:    por %xmm4, %xmm0
2176; SSE41-NEXT:    pxor %xmm4, %xmm4
2177; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
2178; SSE41-NEXT:    movapd %xmm3, %xmm1
2179; SSE41-NEXT:    xorpd %xmm2, %xmm1
2180; SSE41-NEXT:    movapd %xmm1, %xmm7
2181; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
2182; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
2183; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
2184; SSE41-NEXT:    pand %xmm7, %xmm0
2185; SSE41-NEXT:    por %xmm1, %xmm0
2186; SSE41-NEXT:    pxor %xmm1, %xmm1
2187; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
2188; SSE41-NEXT:    packusdw %xmm4, %xmm1
2189; SSE41-NEXT:    movapd %xmm6, %xmm3
2190; SSE41-NEXT:    xorpd %xmm2, %xmm3
2191; SSE41-NEXT:    movapd %xmm3, %xmm4
2192; SSE41-NEXT:    pcmpeqd %xmm2, %xmm4
2193; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
2194; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
2195; SSE41-NEXT:    pand %xmm4, %xmm0
2196; SSE41-NEXT:    por %xmm3, %xmm0
2197; SSE41-NEXT:    pxor %xmm3, %xmm3
2198; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
2199; SSE41-NEXT:    movapd %xmm8, %xmm4
2200; SSE41-NEXT:    xorpd %xmm2, %xmm4
2201; SSE41-NEXT:    movapd %xmm4, %xmm6
2202; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
2203; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
2204; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
2205; SSE41-NEXT:    pand %xmm6, %xmm0
2206; SSE41-NEXT:    por %xmm4, %xmm0
2207; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm5
2208; SSE41-NEXT:    packusdw %xmm3, %xmm5
2209; SSE41-NEXT:    packusdw %xmm5, %xmm1
2210; SSE41-NEXT:    movdqa %xmm1, %xmm0
2211; SSE41-NEXT:    retq
2212;
2213; AVX1-LABEL: trunc_packus_v8i64_v8i16:
2214; AVX1:       # %bb.0:
2215; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
2216; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
2217; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
2218; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
2219; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65535,65535]
2220; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
2221; AVX1-NEXT:    vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
2222; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm5
2223; AVX1-NEXT:    vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
2224; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm5
2225; AVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
2226; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
2227; AVX1-NEXT:    vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
2228; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
2229; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm5
2230; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm1
2231; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
2232; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm0
2233; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2234; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm1
2235; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
2236; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm3
2237; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
2238; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
2239; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2240; AVX1-NEXT:    retq
2241;
2242; AVX2-LABEL: trunc_packus_v8i64_v8i16:
2243; AVX2:       # %bb.0:
2244; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2245; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
2246; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
2247; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
2248; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
2249; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
2250; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
2251; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2252; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
2253; AVX2-NEXT:    vpand %ymm1, %ymm3, %ymm1
2254; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
2255; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
2256; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
2257; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2258; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2259; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2260; AVX2-NEXT:    vzeroupper
2261; AVX2-NEXT:    retq
2262;
2263; AVX512-LABEL: trunc_packus_v8i64_v8i16:
2264; AVX512:       # %bb.0:
2265; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2266; AVX512-NEXT:    vpmaxsq (%rdi), %zmm0, %zmm0
2267; AVX512-NEXT:    vpmovusqw %zmm0, %xmm0
2268; AVX512-NEXT:    vzeroupper
2269; AVX512-NEXT:    retq
2270;
2271; SKX-LABEL: trunc_packus_v8i64_v8i16:
2272; SKX:       # %bb.0:
2273; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2274; SKX-NEXT:    vpmaxsq 32(%rdi), %ymm0, %ymm1
2275; SKX-NEXT:    vpmovusqw %ymm1, %xmm1
2276; SKX-NEXT:    vpmaxsq (%rdi), %ymm0, %ymm0
2277; SKX-NEXT:    vpmovusqw %ymm0, %xmm0
2278; SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2279; SKX-NEXT:    vzeroupper
2280; SKX-NEXT:    retq
2281  %a0 = load <8 x i64>, ptr %p0
2282  %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
2283  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
2284  %3 = icmp sgt <8 x i64> %2, zeroinitializer
2285  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
2286  %5 = trunc <8 x i64> %4 to <8 x i16>
2287  ret <8 x i16> %5
2288}
2289
2290define <4 x i16> @trunc_packus_v4i32_v4i16(<4 x i32> %a0) {
2291; SSE2-LABEL: trunc_packus_v4i32_v4i16:
2292; SSE2:       # %bb.0:
2293; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
2294; SSE2-NEXT:    movdqa %xmm1, %xmm2
2295; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
2296; SSE2-NEXT:    pand %xmm2, %xmm0
2297; SSE2-NEXT:    pandn %xmm1, %xmm2
2298; SSE2-NEXT:    por %xmm0, %xmm2
2299; SSE2-NEXT:    pxor %xmm0, %xmm0
2300; SSE2-NEXT:    movdqa %xmm2, %xmm1
2301; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2302; SSE2-NEXT:    pand %xmm2, %xmm1
2303; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
2304; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2305; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2306; SSE2-NEXT:    retq
2307;
2308; SSSE3-LABEL: trunc_packus_v4i32_v4i16:
2309; SSSE3:       # %bb.0:
2310; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
2311; SSSE3-NEXT:    movdqa %xmm1, %xmm2
2312; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
2313; SSSE3-NEXT:    pand %xmm2, %xmm0
2314; SSSE3-NEXT:    pandn %xmm1, %xmm2
2315; SSSE3-NEXT:    por %xmm2, %xmm0
2316; SSSE3-NEXT:    pxor %xmm1, %xmm1
2317; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2318; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2319; SSSE3-NEXT:    pand %xmm2, %xmm0
2320; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2321; SSSE3-NEXT:    retq
2322;
2323; SSE41-LABEL: trunc_packus_v4i32_v4i16:
2324; SSE41:       # %bb.0:
2325; SSE41-NEXT:    packusdw %xmm0, %xmm0
2326; SSE41-NEXT:    retq
2327;
2328; AVX-LABEL: trunc_packus_v4i32_v4i16:
2329; AVX:       # %bb.0:
2330; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
2331; AVX-NEXT:    retq
2332;
2333; AVX512-LABEL: trunc_packus_v4i32_v4i16:
2334; AVX512:       # %bb.0:
2335; AVX512-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
2336; AVX512-NEXT:    retq
2337;
2338; SKX-LABEL: trunc_packus_v4i32_v4i16:
2339; SKX:       # %bb.0:
2340; SKX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
2341; SKX-NEXT:    retq
2342  %1 = icmp slt <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535>
2343  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
2344  %3 = icmp sgt <4 x i32> %2, zeroinitializer
2345  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
2346  %5 = trunc <4 x i32> %4 to <4 x i16>
2347  ret <4 x i16> %5
2348}
2349
2350define void @trunc_packus_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
2351; SSE2-LABEL: trunc_packus_v4i32_v4i16_store:
2352; SSE2:       # %bb.0:
2353; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
2354; SSE2-NEXT:    movdqa %xmm1, %xmm2
2355; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
2356; SSE2-NEXT:    pand %xmm2, %xmm0
2357; SSE2-NEXT:    pandn %xmm1, %xmm2
2358; SSE2-NEXT:    por %xmm0, %xmm2
2359; SSE2-NEXT:    pxor %xmm0, %xmm0
2360; SSE2-NEXT:    movdqa %xmm2, %xmm1
2361; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2362; SSE2-NEXT:    pand %xmm2, %xmm1
2363; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
2364; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2365; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2366; SSE2-NEXT:    movq %xmm0, (%rdi)
2367; SSE2-NEXT:    retq
2368;
2369; SSSE3-LABEL: trunc_packus_v4i32_v4i16_store:
2370; SSSE3:       # %bb.0:
2371; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
2372; SSSE3-NEXT:    movdqa %xmm1, %xmm2
2373; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
2374; SSSE3-NEXT:    pand %xmm2, %xmm0
2375; SSSE3-NEXT:    pandn %xmm1, %xmm2
2376; SSSE3-NEXT:    por %xmm0, %xmm2
2377; SSSE3-NEXT:    pxor %xmm0, %xmm0
2378; SSSE3-NEXT:    movdqa %xmm2, %xmm1
2379; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2380; SSSE3-NEXT:    pand %xmm2, %xmm1
2381; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
2382; SSSE3-NEXT:    movq %xmm1, (%rdi)
2383; SSSE3-NEXT:    retq
2384;
2385; SSE41-LABEL: trunc_packus_v4i32_v4i16_store:
2386; SSE41:       # %bb.0:
2387; SSE41-NEXT:    packusdw %xmm0, %xmm0
2388; SSE41-NEXT:    movq %xmm0, (%rdi)
2389; SSE41-NEXT:    retq
2390;
2391; AVX-LABEL: trunc_packus_v4i32_v4i16_store:
2392; AVX:       # %bb.0:
2393; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
2394; AVX-NEXT:    vmovq %xmm0, (%rdi)
2395; AVX-NEXT:    retq
2396;
2397; AVX512F-LABEL: trunc_packus_v4i32_v4i16_store:
2398; AVX512F:       # %bb.0:
2399; AVX512F-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
2400; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
2401; AVX512F-NEXT:    retq
2402;
2403; AVX512VL-LABEL: trunc_packus_v4i32_v4i16_store:
2404; AVX512VL:       # %bb.0:
2405; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2406; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
2407; AVX512VL-NEXT:    vpmovusdw %xmm0, (%rdi)
2408; AVX512VL-NEXT:    retq
2409;
2410; AVX512BW-LABEL: trunc_packus_v4i32_v4i16_store:
2411; AVX512BW:       # %bb.0:
2412; AVX512BW-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
2413; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
2414; AVX512BW-NEXT:    retq
2415;
2416; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i16_store:
2417; AVX512BWVL:       # %bb.0:
2418; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2419; AVX512BWVL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
2420; AVX512BWVL-NEXT:    vpmovusdw %xmm0, (%rdi)
2421; AVX512BWVL-NEXT:    retq
2422;
2423; SKX-LABEL: trunc_packus_v4i32_v4i16_store:
2424; SKX:       # %bb.0:
2425; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2426; SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
2427; SKX-NEXT:    vpmovusdw %xmm0, (%rdi)
2428; SKX-NEXT:    retq
2429  %1 = icmp slt <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535>
2430  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
2431  %3 = icmp sgt <4 x i32> %2, zeroinitializer
2432  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
2433  %5 = trunc <4 x i32> %4 to <4 x i16>
2434  store <4 x i16> %5, ptr%p1
2435  ret void
2436}
2437
2438define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
2439; SSE2-LABEL: trunc_packus_v8i32_v8i16:
2440; SSE2:       # %bb.0:
2441; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
2442; SSE2-NEXT:    movdqa %xmm2, %xmm3
2443; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2444; SSE2-NEXT:    pand %xmm3, %xmm1
2445; SSE2-NEXT:    pandn %xmm2, %xmm3
2446; SSE2-NEXT:    por %xmm1, %xmm3
2447; SSE2-NEXT:    movdqa %xmm2, %xmm1
2448; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2449; SSE2-NEXT:    pand %xmm1, %xmm0
2450; SSE2-NEXT:    pandn %xmm2, %xmm1
2451; SSE2-NEXT:    por %xmm1, %xmm0
2452; SSE2-NEXT:    pxor %xmm1, %xmm1
2453; SSE2-NEXT:    movdqa %xmm0, %xmm2
2454; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2455; SSE2-NEXT:    pand %xmm2, %xmm0
2456; SSE2-NEXT:    movdqa %xmm3, %xmm2
2457; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2458; SSE2-NEXT:    pand %xmm3, %xmm2
2459; SSE2-NEXT:    pslld $16, %xmm2
2460; SSE2-NEXT:    psrad $16, %xmm2
2461; SSE2-NEXT:    pslld $16, %xmm0
2462; SSE2-NEXT:    psrad $16, %xmm0
2463; SSE2-NEXT:    packssdw %xmm2, %xmm0
2464; SSE2-NEXT:    retq
2465;
2466; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
2467; SSSE3:       # %bb.0:
2468; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
2469; SSSE3-NEXT:    movdqa %xmm2, %xmm3
2470; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
2471; SSSE3-NEXT:    pand %xmm3, %xmm1
2472; SSSE3-NEXT:    pandn %xmm2, %xmm3
2473; SSSE3-NEXT:    por %xmm1, %xmm3
2474; SSSE3-NEXT:    movdqa %xmm2, %xmm1
2475; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2476; SSSE3-NEXT:    pand %xmm1, %xmm0
2477; SSSE3-NEXT:    pandn %xmm2, %xmm1
2478; SSSE3-NEXT:    por %xmm1, %xmm0
2479; SSSE3-NEXT:    pxor %xmm1, %xmm1
2480; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2481; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2482; SSSE3-NEXT:    pand %xmm2, %xmm0
2483; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2484; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2485; SSSE3-NEXT:    pand %xmm3, %xmm2
2486; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2487; SSSE3-NEXT:    pshufb %xmm1, %xmm2
2488; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2489; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2490; SSSE3-NEXT:    retq
2491;
2492; SSE41-LABEL: trunc_packus_v8i32_v8i16:
2493; SSE41:       # %bb.0:
2494; SSE41-NEXT:    packusdw %xmm1, %xmm0
2495; SSE41-NEXT:    retq
2496;
2497; AVX1-LABEL: trunc_packus_v8i32_v8i16:
2498; AVX1:       # %bb.0:
2499; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2500; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2501; AVX1-NEXT:    vzeroupper
2502; AVX1-NEXT:    retq
2503;
2504; AVX2-LABEL: trunc_packus_v8i32_v8i16:
2505; AVX2:       # %bb.0:
2506; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2507; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2508; AVX2-NEXT:    vzeroupper
2509; AVX2-NEXT:    retq
2510;
2511; AVX512F-LABEL: trunc_packus_v8i32_v8i16:
2512; AVX512F:       # %bb.0:
2513; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
2514; AVX512F-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2515; AVX512F-NEXT:    vzeroupper
2516; AVX512F-NEXT:    retq
2517;
2518; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
2519; AVX512VL:       # %bb.0:
2520; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2521; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
2522; AVX512VL-NEXT:    vpmovusdw %ymm0, %xmm0
2523; AVX512VL-NEXT:    vzeroupper
2524; AVX512VL-NEXT:    retq
2525;
2526; AVX512BW-LABEL: trunc_packus_v8i32_v8i16:
2527; AVX512BW:       # %bb.0:
2528; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
2529; AVX512BW-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2530; AVX512BW-NEXT:    vzeroupper
2531; AVX512BW-NEXT:    retq
2532;
2533; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
2534; AVX512BWVL:       # %bb.0:
2535; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2536; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
2537; AVX512BWVL-NEXT:    vpmovusdw %ymm0, %xmm0
2538; AVX512BWVL-NEXT:    vzeroupper
2539; AVX512BWVL-NEXT:    retq
2540;
2541; SKX-LABEL: trunc_packus_v8i32_v8i16:
2542; SKX:       # %bb.0:
2543; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2544; SKX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
2545; SKX-NEXT:    vpmovusdw %ymm0, %xmm0
2546; SKX-NEXT:    vzeroupper
2547; SKX-NEXT:    retq
2548  %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
2549  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
2550  %3 = icmp sgt <8 x i32> %2, zeroinitializer
2551  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
2552  %5 = trunc <8 x i32> %4 to <8 x i16>
2553  ret <8 x i16> %5
2554}
2555
2556define <16 x i16> @trunc_packus_v16i32_v16i16(ptr %p0) "min-legal-vector-width"="256" {
2557; SSE2-LABEL: trunc_packus_v16i32_v16i16:
2558; SSE2:       # %bb.0:
2559; SSE2-NEXT:    movdqa (%rdi), %xmm1
2560; SSE2-NEXT:    movdqa 16(%rdi), %xmm3
2561; SSE2-NEXT:    movdqa 32(%rdi), %xmm0
2562; SSE2-NEXT:    movdqa 48(%rdi), %xmm4
2563; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
2564; SSE2-NEXT:    movdqa %xmm5, %xmm2
2565; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
2566; SSE2-NEXT:    pand %xmm2, %xmm3
2567; SSE2-NEXT:    pandn %xmm5, %xmm2
2568; SSE2-NEXT:    por %xmm3, %xmm2
2569; SSE2-NEXT:    movdqa %xmm5, %xmm3
2570; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2571; SSE2-NEXT:    pand %xmm3, %xmm1
2572; SSE2-NEXT:    pandn %xmm5, %xmm3
2573; SSE2-NEXT:    por %xmm1, %xmm3
2574; SSE2-NEXT:    movdqa %xmm5, %xmm6
2575; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
2576; SSE2-NEXT:    pand %xmm6, %xmm4
2577; SSE2-NEXT:    pandn %xmm5, %xmm6
2578; SSE2-NEXT:    por %xmm4, %xmm6
2579; SSE2-NEXT:    movdqa %xmm5, %xmm4
2580; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
2581; SSE2-NEXT:    pand %xmm4, %xmm0
2582; SSE2-NEXT:    pandn %xmm5, %xmm4
2583; SSE2-NEXT:    por %xmm0, %xmm4
2584; SSE2-NEXT:    pxor %xmm5, %xmm5
2585; SSE2-NEXT:    movdqa %xmm4, %xmm1
2586; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
2587; SSE2-NEXT:    pand %xmm4, %xmm1
2588; SSE2-NEXT:    movdqa %xmm6, %xmm4
2589; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
2590; SSE2-NEXT:    pand %xmm6, %xmm4
2591; SSE2-NEXT:    movdqa %xmm3, %xmm0
2592; SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
2593; SSE2-NEXT:    pand %xmm3, %xmm0
2594; SSE2-NEXT:    movdqa %xmm2, %xmm3
2595; SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
2596; SSE2-NEXT:    pand %xmm2, %xmm3
2597; SSE2-NEXT:    pslld $16, %xmm3
2598; SSE2-NEXT:    psrad $16, %xmm3
2599; SSE2-NEXT:    pslld $16, %xmm0
2600; SSE2-NEXT:    psrad $16, %xmm0
2601; SSE2-NEXT:    packssdw %xmm3, %xmm0
2602; SSE2-NEXT:    pslld $16, %xmm4
2603; SSE2-NEXT:    psrad $16, %xmm4
2604; SSE2-NEXT:    pslld $16, %xmm1
2605; SSE2-NEXT:    psrad $16, %xmm1
2606; SSE2-NEXT:    packssdw %xmm4, %xmm1
2607; SSE2-NEXT:    retq
2608;
2609; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
2610; SSSE3:       # %bb.0:
2611; SSSE3-NEXT:    movdqa (%rdi), %xmm1
2612; SSSE3-NEXT:    movdqa 16(%rdi), %xmm3
2613; SSSE3-NEXT:    movdqa 32(%rdi), %xmm0
2614; SSSE3-NEXT:    movdqa 48(%rdi), %xmm4
2615; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
2616; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2617; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
2618; SSSE3-NEXT:    pand %xmm2, %xmm3
2619; SSSE3-NEXT:    pandn %xmm5, %xmm2
2620; SSSE3-NEXT:    por %xmm3, %xmm2
2621; SSSE3-NEXT:    movdqa %xmm5, %xmm3
2622; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
2623; SSSE3-NEXT:    pand %xmm3, %xmm1
2624; SSSE3-NEXT:    pandn %xmm5, %xmm3
2625; SSSE3-NEXT:    por %xmm1, %xmm3
2626; SSSE3-NEXT:    movdqa %xmm5, %xmm6
2627; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
2628; SSSE3-NEXT:    pand %xmm6, %xmm4
2629; SSSE3-NEXT:    pandn %xmm5, %xmm6
2630; SSSE3-NEXT:    por %xmm4, %xmm6
2631; SSSE3-NEXT:    movdqa %xmm5, %xmm4
2632; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
2633; SSSE3-NEXT:    pand %xmm4, %xmm0
2634; SSSE3-NEXT:    pandn %xmm5, %xmm4
2635; SSSE3-NEXT:    por %xmm0, %xmm4
2636; SSSE3-NEXT:    pxor %xmm5, %xmm5
2637; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2638; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm1
2639; SSSE3-NEXT:    pand %xmm4, %xmm1
2640; SSSE3-NEXT:    movdqa %xmm6, %xmm4
2641; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
2642; SSSE3-NEXT:    pand %xmm6, %xmm4
2643; SSSE3-NEXT:    movdqa %xmm3, %xmm0
2644; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
2645; SSSE3-NEXT:    pand %xmm3, %xmm0
2646; SSSE3-NEXT:    movdqa %xmm2, %xmm3
2647; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm3
2648; SSSE3-NEXT:    pand %xmm2, %xmm3
2649; SSSE3-NEXT:    pslld $16, %xmm3
2650; SSSE3-NEXT:    psrad $16, %xmm3
2651; SSSE3-NEXT:    pslld $16, %xmm0
2652; SSSE3-NEXT:    psrad $16, %xmm0
2653; SSSE3-NEXT:    packssdw %xmm3, %xmm0
2654; SSSE3-NEXT:    pslld $16, %xmm4
2655; SSSE3-NEXT:    psrad $16, %xmm4
2656; SSSE3-NEXT:    pslld $16, %xmm1
2657; SSSE3-NEXT:    psrad $16, %xmm1
2658; SSSE3-NEXT:    packssdw %xmm4, %xmm1
2659; SSSE3-NEXT:    retq
2660;
2661; SSE41-LABEL: trunc_packus_v16i32_v16i16:
2662; SSE41:       # %bb.0:
2663; SSE41-NEXT:    movdqa (%rdi), %xmm0
2664; SSE41-NEXT:    movdqa 32(%rdi), %xmm1
2665; SSE41-NEXT:    packusdw 16(%rdi), %xmm0
2666; SSE41-NEXT:    packusdw 48(%rdi), %xmm1
2667; SSE41-NEXT:    retq
2668;
2669; AVX1-LABEL: trunc_packus_v16i32_v16i16:
2670; AVX1:       # %bb.0:
2671; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
2672; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
2673; AVX1-NEXT:    vpackusdw 48(%rdi), %xmm1, %xmm1
2674; AVX1-NEXT:    vpackusdw 16(%rdi), %xmm0, %xmm0
2675; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2676; AVX1-NEXT:    retq
2677;
2678; AVX2-LABEL: trunc_packus_v16i32_v16i16:
2679; AVX2:       # %bb.0:
2680; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2681; AVX2-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
2682; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2683; AVX2-NEXT:    retq
2684;
2685; AVX512-LABEL: trunc_packus_v16i32_v16i16:
2686; AVX512:       # %bb.0:
2687; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2688; AVX512-NEXT:    vpmaxsd (%rdi), %zmm0, %zmm0
2689; AVX512-NEXT:    vpmovusdw %zmm0, %ymm0
2690; AVX512-NEXT:    retq
2691;
2692; SKX-LABEL: trunc_packus_v16i32_v16i16:
2693; SKX:       # %bb.0:
2694; SKX-NEXT:    vmovdqa (%rdi), %ymm0
2695; SKX-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
2696; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
2697; SKX-NEXT:    retq
2698  %a0 = load <16 x i32>, ptr %p0
2699  %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
2700  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
2701  %3 = icmp sgt <16 x i32> %2, zeroinitializer
2702  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2703  %5 = trunc <16 x i32> %4 to <16 x i16>
2704  ret <16 x i16> %5
2705}
2706
2707;
2708; PACKUS saturation truncation to vXi8
2709;
2710
2711define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
2712; SSE2-LABEL: trunc_packus_v2i64_v2i8:
2713; SSE2:       # %bb.0:
2714; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
2715; SSE2-NEXT:    movdqa %xmm0, %xmm2
2716; SSE2-NEXT:    pxor %xmm1, %xmm2
2717; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
2718; SSE2-NEXT:    pxor %xmm4, %xmm4
2719; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
2720; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
2721; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
2722; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
2723; SSE2-NEXT:    pand %xmm4, %xmm2
2724; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2725; SSE2-NEXT:    por %xmm2, %xmm3
2726; SSE2-NEXT:    pand %xmm3, %xmm0
2727; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2728; SSE2-NEXT:    por %xmm3, %xmm0
2729; SSE2-NEXT:    movdqa %xmm0, %xmm2
2730; SSE2-NEXT:    pxor %xmm1, %xmm2
2731; SSE2-NEXT:    movdqa %xmm2, %xmm3
2732; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2733; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
2734; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
2735; SSE2-NEXT:    pand %xmm3, %xmm1
2736; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
2737; SSE2-NEXT:    por %xmm1, %xmm2
2738; SSE2-NEXT:    pand %xmm2, %xmm0
2739; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2740; SSE2-NEXT:    packuswb %xmm0, %xmm0
2741; SSE2-NEXT:    packuswb %xmm0, %xmm0
2742; SSE2-NEXT:    packuswb %xmm0, %xmm0
2743; SSE2-NEXT:    retq
2744;
2745; SSSE3-LABEL: trunc_packus_v2i64_v2i8:
2746; SSSE3:       # %bb.0:
2747; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
2748; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2749; SSSE3-NEXT:    pxor %xmm1, %xmm2
2750; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
2751; SSSE3-NEXT:    pxor %xmm4, %xmm4
2752; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
2753; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
2754; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
2755; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
2756; SSSE3-NEXT:    pand %xmm4, %xmm2
2757; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2758; SSSE3-NEXT:    por %xmm2, %xmm3
2759; SSSE3-NEXT:    pand %xmm3, %xmm0
2760; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2761; SSSE3-NEXT:    por %xmm3, %xmm0
2762; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2763; SSSE3-NEXT:    pxor %xmm1, %xmm2
2764; SSSE3-NEXT:    movdqa %xmm2, %xmm3
2765; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
2766; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
2767; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
2768; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
2769; SSSE3-NEXT:    pand %xmm4, %xmm1
2770; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
2771; SSSE3-NEXT:    por %xmm1, %xmm2
2772; SSSE3-NEXT:    pand %xmm2, %xmm0
2773; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2774; SSSE3-NEXT:    retq
2775;
2776; SSE41-LABEL: trunc_packus_v2i64_v2i8:
2777; SSE41:       # %bb.0:
2778; SSE41-NEXT:    movdqa %xmm0, %xmm1
2779; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [255,255]
2780; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
2781; SSE41-NEXT:    pxor %xmm3, %xmm0
2782; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
2783; SSE41-NEXT:    movdqa %xmm4, %xmm5
2784; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
2785; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
2786; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
2787; SSE41-NEXT:    pand %xmm5, %xmm0
2788; SSE41-NEXT:    por %xmm4, %xmm0
2789; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
2790; SSE41-NEXT:    xorpd %xmm1, %xmm1
2791; SSE41-NEXT:    movapd %xmm2, %xmm4
2792; SSE41-NEXT:    xorpd %xmm3, %xmm4
2793; SSE41-NEXT:    movapd %xmm4, %xmm5
2794; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
2795; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
2796; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
2797; SSE41-NEXT:    pand %xmm5, %xmm0
2798; SSE41-NEXT:    por %xmm4, %xmm0
2799; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
2800; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2801; SSE41-NEXT:    movdqa %xmm1, %xmm0
2802; SSE41-NEXT:    retq
2803;
2804; AVX-LABEL: trunc_packus_v2i64_v2i8:
2805; AVX:       # %bb.0:
2806; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255]
2807; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
2808; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
2809; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2810; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
2811; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
2812; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2813; AVX-NEXT:    retq
2814;
2815; AVX512F-LABEL: trunc_packus_v2i64_v2i8:
2816; AVX512F:       # %bb.0:
2817; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2818; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2819; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
2820; AVX512F-NEXT:    vpmovusqb %zmm0, %xmm0
2821; AVX512F-NEXT:    vzeroupper
2822; AVX512F-NEXT:    retq
2823;
2824; AVX512VL-LABEL: trunc_packus_v2i64_v2i8:
2825; AVX512VL:       # %bb.0:
2826; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2827; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
2828; AVX512VL-NEXT:    vpmovusqb %xmm0, %xmm0
2829; AVX512VL-NEXT:    retq
2830;
2831; AVX512BW-LABEL: trunc_packus_v2i64_v2i8:
2832; AVX512BW:       # %bb.0:
2833; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2834; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2835; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
2836; AVX512BW-NEXT:    vpmovusqb %zmm0, %xmm0
2837; AVX512BW-NEXT:    vzeroupper
2838; AVX512BW-NEXT:    retq
2839;
2840; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i8:
2841; AVX512BWVL:       # %bb.0:
2842; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2843; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
2844; AVX512BWVL-NEXT:    vpmovusqb %xmm0, %xmm0
2845; AVX512BWVL-NEXT:    retq
2846;
2847; SKX-LABEL: trunc_packus_v2i64_v2i8:
2848; SKX:       # %bb.0:
2849; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2850; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
2851; SKX-NEXT:    vpmovusqb %xmm0, %xmm0
2852; SKX-NEXT:    retq
2853  %1 = icmp slt <2 x i64> %a0, <i64 255, i64 255>
2854  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 255, i64 255>
2855  %3 = icmp sgt <2 x i64> %2, zeroinitializer
2856  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
2857  %5 = trunc <2 x i64> %4 to <2 x i8>
2858  ret <2 x i8> %5
2859}
2860
2861define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
2862; SSE2-LABEL: trunc_packus_v2i64_v2i8_store:
2863; SSE2:       # %bb.0:
2864; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
2865; SSE2-NEXT:    movdqa %xmm0, %xmm2
2866; SSE2-NEXT:    pxor %xmm1, %xmm2
2867; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
2868; SSE2-NEXT:    pxor %xmm4, %xmm4
2869; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
2870; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
2871; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
2872; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
2873; SSE2-NEXT:    pand %xmm4, %xmm2
2874; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2875; SSE2-NEXT:    por %xmm2, %xmm3
2876; SSE2-NEXT:    pand %xmm3, %xmm0
2877; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2878; SSE2-NEXT:    por %xmm0, %xmm3
2879; SSE2-NEXT:    movdqa %xmm3, %xmm0
2880; SSE2-NEXT:    pxor %xmm1, %xmm0
2881; SSE2-NEXT:    movdqa %xmm0, %xmm2
2882; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2883; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
2884; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2885; SSE2-NEXT:    pand %xmm2, %xmm0
2886; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
2887; SSE2-NEXT:    por %xmm0, %xmm1
2888; SSE2-NEXT:    pand %xmm3, %xmm1
2889; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2890; SSE2-NEXT:    packuswb %xmm1, %xmm1
2891; SSE2-NEXT:    packuswb %xmm1, %xmm1
2892; SSE2-NEXT:    packuswb %xmm1, %xmm1
2893; SSE2-NEXT:    movd %xmm1, %eax
2894; SSE2-NEXT:    movw %ax, (%rdi)
2895; SSE2-NEXT:    retq
2896;
2897; SSSE3-LABEL: trunc_packus_v2i64_v2i8_store:
2898; SSSE3:       # %bb.0:
2899; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
2900; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2901; SSSE3-NEXT:    pxor %xmm1, %xmm2
2902; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
2903; SSSE3-NEXT:    pxor %xmm4, %xmm4
2904; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
2905; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
2906; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
2907; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
2908; SSSE3-NEXT:    pand %xmm4, %xmm2
2909; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2910; SSSE3-NEXT:    por %xmm2, %xmm3
2911; SSSE3-NEXT:    pand %xmm3, %xmm0
2912; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2913; SSSE3-NEXT:    por %xmm0, %xmm3
2914; SSSE3-NEXT:    movdqa %xmm3, %xmm0
2915; SSSE3-NEXT:    pxor %xmm1, %xmm0
2916; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2917; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2918; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
2919; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
2920; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
2921; SSSE3-NEXT:    pand %xmm4, %xmm0
2922; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
2923; SSSE3-NEXT:    por %xmm0, %xmm1
2924; SSSE3-NEXT:    pand %xmm3, %xmm1
2925; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2926; SSSE3-NEXT:    movd %xmm1, %eax
2927; SSSE3-NEXT:    movw %ax, (%rdi)
2928; SSSE3-NEXT:    retq
2929;
2930; SSE41-LABEL: trunc_packus_v2i64_v2i8_store:
2931; SSE41:       # %bb.0:
2932; SSE41-NEXT:    movdqa %xmm0, %xmm1
2933; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [255,255]
2934; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
2935; SSE41-NEXT:    pxor %xmm3, %xmm0
2936; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
2937; SSE41-NEXT:    movdqa %xmm4, %xmm5
2938; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
2939; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
2940; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
2941; SSE41-NEXT:    pand %xmm5, %xmm0
2942; SSE41-NEXT:    por %xmm4, %xmm0
2943; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
2944; SSE41-NEXT:    xorpd %xmm1, %xmm1
2945; SSE41-NEXT:    movapd %xmm2, %xmm4
2946; SSE41-NEXT:    xorpd %xmm3, %xmm4
2947; SSE41-NEXT:    movapd %xmm4, %xmm5
2948; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
2949; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
2950; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
2951; SSE41-NEXT:    pand %xmm5, %xmm0
2952; SSE41-NEXT:    por %xmm4, %xmm0
2953; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
2954; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2955; SSE41-NEXT:    pextrw $0, %xmm1, (%rdi)
2956; SSE41-NEXT:    retq
2957;
2958; AVX-LABEL: trunc_packus_v2i64_v2i8_store:
2959; AVX:       # %bb.0:
2960; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255]
2961; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
2962; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
2963; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2964; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
2965; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
2966; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2967; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
2968; AVX-NEXT:    retq
2969;
2970; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store:
2971; AVX512F:       # %bb.0:
2972; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2973; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2974; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
2975; AVX512F-NEXT:    vpmovusqb %zmm0, %xmm0
2976; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rdi)
2977; AVX512F-NEXT:    vzeroupper
2978; AVX512F-NEXT:    retq
2979;
2980; AVX512VL-LABEL: trunc_packus_v2i64_v2i8_store:
2981; AVX512VL:       # %bb.0:
2982; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2983; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
2984; AVX512VL-NEXT:    vpmovusqb %xmm0, (%rdi)
2985; AVX512VL-NEXT:    retq
2986;
2987; AVX512BW-LABEL: trunc_packus_v2i64_v2i8_store:
2988; AVX512BW:       # %bb.0:
2989; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2990; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2991; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
2992; AVX512BW-NEXT:    vpmovusqb %zmm0, %xmm0
2993; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rdi)
2994; AVX512BW-NEXT:    vzeroupper
2995; AVX512BW-NEXT:    retq
2996;
2997; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i8_store:
2998; AVX512BWVL:       # %bb.0:
2999; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3000; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
3001; AVX512BWVL-NEXT:    vpmovusqb %xmm0, (%rdi)
3002; AVX512BWVL-NEXT:    retq
3003;
3004; SKX-LABEL: trunc_packus_v2i64_v2i8_store:
3005; SKX:       # %bb.0:
3006; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3007; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
3008; SKX-NEXT:    vpmovusqb %xmm0, (%rdi)
3009; SKX-NEXT:    retq
3010  %1 = icmp slt <2 x i64> %a0, <i64 255, i64 255>
3011  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 255, i64 255>
3012  %3 = icmp sgt <2 x i64> %2, zeroinitializer
3013  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
3014  %5 = trunc <2 x i64> %4 to <2 x i8>
3015  store <2 x i8> %5, ptr%p1
3016  ret void
3017}
3018
3019define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
3020; SSE2-LABEL: trunc_packus_v4i64_v4i8:
3021; SSE2:       # %bb.0:
3022; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
3023; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
3024; SSE2-NEXT:    movdqa %xmm1, %xmm4
3025; SSE2-NEXT:    pxor %xmm3, %xmm4
3026; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
3027; SSE2-NEXT:    pxor %xmm9, %xmm9
3028; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
3029; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483903,2147483903]
3030; SSE2-NEXT:    movdqa %xmm2, %xmm7
3031; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
3032; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
3033; SSE2-NEXT:    pand %xmm5, %xmm6
3034; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
3035; SSE2-NEXT:    por %xmm6, %xmm4
3036; SSE2-NEXT:    pand %xmm4, %xmm1
3037; SSE2-NEXT:    pandn %xmm8, %xmm4
3038; SSE2-NEXT:    por %xmm1, %xmm4
3039; SSE2-NEXT:    movdqa %xmm0, %xmm1
3040; SSE2-NEXT:    pxor %xmm3, %xmm1
3041; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
3042; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
3043; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3044; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2]
3045; SSE2-NEXT:    pand %xmm5, %xmm1
3046; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3047; SSE2-NEXT:    por %xmm1, %xmm2
3048; SSE2-NEXT:    pand %xmm2, %xmm0
3049; SSE2-NEXT:    pandn %xmm8, %xmm2
3050; SSE2-NEXT:    por %xmm2, %xmm0
3051; SSE2-NEXT:    movdqa %xmm0, %xmm1
3052; SSE2-NEXT:    pxor %xmm3, %xmm1
3053; SSE2-NEXT:    movdqa %xmm1, %xmm2
3054; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
3055; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
3056; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3057; SSE2-NEXT:    pand %xmm2, %xmm1
3058; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3059; SSE2-NEXT:    por %xmm1, %xmm2
3060; SSE2-NEXT:    movdqa %xmm4, %xmm1
3061; SSE2-NEXT:    pxor %xmm3, %xmm1
3062; SSE2-NEXT:    movdqa %xmm1, %xmm5
3063; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
3064; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
3065; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3066; SSE2-NEXT:    pand %xmm5, %xmm1
3067; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
3068; SSE2-NEXT:    por %xmm1, %xmm3
3069; SSE2-NEXT:    pand %xmm8, %xmm3
3070; SSE2-NEXT:    pand %xmm4, %xmm3
3071; SSE2-NEXT:    pand %xmm8, %xmm2
3072; SSE2-NEXT:    pand %xmm2, %xmm0
3073; SSE2-NEXT:    packuswb %xmm3, %xmm0
3074; SSE2-NEXT:    packuswb %xmm0, %xmm0
3075; SSE2-NEXT:    packuswb %xmm0, %xmm0
3076; SSE2-NEXT:    retq
3077;
3078; SSSE3-LABEL: trunc_packus_v4i64_v4i8:
3079; SSSE3:       # %bb.0:
3080; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
3081; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
3082; SSSE3-NEXT:    movdqa %xmm1, %xmm3
3083; SSSE3-NEXT:    pxor %xmm2, %xmm3
3084; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
3085; SSSE3-NEXT:    pxor %xmm9, %xmm9
3086; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
3087; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
3088; SSSE3-NEXT:    movdqa %xmm4, %xmm7
3089; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
3090; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
3091; SSSE3-NEXT:    pand %xmm5, %xmm6
3092; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
3093; SSSE3-NEXT:    por %xmm6, %xmm3
3094; SSSE3-NEXT:    pand %xmm3, %xmm1
3095; SSSE3-NEXT:    pandn %xmm8, %xmm3
3096; SSSE3-NEXT:    por %xmm1, %xmm3
3097; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3098; SSSE3-NEXT:    pxor %xmm2, %xmm1
3099; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
3100; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
3101; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
3102; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
3103; SSSE3-NEXT:    pand %xmm5, %xmm1
3104; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
3105; SSSE3-NEXT:    por %xmm1, %xmm4
3106; SSSE3-NEXT:    pand %xmm4, %xmm0
3107; SSSE3-NEXT:    pandn %xmm8, %xmm4
3108; SSSE3-NEXT:    por %xmm4, %xmm0
3109; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3110; SSSE3-NEXT:    pxor %xmm2, %xmm1
3111; SSSE3-NEXT:    movdqa %xmm1, %xmm4
3112; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
3113; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
3114; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
3115; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3116; SSSE3-NEXT:    pand %xmm5, %xmm1
3117; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
3118; SSSE3-NEXT:    por %xmm1, %xmm4
3119; SSSE3-NEXT:    pand %xmm4, %xmm0
3120; SSSE3-NEXT:    movdqa %xmm3, %xmm1
3121; SSSE3-NEXT:    pxor %xmm2, %xmm1
3122; SSSE3-NEXT:    movdqa %xmm1, %xmm4
3123; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
3124; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
3125; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm1
3126; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3127; SSSE3-NEXT:    pand %xmm5, %xmm1
3128; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
3129; SSSE3-NEXT:    por %xmm1, %xmm2
3130; SSSE3-NEXT:    pand %xmm3, %xmm2
3131; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3132; SSSE3-NEXT:    pshufb %xmm1, %xmm2
3133; SSSE3-NEXT:    pshufb %xmm1, %xmm0
3134; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3135; SSSE3-NEXT:    retq
3136;
3137; SSE41-LABEL: trunc_packus_v4i64_v4i8:
3138; SSE41:       # %bb.0:
3139; SSE41-NEXT:    movdqa %xmm0, %xmm2
3140; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [255,255]
3141; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
3142; SSE41-NEXT:    movdqa %xmm1, %xmm0
3143; SSE41-NEXT:    pxor %xmm3, %xmm0
3144; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
3145; SSE41-NEXT:    movdqa %xmm6, %xmm5
3146; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
3147; SSE41-NEXT:    movdqa %xmm6, %xmm7
3148; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
3149; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
3150; SSE41-NEXT:    pand %xmm5, %xmm0
3151; SSE41-NEXT:    por %xmm7, %xmm0
3152; SSE41-NEXT:    movapd %xmm4, %xmm5
3153; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
3154; SSE41-NEXT:    movdqa %xmm2, %xmm0
3155; SSE41-NEXT:    pxor %xmm3, %xmm0
3156; SSE41-NEXT:    movdqa %xmm6, %xmm1
3157; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
3158; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
3159; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
3160; SSE41-NEXT:    pand %xmm1, %xmm0
3161; SSE41-NEXT:    por %xmm6, %xmm0
3162; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
3163; SSE41-NEXT:    xorpd %xmm2, %xmm2
3164; SSE41-NEXT:    movapd %xmm4, %xmm1
3165; SSE41-NEXT:    xorpd %xmm3, %xmm1
3166; SSE41-NEXT:    movapd %xmm1, %xmm6
3167; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
3168; SSE41-NEXT:    pcmpgtd %xmm3, %xmm1
3169; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
3170; SSE41-NEXT:    pand %xmm6, %xmm0
3171; SSE41-NEXT:    por %xmm1, %xmm0
3172; SSE41-NEXT:    pxor %xmm1, %xmm1
3173; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
3174; SSE41-NEXT:    movapd %xmm5, %xmm4
3175; SSE41-NEXT:    xorpd %xmm3, %xmm4
3176; SSE41-NEXT:    movapd %xmm4, %xmm6
3177; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
3178; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
3179; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
3180; SSE41-NEXT:    pand %xmm6, %xmm0
3181; SSE41-NEXT:    por %xmm4, %xmm0
3182; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
3183; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3184; SSE41-NEXT:    pshufb %xmm0, %xmm2
3185; SSE41-NEXT:    pshufb %xmm0, %xmm1
3186; SSE41-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3187; SSE41-NEXT:    movdqa %xmm1, %xmm0
3188; SSE41-NEXT:    retq
3189;
3190; AVX1-LABEL: trunc_packus_v4i64_v4i8:
3191; AVX1:       # %bb.0:
3192; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3193; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255]
3194; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
3195; AVX1-NEXT:    vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
3196; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
3197; AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
3198; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3199; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm3
3200; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
3201; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
3202; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
3203; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3204; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3205; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3206; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3207; AVX1-NEXT:    vzeroupper
3208; AVX1-NEXT:    retq
3209;
3210; AVX2-LABEL: trunc_packus_v4i64_v4i8:
3211; AVX2:       # %bb.0:
3212; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255]
3213; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
3214; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
3215; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3216; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
3217; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
3218; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3219; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3220; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3221; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3222; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3223; AVX2-NEXT:    vzeroupper
3224; AVX2-NEXT:    retq
3225;
3226; AVX512F-LABEL: trunc_packus_v4i64_v4i8:
3227; AVX512F:       # %bb.0:
3228; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3229; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3230; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
3231; AVX512F-NEXT:    vpmovusqb %zmm0, %xmm0
3232; AVX512F-NEXT:    vzeroupper
3233; AVX512F-NEXT:    retq
3234;
3235; AVX512VL-LABEL: trunc_packus_v4i64_v4i8:
3236; AVX512VL:       # %bb.0:
3237; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3238; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
3239; AVX512VL-NEXT:    vpmovusqb %ymm0, %xmm0
3240; AVX512VL-NEXT:    vzeroupper
3241; AVX512VL-NEXT:    retq
3242;
3243; AVX512BW-LABEL: trunc_packus_v4i64_v4i8:
3244; AVX512BW:       # %bb.0:
3245; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3246; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3247; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
3248; AVX512BW-NEXT:    vpmovusqb %zmm0, %xmm0
3249; AVX512BW-NEXT:    vzeroupper
3250; AVX512BW-NEXT:    retq
3251;
3252; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i8:
3253; AVX512BWVL:       # %bb.0:
3254; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3255; AVX512BWVL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
3256; AVX512BWVL-NEXT:    vpmovusqb %ymm0, %xmm0
3257; AVX512BWVL-NEXT:    vzeroupper
3258; AVX512BWVL-NEXT:    retq
3259;
3260; SKX-LABEL: trunc_packus_v4i64_v4i8:
3261; SKX:       # %bb.0:
3262; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3263; SKX-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
3264; SKX-NEXT:    vpmovusqb %ymm0, %xmm0
3265; SKX-NEXT:    vzeroupper
3266; SKX-NEXT:    retq
3267  %1 = icmp slt <4 x i64> %a0, <i64 255, i64 255, i64 255, i64 255>
3268  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 255, i64 255, i64 255, i64 255>
3269  %3 = icmp sgt <4 x i64> %2, zeroinitializer
3270  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
3271  %5 = trunc <4 x i64> %4 to <4 x i8>
3272  ret <4 x i8> %5
3273}
3274
3275define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
3276; SSE2-LABEL: trunc_packus_v4i64_v4i8_store:
3277; SSE2:       # %bb.0:
3278; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
3279; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
3280; SSE2-NEXT:    movdqa %xmm1, %xmm4
3281; SSE2-NEXT:    pxor %xmm3, %xmm4
3282; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
3283; SSE2-NEXT:    pxor %xmm9, %xmm9
3284; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
3285; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483903,2147483903]
3286; SSE2-NEXT:    movdqa %xmm2, %xmm7
3287; SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
3288; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
3289; SSE2-NEXT:    pand %xmm5, %xmm6
3290; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
3291; SSE2-NEXT:    por %xmm6, %xmm4
3292; SSE2-NEXT:    pand %xmm4, %xmm1
3293; SSE2-NEXT:    pandn %xmm8, %xmm4
3294; SSE2-NEXT:    por %xmm1, %xmm4
3295; SSE2-NEXT:    movdqa %xmm0, %xmm1
3296; SSE2-NEXT:    pxor %xmm3, %xmm1
3297; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
3298; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
3299; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3300; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
3301; SSE2-NEXT:    pand %xmm5, %xmm6
3302; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
3303; SSE2-NEXT:    por %xmm6, %xmm1
3304; SSE2-NEXT:    pand %xmm1, %xmm0
3305; SSE2-NEXT:    pandn %xmm8, %xmm1
3306; SSE2-NEXT:    por %xmm0, %xmm1
3307; SSE2-NEXT:    movdqa %xmm1, %xmm0
3308; SSE2-NEXT:    pxor %xmm3, %xmm0
3309; SSE2-NEXT:    movdqa %xmm0, %xmm2
3310; SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
3311; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
3312; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3313; SSE2-NEXT:    pand %xmm2, %xmm0
3314; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3315; SSE2-NEXT:    por %xmm0, %xmm2
3316; SSE2-NEXT:    movdqa %xmm4, %xmm0
3317; SSE2-NEXT:    pxor %xmm3, %xmm0
3318; SSE2-NEXT:    movdqa %xmm0, %xmm5
3319; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
3320; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
3321; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3322; SSE2-NEXT:    pand %xmm5, %xmm0
3323; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
3324; SSE2-NEXT:    por %xmm0, %xmm3
3325; SSE2-NEXT:    pand %xmm8, %xmm3
3326; SSE2-NEXT:    pand %xmm4, %xmm3
3327; SSE2-NEXT:    pand %xmm8, %xmm2
3328; SSE2-NEXT:    pand %xmm1, %xmm2
3329; SSE2-NEXT:    packuswb %xmm3, %xmm2
3330; SSE2-NEXT:    packuswb %xmm2, %xmm2
3331; SSE2-NEXT:    packuswb %xmm2, %xmm2
3332; SSE2-NEXT:    movd %xmm2, (%rdi)
3333; SSE2-NEXT:    retq
3334;
3335; SSSE3-LABEL: trunc_packus_v4i64_v4i8_store:
3336; SSSE3:       # %bb.0:
3337; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
3338; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
3339; SSSE3-NEXT:    movdqa %xmm1, %xmm3
3340; SSSE3-NEXT:    pxor %xmm2, %xmm3
3341; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
3342; SSSE3-NEXT:    pxor %xmm9, %xmm9
3343; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
3344; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
3345; SSSE3-NEXT:    movdqa %xmm4, %xmm7
3346; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
3347; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
3348; SSSE3-NEXT:    pand %xmm5, %xmm6
3349; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
3350; SSSE3-NEXT:    por %xmm6, %xmm3
3351; SSSE3-NEXT:    pand %xmm3, %xmm1
3352; SSSE3-NEXT:    pandn %xmm8, %xmm3
3353; SSSE3-NEXT:    por %xmm1, %xmm3
3354; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3355; SSSE3-NEXT:    pxor %xmm2, %xmm1
3356; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
3357; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
3358; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
3359; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
3360; SSSE3-NEXT:    pand %xmm5, %xmm1
3361; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
3362; SSSE3-NEXT:    por %xmm1, %xmm4
3363; SSSE3-NEXT:    pand %xmm4, %xmm0
3364; SSSE3-NEXT:    pandn %xmm8, %xmm4
3365; SSSE3-NEXT:    por %xmm0, %xmm4
3366; SSSE3-NEXT:    movdqa %xmm4, %xmm0
3367; SSSE3-NEXT:    pxor %xmm2, %xmm0
3368; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3369; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm1
3370; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
3371; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
3372; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3373; SSSE3-NEXT:    pand %xmm5, %xmm0
3374; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3375; SSSE3-NEXT:    por %xmm0, %xmm1
3376; SSSE3-NEXT:    pand %xmm4, %xmm1
3377; SSSE3-NEXT:    movdqa %xmm3, %xmm0
3378; SSSE3-NEXT:    pxor %xmm2, %xmm0
3379; SSSE3-NEXT:    movdqa %xmm0, %xmm4
3380; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm4
3381; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
3382; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
3383; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3384; SSSE3-NEXT:    pand %xmm5, %xmm0
3385; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
3386; SSSE3-NEXT:    por %xmm0, %xmm2
3387; SSSE3-NEXT:    pand %xmm3, %xmm2
3388; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3389; SSSE3-NEXT:    pshufb %xmm0, %xmm2
3390; SSSE3-NEXT:    pshufb %xmm0, %xmm1
3391; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3392; SSSE3-NEXT:    movd %xmm1, (%rdi)
3393; SSSE3-NEXT:    retq
3394;
3395; SSE41-LABEL: trunc_packus_v4i64_v4i8_store:
3396; SSE41:       # %bb.0:
3397; SSE41-NEXT:    movdqa %xmm0, %xmm2
3398; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [255,255]
3399; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
3400; SSE41-NEXT:    movdqa %xmm1, %xmm0
3401; SSE41-NEXT:    pxor %xmm3, %xmm0
3402; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
3403; SSE41-NEXT:    movdqa %xmm6, %xmm5
3404; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
3405; SSE41-NEXT:    movdqa %xmm6, %xmm7
3406; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
3407; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
3408; SSE41-NEXT:    pand %xmm5, %xmm0
3409; SSE41-NEXT:    por %xmm7, %xmm0
3410; SSE41-NEXT:    movapd %xmm4, %xmm5
3411; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
3412; SSE41-NEXT:    movdqa %xmm2, %xmm0
3413; SSE41-NEXT:    pxor %xmm3, %xmm0
3414; SSE41-NEXT:    movdqa %xmm6, %xmm1
3415; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
3416; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
3417; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
3418; SSE41-NEXT:    pand %xmm1, %xmm0
3419; SSE41-NEXT:    por %xmm6, %xmm0
3420; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
3421; SSE41-NEXT:    pxor %xmm1, %xmm1
3422; SSE41-NEXT:    movapd %xmm4, %xmm2
3423; SSE41-NEXT:    xorpd %xmm3, %xmm2
3424; SSE41-NEXT:    movapd %xmm2, %xmm6
3425; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
3426; SSE41-NEXT:    pcmpgtd %xmm3, %xmm2
3427; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
3428; SSE41-NEXT:    pand %xmm6, %xmm0
3429; SSE41-NEXT:    por %xmm2, %xmm0
3430; SSE41-NEXT:    pxor %xmm2, %xmm2
3431; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
3432; SSE41-NEXT:    movapd %xmm5, %xmm4
3433; SSE41-NEXT:    xorpd %xmm3, %xmm4
3434; SSE41-NEXT:    movapd %xmm4, %xmm6
3435; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
3436; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
3437; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
3438; SSE41-NEXT:    pand %xmm6, %xmm0
3439; SSE41-NEXT:    por %xmm4, %xmm0
3440; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
3441; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3442; SSE41-NEXT:    pshufb %xmm0, %xmm1
3443; SSE41-NEXT:    pshufb %xmm0, %xmm2
3444; SSE41-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3445; SSE41-NEXT:    movd %xmm2, (%rdi)
3446; SSE41-NEXT:    retq
3447;
3448; AVX1-LABEL: trunc_packus_v4i64_v4i8_store:
3449; AVX1:       # %bb.0:
3450; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3451; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255]
3452; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
3453; AVX1-NEXT:    vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
3454; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
3455; AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
3456; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3457; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm3
3458; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
3459; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm2
3460; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
3461; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3462; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3463; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3464; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3465; AVX1-NEXT:    vmovd %xmm0, (%rdi)
3466; AVX1-NEXT:    vzeroupper
3467; AVX1-NEXT:    retq
3468;
3469; AVX2-LABEL: trunc_packus_v4i64_v4i8_store:
3470; AVX2:       # %bb.0:
3471; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255]
3472; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
3473; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
3474; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3475; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm1
3476; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
3477; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3478; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3479; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3480; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3481; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3482; AVX2-NEXT:    vmovd %xmm0, (%rdi)
3483; AVX2-NEXT:    vzeroupper
3484; AVX2-NEXT:    retq
3485;
3486; AVX512F-LABEL: trunc_packus_v4i64_v4i8_store:
3487; AVX512F:       # %bb.0:
3488; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3489; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3490; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
3491; AVX512F-NEXT:    vpmovusqb %zmm0, %xmm0
3492; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
3493; AVX512F-NEXT:    vzeroupper
3494; AVX512F-NEXT:    retq
3495;
3496; AVX512VL-LABEL: trunc_packus_v4i64_v4i8_store:
3497; AVX512VL:       # %bb.0:
3498; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3499; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
3500; AVX512VL-NEXT:    vpmovusqb %ymm0, (%rdi)
3501; AVX512VL-NEXT:    vzeroupper
3502; AVX512VL-NEXT:    retq
3503;
3504; AVX512BW-LABEL: trunc_packus_v4i64_v4i8_store:
3505; AVX512BW:       # %bb.0:
3506; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3507; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3508; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
3509; AVX512BW-NEXT:    vpmovusqb %zmm0, %xmm0
3510; AVX512BW-NEXT:    vmovd %xmm0, (%rdi)
3511; AVX512BW-NEXT:    vzeroupper
3512; AVX512BW-NEXT:    retq
3513;
3514; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i8_store:
3515; AVX512BWVL:       # %bb.0:
3516; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3517; AVX512BWVL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
3518; AVX512BWVL-NEXT:    vpmovusqb %ymm0, (%rdi)
3519; AVX512BWVL-NEXT:    vzeroupper
3520; AVX512BWVL-NEXT:    retq
3521;
3522; SKX-LABEL: trunc_packus_v4i64_v4i8_store:
3523; SKX:       # %bb.0:
3524; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3525; SKX-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
3526; SKX-NEXT:    vpmovusqb %ymm0, (%rdi)
3527; SKX-NEXT:    vzeroupper
3528; SKX-NEXT:    retq
3529  %1 = icmp slt <4 x i64> %a0, <i64 255, i64 255, i64 255, i64 255>
3530  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 255, i64 255, i64 255, i64 255>
3531  %3 = icmp sgt <4 x i64> %2, zeroinitializer
3532  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
3533  %5 = trunc <4 x i64> %4 to <4 x i8>
3534  store <4 x i8> %5, ptr%p1
3535  ret void
3536}
3537
3538define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
3539; SSE2-LABEL: trunc_packus_v8i64_v8i8:
3540; SSE2:       # %bb.0:
3541; SSE2-NEXT:    movdqa (%rdi), %xmm5
3542; SSE2-NEXT:    movdqa 16(%rdi), %xmm10
3543; SSE2-NEXT:    movdqa 32(%rdi), %xmm3
3544; SSE2-NEXT:    movdqa 48(%rdi), %xmm4
3545; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
3546; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
3547; SSE2-NEXT:    movdqa %xmm3, %xmm2
3548; SSE2-NEXT:    pxor %xmm11, %xmm2
3549; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
3550; SSE2-NEXT:    pxor %xmm9, %xmm9
3551; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
3552; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [2147483903,2147483903]
3553; SSE2-NEXT:    movdqa %xmm0, %xmm6
3554; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
3555; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
3556; SSE2-NEXT:    pand %xmm7, %xmm1
3557; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
3558; SSE2-NEXT:    por %xmm1, %xmm2
3559; SSE2-NEXT:    pand %xmm2, %xmm3
3560; SSE2-NEXT:    pandn %xmm8, %xmm2
3561; SSE2-NEXT:    por %xmm3, %xmm2
3562; SSE2-NEXT:    movdqa %xmm4, %xmm1
3563; SSE2-NEXT:    pxor %xmm11, %xmm1
3564; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
3565; SSE2-NEXT:    pcmpeqd %xmm9, %xmm3
3566; SSE2-NEXT:    movdqa %xmm0, %xmm6
3567; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
3568; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
3569; SSE2-NEXT:    pand %xmm3, %xmm1
3570; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
3571; SSE2-NEXT:    por %xmm1, %xmm3
3572; SSE2-NEXT:    pand %xmm3, %xmm4
3573; SSE2-NEXT:    pandn %xmm8, %xmm3
3574; SSE2-NEXT:    por %xmm4, %xmm3
3575; SSE2-NEXT:    movdqa %xmm5, %xmm1
3576; SSE2-NEXT:    pxor %xmm11, %xmm1
3577; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
3578; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
3579; SSE2-NEXT:    movdqa %xmm0, %xmm6
3580; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
3581; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
3582; SSE2-NEXT:    pand %xmm4, %xmm1
3583; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
3584; SSE2-NEXT:    por %xmm1, %xmm4
3585; SSE2-NEXT:    pand %xmm4, %xmm5
3586; SSE2-NEXT:    pandn %xmm8, %xmm4
3587; SSE2-NEXT:    por %xmm5, %xmm4
3588; SSE2-NEXT:    movdqa %xmm10, %xmm1
3589; SSE2-NEXT:    pxor %xmm11, %xmm1
3590; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
3591; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
3592; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
3593; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
3594; SSE2-NEXT:    pand %xmm5, %xmm1
3595; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3596; SSE2-NEXT:    por %xmm1, %xmm0
3597; SSE2-NEXT:    pand %xmm0, %xmm10
3598; SSE2-NEXT:    pandn %xmm8, %xmm0
3599; SSE2-NEXT:    por %xmm10, %xmm0
3600; SSE2-NEXT:    movdqa %xmm0, %xmm1
3601; SSE2-NEXT:    pxor %xmm11, %xmm1
3602; SSE2-NEXT:    movdqa %xmm1, %xmm5
3603; SSE2-NEXT:    pcmpgtd %xmm11, %xmm5
3604; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
3605; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
3606; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3607; SSE2-NEXT:    pand %xmm6, %xmm1
3608; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
3609; SSE2-NEXT:    por %xmm1, %xmm5
3610; SSE2-NEXT:    pand %xmm0, %xmm5
3611; SSE2-NEXT:    movdqa %xmm4, %xmm0
3612; SSE2-NEXT:    pxor %xmm11, %xmm0
3613; SSE2-NEXT:    movdqa %xmm0, %xmm1
3614; SSE2-NEXT:    pcmpgtd %xmm11, %xmm1
3615; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
3616; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
3617; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
3618; SSE2-NEXT:    pand %xmm6, %xmm7
3619; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
3620; SSE2-NEXT:    por %xmm7, %xmm0
3621; SSE2-NEXT:    pand %xmm4, %xmm0
3622; SSE2-NEXT:    packuswb %xmm5, %xmm0
3623; SSE2-NEXT:    movdqa %xmm3, %xmm1
3624; SSE2-NEXT:    pxor %xmm11, %xmm1
3625; SSE2-NEXT:    movdqa %xmm1, %xmm4
3626; SSE2-NEXT:    pcmpgtd %xmm11, %xmm4
3627; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
3628; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
3629; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3630; SSE2-NEXT:    pand %xmm5, %xmm1
3631; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
3632; SSE2-NEXT:    por %xmm1, %xmm4
3633; SSE2-NEXT:    pand %xmm3, %xmm4
3634; SSE2-NEXT:    movdqa %xmm2, %xmm1
3635; SSE2-NEXT:    pxor %xmm11, %xmm1
3636; SSE2-NEXT:    movdqa %xmm1, %xmm3
3637; SSE2-NEXT:    pcmpgtd %xmm11, %xmm3
3638; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
3639; SSE2-NEXT:    pcmpeqd %xmm11, %xmm1
3640; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3641; SSE2-NEXT:    pand %xmm5, %xmm1
3642; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3643; SSE2-NEXT:    por %xmm1, %xmm3
3644; SSE2-NEXT:    pand %xmm2, %xmm3
3645; SSE2-NEXT:    packuswb %xmm4, %xmm3
3646; SSE2-NEXT:    packuswb %xmm3, %xmm0
3647; SSE2-NEXT:    packuswb %xmm0, %xmm0
3648; SSE2-NEXT:    retq
3649;
3650; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
3651; SSSE3:       # %bb.0:
3652; SSSE3-NEXT:    movdqa (%rdi), %xmm5
3653; SSSE3-NEXT:    movdqa 16(%rdi), %xmm10
3654; SSSE3-NEXT:    movdqa 32(%rdi), %xmm3
3655; SSSE3-NEXT:    movdqa 48(%rdi), %xmm4
3656; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
3657; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
3658; SSSE3-NEXT:    movdqa %xmm3, %xmm2
3659; SSSE3-NEXT:    pxor %xmm11, %xmm2
3660; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
3661; SSSE3-NEXT:    pxor %xmm9, %xmm9
3662; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
3663; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483903,2147483903]
3664; SSSE3-NEXT:    movdqa %xmm0, %xmm6
3665; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
3666; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
3667; SSSE3-NEXT:    pand %xmm7, %xmm1
3668; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
3669; SSSE3-NEXT:    por %xmm1, %xmm2
3670; SSSE3-NEXT:    pand %xmm2, %xmm3
3671; SSSE3-NEXT:    pandn %xmm8, %xmm2
3672; SSSE3-NEXT:    por %xmm3, %xmm2
3673; SSSE3-NEXT:    movdqa %xmm4, %xmm1
3674; SSSE3-NEXT:    pxor %xmm11, %xmm1
3675; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
3676; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm3
3677; SSSE3-NEXT:    movdqa %xmm0, %xmm6
3678; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
3679; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
3680; SSSE3-NEXT:    pand %xmm3, %xmm1
3681; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
3682; SSSE3-NEXT:    por %xmm1, %xmm3
3683; SSSE3-NEXT:    pand %xmm3, %xmm4
3684; SSSE3-NEXT:    pandn %xmm8, %xmm3
3685; SSSE3-NEXT:    por %xmm4, %xmm3
3686; SSSE3-NEXT:    movdqa %xmm5, %xmm1
3687; SSSE3-NEXT:    pxor %xmm11, %xmm1
3688; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
3689; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
3690; SSSE3-NEXT:    movdqa %xmm0, %xmm6
3691; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
3692; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
3693; SSSE3-NEXT:    pand %xmm4, %xmm1
3694; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
3695; SSSE3-NEXT:    por %xmm1, %xmm4
3696; SSSE3-NEXT:    pand %xmm4, %xmm5
3697; SSSE3-NEXT:    pandn %xmm8, %xmm4
3698; SSSE3-NEXT:    por %xmm5, %xmm4
3699; SSSE3-NEXT:    movdqa %xmm10, %xmm1
3700; SSSE3-NEXT:    pxor %xmm11, %xmm1
3701; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
3702; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
3703; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
3704; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
3705; SSSE3-NEXT:    pand %xmm5, %xmm1
3706; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3707; SSSE3-NEXT:    por %xmm1, %xmm0
3708; SSSE3-NEXT:    pand %xmm0, %xmm10
3709; SSSE3-NEXT:    pandn %xmm8, %xmm0
3710; SSSE3-NEXT:    por %xmm10, %xmm0
3711; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3712; SSSE3-NEXT:    pxor %xmm11, %xmm1
3713; SSSE3-NEXT:    movdqa %xmm1, %xmm5
3714; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm5
3715; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
3716; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
3717; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3718; SSSE3-NEXT:    pand %xmm6, %xmm1
3719; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
3720; SSSE3-NEXT:    por %xmm1, %xmm5
3721; SSSE3-NEXT:    pand %xmm0, %xmm5
3722; SSSE3-NEXT:    movdqa %xmm4, %xmm0
3723; SSSE3-NEXT:    pxor %xmm11, %xmm0
3724; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3725; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm1
3726; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
3727; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
3728; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
3729; SSSE3-NEXT:    pand %xmm6, %xmm7
3730; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
3731; SSSE3-NEXT:    por %xmm7, %xmm0
3732; SSSE3-NEXT:    pand %xmm4, %xmm0
3733; SSSE3-NEXT:    packuswb %xmm5, %xmm0
3734; SSSE3-NEXT:    movdqa %xmm3, %xmm1
3735; SSSE3-NEXT:    pxor %xmm11, %xmm1
3736; SSSE3-NEXT:    movdqa %xmm1, %xmm4
3737; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm4
3738; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
3739; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
3740; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3741; SSSE3-NEXT:    pand %xmm5, %xmm1
3742; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
3743; SSSE3-NEXT:    por %xmm1, %xmm4
3744; SSSE3-NEXT:    pand %xmm3, %xmm4
3745; SSSE3-NEXT:    movdqa %xmm2, %xmm1
3746; SSSE3-NEXT:    pxor %xmm11, %xmm1
3747; SSSE3-NEXT:    movdqa %xmm1, %xmm3
3748; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm3
3749; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
3750; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm1
3751; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3752; SSSE3-NEXT:    pand %xmm5, %xmm1
3753; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3754; SSSE3-NEXT:    por %xmm1, %xmm3
3755; SSSE3-NEXT:    pand %xmm2, %xmm3
3756; SSSE3-NEXT:    packuswb %xmm4, %xmm3
3757; SSSE3-NEXT:    packuswb %xmm3, %xmm0
3758; SSSE3-NEXT:    packuswb %xmm0, %xmm0
3759; SSSE3-NEXT:    retq
3760;
3761; SSE41-LABEL: trunc_packus_v8i64_v8i8:
3762; SSE41:       # %bb.0:
3763; SSE41-NEXT:    movdqa (%rdi), %xmm10
3764; SSE41-NEXT:    movdqa 16(%rdi), %xmm9
3765; SSE41-NEXT:    movdqa 32(%rdi), %xmm3
3766; SSE41-NEXT:    movdqa 48(%rdi), %xmm5
3767; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [255,255]
3768; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
3769; SSE41-NEXT:    movdqa %xmm3, %xmm0
3770; SSE41-NEXT:    pxor %xmm2, %xmm0
3771; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
3772; SSE41-NEXT:    movdqa %xmm4, %xmm7
3773; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
3774; SSE41-NEXT:    movdqa %xmm4, %xmm6
3775; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
3776; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
3777; SSE41-NEXT:    pand %xmm7, %xmm0
3778; SSE41-NEXT:    por %xmm6, %xmm0
3779; SSE41-NEXT:    movapd %xmm1, %xmm8
3780; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
3781; SSE41-NEXT:    movdqa %xmm5, %xmm0
3782; SSE41-NEXT:    pxor %xmm2, %xmm0
3783; SSE41-NEXT:    movdqa %xmm4, %xmm3
3784; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
3785; SSE41-NEXT:    movdqa %xmm4, %xmm6
3786; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
3787; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
3788; SSE41-NEXT:    pand %xmm3, %xmm0
3789; SSE41-NEXT:    por %xmm6, %xmm0
3790; SSE41-NEXT:    movapd %xmm1, %xmm6
3791; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm6
3792; SSE41-NEXT:    movdqa %xmm10, %xmm0
3793; SSE41-NEXT:    pxor %xmm2, %xmm0
3794; SSE41-NEXT:    movdqa %xmm4, %xmm3
3795; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
3796; SSE41-NEXT:    movdqa %xmm4, %xmm5
3797; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
3798; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
3799; SSE41-NEXT:    pand %xmm3, %xmm0
3800; SSE41-NEXT:    por %xmm5, %xmm0
3801; SSE41-NEXT:    movapd %xmm1, %xmm3
3802; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
3803; SSE41-NEXT:    movdqa %xmm9, %xmm0
3804; SSE41-NEXT:    pxor %xmm2, %xmm0
3805; SSE41-NEXT:    movdqa %xmm4, %xmm5
3806; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
3807; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
3808; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
3809; SSE41-NEXT:    pand %xmm5, %xmm0
3810; SSE41-NEXT:    por %xmm4, %xmm0
3811; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm1
3812; SSE41-NEXT:    pxor %xmm5, %xmm5
3813; SSE41-NEXT:    movapd %xmm1, %xmm4
3814; SSE41-NEXT:    xorpd %xmm2, %xmm4
3815; SSE41-NEXT:    movapd %xmm4, %xmm7
3816; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
3817; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
3818; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
3819; SSE41-NEXT:    pand %xmm7, %xmm0
3820; SSE41-NEXT:    por %xmm4, %xmm0
3821; SSE41-NEXT:    pxor %xmm4, %xmm4
3822; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
3823; SSE41-NEXT:    movapd %xmm3, %xmm1
3824; SSE41-NEXT:    xorpd %xmm2, %xmm1
3825; SSE41-NEXT:    movapd %xmm1, %xmm7
3826; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
3827; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
3828; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
3829; SSE41-NEXT:    pand %xmm7, %xmm0
3830; SSE41-NEXT:    por %xmm1, %xmm0
3831; SSE41-NEXT:    pxor %xmm1, %xmm1
3832; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
3833; SSE41-NEXT:    packusdw %xmm4, %xmm1
3834; SSE41-NEXT:    movapd %xmm6, %xmm3
3835; SSE41-NEXT:    xorpd %xmm2, %xmm3
3836; SSE41-NEXT:    movapd %xmm3, %xmm4
3837; SSE41-NEXT:    pcmpeqd %xmm2, %xmm4
3838; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
3839; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
3840; SSE41-NEXT:    pand %xmm4, %xmm0
3841; SSE41-NEXT:    por %xmm3, %xmm0
3842; SSE41-NEXT:    pxor %xmm3, %xmm3
3843; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
3844; SSE41-NEXT:    movapd %xmm8, %xmm4
3845; SSE41-NEXT:    xorpd %xmm2, %xmm4
3846; SSE41-NEXT:    movapd %xmm4, %xmm6
3847; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
3848; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
3849; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
3850; SSE41-NEXT:    pand %xmm6, %xmm0
3851; SSE41-NEXT:    por %xmm4, %xmm0
3852; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm5
3853; SSE41-NEXT:    packusdw %xmm3, %xmm5
3854; SSE41-NEXT:    packusdw %xmm5, %xmm1
3855; SSE41-NEXT:    packuswb %xmm1, %xmm1
3856; SSE41-NEXT:    movdqa %xmm1, %xmm0
3857; SSE41-NEXT:    retq
3858;
3859; AVX1-LABEL: trunc_packus_v8i64_v8i8:
3860; AVX1:       # %bb.0:
3861; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
3862; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
3863; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
3864; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
3865; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
3866; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
3867; AVX1-NEXT:    vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
3868; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm5
3869; AVX1-NEXT:    vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
3870; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm5
3871; AVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
3872; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
3873; AVX1-NEXT:    vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
3874; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
3875; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm5
3876; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm1
3877; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
3878; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm0
3879; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3880; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm1
3881; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
3882; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm3
3883; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
3884; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
3885; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3886; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
3887; AVX1-NEXT:    retq
3888;
3889; AVX2-LABEL: trunc_packus_v8i64_v8i8:
3890; AVX2:       # %bb.0:
3891; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
3892; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
3893; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
3894; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
3895; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
3896; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
3897; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
3898; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3899; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
3900; AVX2-NEXT:    vpand %ymm1, %ymm3, %ymm1
3901; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
3902; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
3903; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
3904; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3905; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3906; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3907; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
3908; AVX2-NEXT:    vzeroupper
3909; AVX2-NEXT:    retq
3910;
3911; AVX512-LABEL: trunc_packus_v8i64_v8i8:
3912; AVX512:       # %bb.0:
3913; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3914; AVX512-NEXT:    vpmaxsq (%rdi), %zmm0, %zmm0
3915; AVX512-NEXT:    vpmovusqb %zmm0, %xmm0
3916; AVX512-NEXT:    vzeroupper
3917; AVX512-NEXT:    retq
3918;
3919; SKX-LABEL: trunc_packus_v8i64_v8i8:
3920; SKX:       # %bb.0:
3921; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3922; SKX-NEXT:    vpmaxsq 32(%rdi), %ymm0, %ymm1
3923; SKX-NEXT:    vpmovusqb %ymm1, %xmm1
3924; SKX-NEXT:    vpmaxsq (%rdi), %ymm0, %ymm0
3925; SKX-NEXT:    vpmovusqb %ymm0, %xmm0
3926; SKX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3927; SKX-NEXT:    vzeroupper
3928; SKX-NEXT:    retq
3929  %a0 = load <8 x i64>, ptr %p0
3930  %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
3931  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
3932  %3 = icmp sgt <8 x i64> %2, zeroinitializer
3933  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
3934  %5 = trunc <8 x i64> %4 to <8 x i8>
3935  ret <8 x i8> %5
3936}
3937
3938define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-width"="256" {
3939; SSE2-LABEL: trunc_packus_v8i64_v8i8_store:
3940; SSE2:       # %bb.0:
3941; SSE2-NEXT:    movdqa (%rdi), %xmm5
3942; SSE2-NEXT:    movdqa 16(%rdi), %xmm10
3943; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
3944; SSE2-NEXT:    movdqa 48(%rdi), %xmm4
3945; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
3946; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
3947; SSE2-NEXT:    movdqa %xmm2, %xmm1
3948; SSE2-NEXT:    pxor %xmm11, %xmm1
3949; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
3950; SSE2-NEXT:    pxor %xmm9, %xmm9
3951; SSE2-NEXT:    pcmpeqd %xmm9, %xmm7
3952; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
3953; SSE2-NEXT:    movdqa %xmm3, %xmm6
3954; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
3955; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
3956; SSE2-NEXT:    pand %xmm7, %xmm0
3957; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
3958; SSE2-NEXT:    por %xmm0, %xmm1
3959; SSE2-NEXT:    pand %xmm1, %xmm2
3960; SSE2-NEXT:    pandn %xmm8, %xmm1
3961; SSE2-NEXT:    por %xmm2, %xmm1
3962; SSE2-NEXT:    movdqa %xmm4, %xmm0
3963; SSE2-NEXT:    pxor %xmm11, %xmm0
3964; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
3965; SSE2-NEXT:    pcmpeqd %xmm9, %xmm2
3966; SSE2-NEXT:    movdqa %xmm3, %xmm6
3967; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
3968; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
3969; SSE2-NEXT:    pand %xmm2, %xmm0
3970; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
3971; SSE2-NEXT:    por %xmm0, %xmm2
3972; SSE2-NEXT:    pand %xmm2, %xmm4
3973; SSE2-NEXT:    pandn %xmm8, %xmm2
3974; SSE2-NEXT:    por %xmm4, %xmm2
3975; SSE2-NEXT:    movdqa %xmm5, %xmm0
3976; SSE2-NEXT:    pxor %xmm11, %xmm0
3977; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
3978; SSE2-NEXT:    pcmpeqd %xmm9, %xmm4
3979; SSE2-NEXT:    movdqa %xmm3, %xmm6
3980; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
3981; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
3982; SSE2-NEXT:    pand %xmm4, %xmm0
3983; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
3984; SSE2-NEXT:    por %xmm0, %xmm4
3985; SSE2-NEXT:    pand %xmm4, %xmm5
3986; SSE2-NEXT:    pandn %xmm8, %xmm4
3987; SSE2-NEXT:    por %xmm5, %xmm4
3988; SSE2-NEXT:    movdqa %xmm10, %xmm0
3989; SSE2-NEXT:    pxor %xmm11, %xmm0
3990; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
3991; SSE2-NEXT:    pcmpeqd %xmm9, %xmm5
3992; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3993; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
3994; SSE2-NEXT:    pand %xmm5, %xmm0
3995; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3996; SSE2-NEXT:    por %xmm0, %xmm3
3997; SSE2-NEXT:    pand %xmm3, %xmm10
3998; SSE2-NEXT:    pandn %xmm8, %xmm3
3999; SSE2-NEXT:    por %xmm10, %xmm3
4000; SSE2-NEXT:    movdqa %xmm3, %xmm0
4001; SSE2-NEXT:    pxor %xmm11, %xmm0
4002; SSE2-NEXT:    movdqa %xmm0, %xmm5
4003; SSE2-NEXT:    pcmpgtd %xmm11, %xmm5
4004; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
4005; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
4006; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4007; SSE2-NEXT:    pand %xmm6, %xmm0
4008; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
4009; SSE2-NEXT:    por %xmm0, %xmm5
4010; SSE2-NEXT:    pand %xmm3, %xmm5
4011; SSE2-NEXT:    movdqa %xmm4, %xmm0
4012; SSE2-NEXT:    pxor %xmm11, %xmm0
4013; SSE2-NEXT:    movdqa %xmm0, %xmm3
4014; SSE2-NEXT:    pcmpgtd %xmm11, %xmm3
4015; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
4016; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
4017; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4018; SSE2-NEXT:    pand %xmm6, %xmm0
4019; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4020; SSE2-NEXT:    por %xmm0, %xmm3
4021; SSE2-NEXT:    pand %xmm4, %xmm3
4022; SSE2-NEXT:    packuswb %xmm5, %xmm3
4023; SSE2-NEXT:    movdqa %xmm2, %xmm0
4024; SSE2-NEXT:    pxor %xmm11, %xmm0
4025; SSE2-NEXT:    movdqa %xmm0, %xmm4
4026; SSE2-NEXT:    pcmpgtd %xmm11, %xmm4
4027; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
4028; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
4029; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4030; SSE2-NEXT:    pand %xmm5, %xmm0
4031; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
4032; SSE2-NEXT:    por %xmm0, %xmm4
4033; SSE2-NEXT:    pand %xmm2, %xmm4
4034; SSE2-NEXT:    movdqa %xmm1, %xmm0
4035; SSE2-NEXT:    pxor %xmm11, %xmm0
4036; SSE2-NEXT:    movdqa %xmm0, %xmm2
4037; SSE2-NEXT:    pcmpgtd %xmm11, %xmm2
4038; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
4039; SSE2-NEXT:    pcmpeqd %xmm11, %xmm0
4040; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4041; SSE2-NEXT:    pand %xmm5, %xmm0
4042; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4043; SSE2-NEXT:    por %xmm0, %xmm2
4044; SSE2-NEXT:    pand %xmm1, %xmm2
4045; SSE2-NEXT:    packuswb %xmm4, %xmm2
4046; SSE2-NEXT:    packuswb %xmm2, %xmm3
4047; SSE2-NEXT:    packuswb %xmm3, %xmm3
4048; SSE2-NEXT:    movq %xmm3, (%rsi)
4049; SSE2-NEXT:    retq
4050;
4051; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store:
4052; SSSE3:       # %bb.0:
4053; SSSE3-NEXT:    movdqa (%rdi), %xmm5
4054; SSSE3-NEXT:    movdqa 16(%rdi), %xmm10
4055; SSSE3-NEXT:    movdqa 32(%rdi), %xmm2
4056; SSSE3-NEXT:    movdqa 48(%rdi), %xmm4
4057; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
4058; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
4059; SSSE3-NEXT:    movdqa %xmm2, %xmm1
4060; SSSE3-NEXT:    pxor %xmm11, %xmm1
4061; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
4062; SSSE3-NEXT:    pxor %xmm9, %xmm9
4063; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm7
4064; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
4065; SSSE3-NEXT:    movdqa %xmm3, %xmm6
4066; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
4067; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
4068; SSSE3-NEXT:    pand %xmm7, %xmm0
4069; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
4070; SSSE3-NEXT:    por %xmm0, %xmm1
4071; SSSE3-NEXT:    pand %xmm1, %xmm2
4072; SSSE3-NEXT:    pandn %xmm8, %xmm1
4073; SSSE3-NEXT:    por %xmm2, %xmm1
4074; SSSE3-NEXT:    movdqa %xmm4, %xmm0
4075; SSSE3-NEXT:    pxor %xmm11, %xmm0
4076; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4077; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm2
4078; SSSE3-NEXT:    movdqa %xmm3, %xmm6
4079; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
4080; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
4081; SSSE3-NEXT:    pand %xmm2, %xmm0
4082; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
4083; SSSE3-NEXT:    por %xmm0, %xmm2
4084; SSSE3-NEXT:    pand %xmm2, %xmm4
4085; SSSE3-NEXT:    pandn %xmm8, %xmm2
4086; SSSE3-NEXT:    por %xmm4, %xmm2
4087; SSSE3-NEXT:    movdqa %xmm5, %xmm0
4088; SSSE3-NEXT:    pxor %xmm11, %xmm0
4089; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
4090; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm4
4091; SSSE3-NEXT:    movdqa %xmm3, %xmm6
4092; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
4093; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
4094; SSSE3-NEXT:    pand %xmm4, %xmm0
4095; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
4096; SSSE3-NEXT:    por %xmm0, %xmm4
4097; SSSE3-NEXT:    pand %xmm4, %xmm5
4098; SSSE3-NEXT:    pandn %xmm8, %xmm4
4099; SSSE3-NEXT:    por %xmm5, %xmm4
4100; SSSE3-NEXT:    movdqa %xmm10, %xmm0
4101; SSSE3-NEXT:    pxor %xmm11, %xmm0
4102; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
4103; SSSE3-NEXT:    pcmpeqd %xmm9, %xmm5
4104; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
4105; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4106; SSSE3-NEXT:    pand %xmm5, %xmm0
4107; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4108; SSSE3-NEXT:    por %xmm0, %xmm3
4109; SSSE3-NEXT:    pand %xmm3, %xmm10
4110; SSSE3-NEXT:    pandn %xmm8, %xmm3
4111; SSSE3-NEXT:    por %xmm10, %xmm3
4112; SSSE3-NEXT:    movdqa %xmm3, %xmm0
4113; SSSE3-NEXT:    pxor %xmm11, %xmm0
4114; SSSE3-NEXT:    movdqa %xmm0, %xmm5
4115; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm5
4116; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
4117; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
4118; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4119; SSSE3-NEXT:    pand %xmm6, %xmm0
4120; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
4121; SSSE3-NEXT:    por %xmm0, %xmm5
4122; SSSE3-NEXT:    pand %xmm3, %xmm5
4123; SSSE3-NEXT:    movdqa %xmm4, %xmm0
4124; SSSE3-NEXT:    pxor %xmm11, %xmm0
4125; SSSE3-NEXT:    movdqa %xmm0, %xmm3
4126; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm3
4127; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
4128; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
4129; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4130; SSSE3-NEXT:    pand %xmm6, %xmm0
4131; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4132; SSSE3-NEXT:    por %xmm0, %xmm3
4133; SSSE3-NEXT:    pand %xmm4, %xmm3
4134; SSSE3-NEXT:    packuswb %xmm5, %xmm3
4135; SSSE3-NEXT:    movdqa %xmm2, %xmm0
4136; SSSE3-NEXT:    pxor %xmm11, %xmm0
4137; SSSE3-NEXT:    movdqa %xmm0, %xmm4
4138; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm4
4139; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
4140; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
4141; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4142; SSSE3-NEXT:    pand %xmm5, %xmm0
4143; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
4144; SSSE3-NEXT:    por %xmm0, %xmm4
4145; SSSE3-NEXT:    pand %xmm2, %xmm4
4146; SSSE3-NEXT:    movdqa %xmm1, %xmm0
4147; SSSE3-NEXT:    pxor %xmm11, %xmm0
4148; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4149; SSSE3-NEXT:    pcmpgtd %xmm11, %xmm2
4150; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
4151; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm0
4152; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4153; SSSE3-NEXT:    pand %xmm5, %xmm0
4154; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4155; SSSE3-NEXT:    por %xmm0, %xmm2
4156; SSSE3-NEXT:    pand %xmm1, %xmm2
4157; SSSE3-NEXT:    packuswb %xmm4, %xmm2
4158; SSSE3-NEXT:    packuswb %xmm2, %xmm3
4159; SSSE3-NEXT:    packuswb %xmm3, %xmm3
4160; SSSE3-NEXT:    movq %xmm3, (%rsi)
4161; SSSE3-NEXT:    retq
4162;
4163; SSE41-LABEL: trunc_packus_v8i64_v8i8_store:
4164; SSE41:       # %bb.0:
4165; SSE41-NEXT:    movdqa (%rdi), %xmm10
4166; SSE41-NEXT:    movdqa 16(%rdi), %xmm9
4167; SSE41-NEXT:    movdqa 32(%rdi), %xmm2
4168; SSE41-NEXT:    movdqa 48(%rdi), %xmm5
4169; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [255,255]
4170; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
4171; SSE41-NEXT:    movdqa %xmm2, %xmm0
4172; SSE41-NEXT:    pxor %xmm1, %xmm0
4173; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
4174; SSE41-NEXT:    movdqa %xmm3, %xmm7
4175; SSE41-NEXT:    pcmpeqd %xmm0, %xmm7
4176; SSE41-NEXT:    movdqa %xmm3, %xmm6
4177; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
4178; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
4179; SSE41-NEXT:    pand %xmm7, %xmm0
4180; SSE41-NEXT:    por %xmm6, %xmm0
4181; SSE41-NEXT:    movapd %xmm4, %xmm8
4182; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm8
4183; SSE41-NEXT:    movdqa %xmm5, %xmm0
4184; SSE41-NEXT:    pxor %xmm1, %xmm0
4185; SSE41-NEXT:    movdqa %xmm3, %xmm2
4186; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
4187; SSE41-NEXT:    movdqa %xmm3, %xmm6
4188; SSE41-NEXT:    pcmpgtd %xmm0, %xmm6
4189; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
4190; SSE41-NEXT:    pand %xmm2, %xmm0
4191; SSE41-NEXT:    por %xmm6, %xmm0
4192; SSE41-NEXT:    movapd %xmm4, %xmm6
4193; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm6
4194; SSE41-NEXT:    movdqa %xmm10, %xmm0
4195; SSE41-NEXT:    pxor %xmm1, %xmm0
4196; SSE41-NEXT:    movdqa %xmm3, %xmm2
4197; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
4198; SSE41-NEXT:    movdqa %xmm3, %xmm5
4199; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
4200; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
4201; SSE41-NEXT:    pand %xmm2, %xmm0
4202; SSE41-NEXT:    por %xmm5, %xmm0
4203; SSE41-NEXT:    movapd %xmm4, %xmm2
4204; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
4205; SSE41-NEXT:    movdqa %xmm9, %xmm0
4206; SSE41-NEXT:    pxor %xmm1, %xmm0
4207; SSE41-NEXT:    movdqa %xmm3, %xmm5
4208; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
4209; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
4210; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4211; SSE41-NEXT:    pand %xmm5, %xmm0
4212; SSE41-NEXT:    por %xmm3, %xmm0
4213; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
4214; SSE41-NEXT:    pxor %xmm5, %xmm5
4215; SSE41-NEXT:    movapd %xmm4, %xmm3
4216; SSE41-NEXT:    xorpd %xmm1, %xmm3
4217; SSE41-NEXT:    movapd %xmm3, %xmm7
4218; SSE41-NEXT:    pcmpeqd %xmm1, %xmm7
4219; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
4220; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4221; SSE41-NEXT:    pand %xmm7, %xmm0
4222; SSE41-NEXT:    por %xmm3, %xmm0
4223; SSE41-NEXT:    pxor %xmm3, %xmm3
4224; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
4225; SSE41-NEXT:    movapd %xmm2, %xmm4
4226; SSE41-NEXT:    xorpd %xmm1, %xmm4
4227; SSE41-NEXT:    movapd %xmm4, %xmm7
4228; SSE41-NEXT:    pcmpeqd %xmm1, %xmm7
4229; SSE41-NEXT:    pcmpgtd %xmm1, %xmm4
4230; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4231; SSE41-NEXT:    pand %xmm7, %xmm0
4232; SSE41-NEXT:    por %xmm4, %xmm0
4233; SSE41-NEXT:    pxor %xmm4, %xmm4
4234; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
4235; SSE41-NEXT:    packusdw %xmm3, %xmm4
4236; SSE41-NEXT:    movapd %xmm6, %xmm2
4237; SSE41-NEXT:    xorpd %xmm1, %xmm2
4238; SSE41-NEXT:    movapd %xmm2, %xmm3
4239; SSE41-NEXT:    pcmpeqd %xmm1, %xmm3
4240; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
4241; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
4242; SSE41-NEXT:    pand %xmm3, %xmm0
4243; SSE41-NEXT:    por %xmm2, %xmm0
4244; SSE41-NEXT:    pxor %xmm2, %xmm2
4245; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
4246; SSE41-NEXT:    movapd %xmm8, %xmm3
4247; SSE41-NEXT:    xorpd %xmm1, %xmm3
4248; SSE41-NEXT:    movapd %xmm3, %xmm6
4249; SSE41-NEXT:    pcmpeqd %xmm1, %xmm6
4250; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
4251; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4252; SSE41-NEXT:    pand %xmm6, %xmm0
4253; SSE41-NEXT:    por %xmm3, %xmm0
4254; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm5
4255; SSE41-NEXT:    packusdw %xmm2, %xmm5
4256; SSE41-NEXT:    packusdw %xmm5, %xmm4
4257; SSE41-NEXT:    packuswb %xmm4, %xmm4
4258; SSE41-NEXT:    movq %xmm4, (%rsi)
4259; SSE41-NEXT:    retq
4260;
4261; AVX1-LABEL: trunc_packus_v8i64_v8i8_store:
4262; AVX1:       # %bb.0:
4263; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
4264; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
4265; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
4266; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
4267; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
4268; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
4269; AVX1-NEXT:    vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
4270; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm5
4271; AVX1-NEXT:    vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
4272; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm5
4273; AVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
4274; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm5
4275; AVX1-NEXT:    vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
4276; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
4277; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm5
4278; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm1
4279; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm5
4280; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm0
4281; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4282; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm1
4283; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
4284; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm3
4285; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
4286; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
4287; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4288; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
4289; AVX1-NEXT:    vmovq %xmm0, (%rsi)
4290; AVX1-NEXT:    retq
4291;
4292; AVX2-LABEL: trunc_packus_v8i64_v8i8_store:
4293; AVX2:       # %bb.0:
4294; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
4295; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
4296; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
4297; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
4298; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
4299; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm3
4300; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
4301; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
4302; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm3
4303; AVX2-NEXT:    vpand %ymm1, %ymm3, %ymm1
4304; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
4305; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
4306; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
4307; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4308; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4309; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4310; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
4311; AVX2-NEXT:    vmovq %xmm0, (%rsi)
4312; AVX2-NEXT:    vzeroupper
4313; AVX2-NEXT:    retq
4314;
4315; AVX512-LABEL: trunc_packus_v8i64_v8i8_store:
4316; AVX512:       # %bb.0:
4317; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
4318; AVX512-NEXT:    vpmaxsq (%rdi), %zmm0, %zmm0
4319; AVX512-NEXT:    vpmovusqb %zmm0, (%rsi)
4320; AVX512-NEXT:    vzeroupper
4321; AVX512-NEXT:    retq
4322;
4323; SKX-LABEL: trunc_packus_v8i64_v8i8_store:
4324; SKX:       # %bb.0:
4325; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
4326; SKX-NEXT:    vpmaxsq 32(%rdi), %ymm0, %ymm1
4327; SKX-NEXT:    vpmovusqb %ymm1, %xmm1
4328; SKX-NEXT:    vpmaxsq (%rdi), %ymm0, %ymm0
4329; SKX-NEXT:    vpmovusqb %ymm0, %xmm0
4330; SKX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4331; SKX-NEXT:    vmovq %xmm0, (%rsi)
4332; SKX-NEXT:    vzeroupper
4333; SKX-NEXT:    retq
4334  %a0 = load <8 x i64>, ptr %p0
4335  %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
4336  %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
4337  %3 = icmp sgt <8 x i64> %2, zeroinitializer
4338  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
4339  %5 = trunc <8 x i64> %4 to <8 x i8>
4340  store <8 x i8> %5, ptr%p1
4341  ret void
4342}
4343
4344define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" {
4345; SSE2-LABEL: trunc_packus_v16i64_v16i8:
4346; SSE2:       # %bb.0:
4347; SSE2-NEXT:    movdqa (%rdi), %xmm11
4348; SSE2-NEXT:    movdqa 16(%rdi), %xmm9
4349; SSE2-NEXT:    movdqa 32(%rdi), %xmm15
4350; SSE2-NEXT:    movdqa 48(%rdi), %xmm12
4351; SSE2-NEXT:    movdqa 80(%rdi), %xmm2
4352; SSE2-NEXT:    movdqa 64(%rdi), %xmm5
4353; SSE2-NEXT:    movdqa 112(%rdi), %xmm3
4354; SSE2-NEXT:    movdqa 96(%rdi), %xmm14
4355; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
4356; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
4357; SSE2-NEXT:    movdqa %xmm14, %xmm7
4358; SSE2-NEXT:    pxor %xmm1, %xmm7
4359; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
4360; SSE2-NEXT:    pxor %xmm10, %xmm10
4361; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
4362; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
4363; SSE2-NEXT:    movdqa %xmm6, %xmm4
4364; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
4365; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
4366; SSE2-NEXT:    pand %xmm0, %xmm7
4367; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
4368; SSE2-NEXT:    por %xmm7, %xmm13
4369; SSE2-NEXT:    pand %xmm13, %xmm14
4370; SSE2-NEXT:    pandn %xmm8, %xmm13
4371; SSE2-NEXT:    por %xmm14, %xmm13
4372; SSE2-NEXT:    movdqa %xmm3, %xmm0
4373; SSE2-NEXT:    pxor %xmm1, %xmm0
4374; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
4375; SSE2-NEXT:    pcmpeqd %xmm10, %xmm4
4376; SSE2-NEXT:    movdqa %xmm6, %xmm7
4377; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
4378; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
4379; SSE2-NEXT:    pand %xmm4, %xmm0
4380; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3]
4381; SSE2-NEXT:    por %xmm0, %xmm14
4382; SSE2-NEXT:    pand %xmm14, %xmm3
4383; SSE2-NEXT:    pandn %xmm8, %xmm14
4384; SSE2-NEXT:    por %xmm3, %xmm14
4385; SSE2-NEXT:    movdqa %xmm5, %xmm0
4386; SSE2-NEXT:    pxor %xmm1, %xmm0
4387; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4388; SSE2-NEXT:    pcmpeqd %xmm10, %xmm3
4389; SSE2-NEXT:    movdqa %xmm6, %xmm4
4390; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
4391; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4392; SSE2-NEXT:    pand %xmm3, %xmm0
4393; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
4394; SSE2-NEXT:    por %xmm0, %xmm4
4395; SSE2-NEXT:    pand %xmm4, %xmm5
4396; SSE2-NEXT:    pandn %xmm8, %xmm4
4397; SSE2-NEXT:    por %xmm5, %xmm4
4398; SSE2-NEXT:    movdqa %xmm2, %xmm0
4399; SSE2-NEXT:    pxor %xmm1, %xmm0
4400; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4401; SSE2-NEXT:    pcmpeqd %xmm10, %xmm3
4402; SSE2-NEXT:    movdqa %xmm6, %xmm5
4403; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
4404; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
4405; SSE2-NEXT:    pand %xmm3, %xmm0
4406; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
4407; SSE2-NEXT:    por %xmm0, %xmm5
4408; SSE2-NEXT:    pand %xmm5, %xmm2
4409; SSE2-NEXT:    pandn %xmm8, %xmm5
4410; SSE2-NEXT:    por %xmm2, %xmm5
4411; SSE2-NEXT:    movdqa %xmm15, %xmm0
4412; SSE2-NEXT:    pxor %xmm1, %xmm0
4413; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4414; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
4415; SSE2-NEXT:    movdqa %xmm6, %xmm3
4416; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
4417; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4418; SSE2-NEXT:    pand %xmm2, %xmm0
4419; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
4420; SSE2-NEXT:    por %xmm0, %xmm7
4421; SSE2-NEXT:    pand %xmm7, %xmm15
4422; SSE2-NEXT:    pandn %xmm8, %xmm7
4423; SSE2-NEXT:    por %xmm15, %xmm7
4424; SSE2-NEXT:    movdqa %xmm12, %xmm0
4425; SSE2-NEXT:    pxor %xmm1, %xmm0
4426; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4427; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
4428; SSE2-NEXT:    movdqa %xmm6, %xmm3
4429; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
4430; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4431; SSE2-NEXT:    pand %xmm2, %xmm0
4432; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3]
4433; SSE2-NEXT:    por %xmm0, %xmm15
4434; SSE2-NEXT:    pand %xmm15, %xmm12
4435; SSE2-NEXT:    pandn %xmm8, %xmm15
4436; SSE2-NEXT:    por %xmm12, %xmm15
4437; SSE2-NEXT:    movdqa %xmm11, %xmm0
4438; SSE2-NEXT:    pxor %xmm1, %xmm0
4439; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4440; SSE2-NEXT:    pcmpeqd %xmm10, %xmm3
4441; SSE2-NEXT:    movdqa %xmm6, %xmm2
4442; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
4443; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
4444; SSE2-NEXT:    pand %xmm3, %xmm0
4445; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
4446; SSE2-NEXT:    por %xmm0, %xmm12
4447; SSE2-NEXT:    pand %xmm12, %xmm11
4448; SSE2-NEXT:    pandn %xmm8, %xmm12
4449; SSE2-NEXT:    por %xmm11, %xmm12
4450; SSE2-NEXT:    movdqa %xmm9, %xmm0
4451; SSE2-NEXT:    pxor %xmm1, %xmm0
4452; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4453; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
4454; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
4455; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
4456; SSE2-NEXT:    pand %xmm2, %xmm0
4457; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
4458; SSE2-NEXT:    por %xmm0, %xmm2
4459; SSE2-NEXT:    pand %xmm2, %xmm9
4460; SSE2-NEXT:    pandn %xmm8, %xmm2
4461; SSE2-NEXT:    por %xmm9, %xmm2
4462; SSE2-NEXT:    movdqa %xmm2, %xmm0
4463; SSE2-NEXT:    pxor %xmm1, %xmm0
4464; SSE2-NEXT:    movdqa %xmm0, %xmm6
4465; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
4466; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
4467; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
4468; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4469; SSE2-NEXT:    pand %xmm8, %xmm0
4470; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
4471; SSE2-NEXT:    por %xmm0, %xmm6
4472; SSE2-NEXT:    pand %xmm2, %xmm6
4473; SSE2-NEXT:    movdqa %xmm12, %xmm0
4474; SSE2-NEXT:    pxor %xmm1, %xmm0
4475; SSE2-NEXT:    movdqa %xmm0, %xmm2
4476; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
4477; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
4478; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
4479; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4480; SSE2-NEXT:    pand %xmm8, %xmm3
4481; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
4482; SSE2-NEXT:    por %xmm3, %xmm0
4483; SSE2-NEXT:    pand %xmm12, %xmm0
4484; SSE2-NEXT:    packuswb %xmm6, %xmm0
4485; SSE2-NEXT:    movdqa %xmm15, %xmm2
4486; SSE2-NEXT:    pxor %xmm1, %xmm2
4487; SSE2-NEXT:    movdqa %xmm2, %xmm3
4488; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
4489; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
4490; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
4491; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4492; SSE2-NEXT:    pand %xmm6, %xmm2
4493; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4494; SSE2-NEXT:    por %xmm2, %xmm3
4495; SSE2-NEXT:    pand %xmm15, %xmm3
4496; SSE2-NEXT:    movdqa %xmm7, %xmm2
4497; SSE2-NEXT:    pxor %xmm1, %xmm2
4498; SSE2-NEXT:    movdqa %xmm2, %xmm6
4499; SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
4500; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
4501; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
4502; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4503; SSE2-NEXT:    pand %xmm8, %xmm2
4504; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
4505; SSE2-NEXT:    por %xmm2, %xmm6
4506; SSE2-NEXT:    pand %xmm7, %xmm6
4507; SSE2-NEXT:    packuswb %xmm3, %xmm6
4508; SSE2-NEXT:    packuswb %xmm6, %xmm0
4509; SSE2-NEXT:    movdqa %xmm5, %xmm2
4510; SSE2-NEXT:    pxor %xmm1, %xmm2
4511; SSE2-NEXT:    movdqa %xmm2, %xmm3
4512; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
4513; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
4514; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
4515; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4516; SSE2-NEXT:    pand %xmm6, %xmm2
4517; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4518; SSE2-NEXT:    por %xmm2, %xmm3
4519; SSE2-NEXT:    pand %xmm5, %xmm3
4520; SSE2-NEXT:    movdqa %xmm4, %xmm2
4521; SSE2-NEXT:    pxor %xmm1, %xmm2
4522; SSE2-NEXT:    movdqa %xmm2, %xmm5
4523; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
4524; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
4525; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
4526; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
4527; SSE2-NEXT:    pand %xmm6, %xmm7
4528; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
4529; SSE2-NEXT:    por %xmm7, %xmm2
4530; SSE2-NEXT:    pand %xmm4, %xmm2
4531; SSE2-NEXT:    packuswb %xmm3, %xmm2
4532; SSE2-NEXT:    movdqa %xmm14, %xmm3
4533; SSE2-NEXT:    pxor %xmm1, %xmm3
4534; SSE2-NEXT:    movdqa %xmm3, %xmm4
4535; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
4536; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
4537; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
4538; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4539; SSE2-NEXT:    pand %xmm5, %xmm3
4540; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
4541; SSE2-NEXT:    por %xmm3, %xmm4
4542; SSE2-NEXT:    pand %xmm14, %xmm4
4543; SSE2-NEXT:    movdqa %xmm13, %xmm3
4544; SSE2-NEXT:    pxor %xmm1, %xmm3
4545; SSE2-NEXT:    movdqa %xmm3, %xmm5
4546; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
4547; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
4548; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
4549; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
4550; SSE2-NEXT:    pand %xmm6, %xmm1
4551; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
4552; SSE2-NEXT:    por %xmm1, %xmm3
4553; SSE2-NEXT:    pand %xmm13, %xmm3
4554; SSE2-NEXT:    packuswb %xmm4, %xmm3
4555; SSE2-NEXT:    packuswb %xmm3, %xmm2
4556; SSE2-NEXT:    packuswb %xmm2, %xmm0
4557; SSE2-NEXT:    retq
4558;
4559; SSSE3-LABEL: trunc_packus_v16i64_v16i8:
4560; SSSE3:       # %bb.0:
4561; SSSE3-NEXT:    movdqa (%rdi), %xmm11
4562; SSSE3-NEXT:    movdqa 16(%rdi), %xmm9
4563; SSSE3-NEXT:    movdqa 32(%rdi), %xmm15
4564; SSSE3-NEXT:    movdqa 48(%rdi), %xmm12
4565; SSSE3-NEXT:    movdqa 80(%rdi), %xmm2
4566; SSSE3-NEXT:    movdqa 64(%rdi), %xmm5
4567; SSSE3-NEXT:    movdqa 112(%rdi), %xmm3
4568; SSSE3-NEXT:    movdqa 96(%rdi), %xmm14
4569; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255]
4570; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
4571; SSSE3-NEXT:    movdqa %xmm14, %xmm7
4572; SSSE3-NEXT:    pxor %xmm1, %xmm7
4573; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
4574; SSSE3-NEXT:    pxor %xmm10, %xmm10
4575; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
4576; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
4577; SSSE3-NEXT:    movdqa %xmm6, %xmm4
4578; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm4
4579; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
4580; SSSE3-NEXT:    pand %xmm0, %xmm7
4581; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3]
4582; SSSE3-NEXT:    por %xmm7, %xmm13
4583; SSSE3-NEXT:    pand %xmm13, %xmm14
4584; SSSE3-NEXT:    pandn %xmm8, %xmm13
4585; SSSE3-NEXT:    por %xmm14, %xmm13
4586; SSSE3-NEXT:    movdqa %xmm3, %xmm0
4587; SSSE3-NEXT:    pxor %xmm1, %xmm0
4588; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
4589; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm4
4590; SSSE3-NEXT:    movdqa %xmm6, %xmm7
4591; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm7
4592; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
4593; SSSE3-NEXT:    pand %xmm4, %xmm0
4594; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3]
4595; SSSE3-NEXT:    por %xmm0, %xmm14
4596; SSSE3-NEXT:    pand %xmm14, %xmm3
4597; SSSE3-NEXT:    pandn %xmm8, %xmm14
4598; SSSE3-NEXT:    por %xmm3, %xmm14
4599; SSSE3-NEXT:    movdqa %xmm5, %xmm0
4600; SSSE3-NEXT:    pxor %xmm1, %xmm0
4601; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4602; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm3
4603; SSSE3-NEXT:    movdqa %xmm6, %xmm4
4604; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm4
4605; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4606; SSSE3-NEXT:    pand %xmm3, %xmm0
4607; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
4608; SSSE3-NEXT:    por %xmm0, %xmm4
4609; SSSE3-NEXT:    pand %xmm4, %xmm5
4610; SSSE3-NEXT:    pandn %xmm8, %xmm4
4611; SSSE3-NEXT:    por %xmm5, %xmm4
4612; SSSE3-NEXT:    movdqa %xmm2, %xmm0
4613; SSSE3-NEXT:    pxor %xmm1, %xmm0
4614; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4615; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm3
4616; SSSE3-NEXT:    movdqa %xmm6, %xmm5
4617; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
4618; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
4619; SSSE3-NEXT:    pand %xmm3, %xmm0
4620; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
4621; SSSE3-NEXT:    por %xmm0, %xmm5
4622; SSSE3-NEXT:    pand %xmm5, %xmm2
4623; SSSE3-NEXT:    pandn %xmm8, %xmm5
4624; SSSE3-NEXT:    por %xmm2, %xmm5
4625; SSSE3-NEXT:    movdqa %xmm15, %xmm0
4626; SSSE3-NEXT:    pxor %xmm1, %xmm0
4627; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4628; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
4629; SSSE3-NEXT:    movdqa %xmm6, %xmm3
4630; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
4631; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4632; SSSE3-NEXT:    pand %xmm2, %xmm0
4633; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
4634; SSSE3-NEXT:    por %xmm0, %xmm7
4635; SSSE3-NEXT:    pand %xmm7, %xmm15
4636; SSSE3-NEXT:    pandn %xmm8, %xmm7
4637; SSSE3-NEXT:    por %xmm15, %xmm7
4638; SSSE3-NEXT:    movdqa %xmm12, %xmm0
4639; SSSE3-NEXT:    pxor %xmm1, %xmm0
4640; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4641; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
4642; SSSE3-NEXT:    movdqa %xmm6, %xmm3
4643; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
4644; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4645; SSSE3-NEXT:    pand %xmm2, %xmm0
4646; SSSE3-NEXT:    pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3]
4647; SSSE3-NEXT:    por %xmm0, %xmm15
4648; SSSE3-NEXT:    pand %xmm15, %xmm12
4649; SSSE3-NEXT:    pandn %xmm8, %xmm15
4650; SSSE3-NEXT:    por %xmm12, %xmm15
4651; SSSE3-NEXT:    movdqa %xmm11, %xmm0
4652; SSSE3-NEXT:    pxor %xmm1, %xmm0
4653; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4654; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm3
4655; SSSE3-NEXT:    movdqa %xmm6, %xmm2
4656; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
4657; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
4658; SSSE3-NEXT:    pand %xmm3, %xmm0
4659; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3]
4660; SSSE3-NEXT:    por %xmm0, %xmm12
4661; SSSE3-NEXT:    pand %xmm12, %xmm11
4662; SSSE3-NEXT:    pandn %xmm8, %xmm12
4663; SSSE3-NEXT:    por %xmm11, %xmm12
4664; SSSE3-NEXT:    movdqa %xmm9, %xmm0
4665; SSSE3-NEXT:    pxor %xmm1, %xmm0
4666; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4667; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
4668; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm6
4669; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
4670; SSSE3-NEXT:    pand %xmm2, %xmm0
4671; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
4672; SSSE3-NEXT:    por %xmm0, %xmm2
4673; SSSE3-NEXT:    pand %xmm2, %xmm9
4674; SSSE3-NEXT:    pandn %xmm8, %xmm2
4675; SSSE3-NEXT:    por %xmm9, %xmm2
4676; SSSE3-NEXT:    movdqa %xmm2, %xmm0
4677; SSSE3-NEXT:    pxor %xmm1, %xmm0
4678; SSSE3-NEXT:    movdqa %xmm0, %xmm6
4679; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
4680; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
4681; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
4682; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
4683; SSSE3-NEXT:    pand %xmm8, %xmm0
4684; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
4685; SSSE3-NEXT:    por %xmm0, %xmm6
4686; SSSE3-NEXT:    pand %xmm2, %xmm6
4687; SSSE3-NEXT:    movdqa %xmm12, %xmm0
4688; SSSE3-NEXT:    pxor %xmm1, %xmm0
4689; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4690; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
4691; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
4692; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
4693; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
4694; SSSE3-NEXT:    pand %xmm8, %xmm3
4695; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
4696; SSSE3-NEXT:    por %xmm3, %xmm0
4697; SSSE3-NEXT:    pand %xmm12, %xmm0
4698; SSSE3-NEXT:    packuswb %xmm6, %xmm0
4699; SSSE3-NEXT:    movdqa %xmm15, %xmm2
4700; SSSE3-NEXT:    pxor %xmm1, %xmm2
4701; SSSE3-NEXT:    movdqa %xmm2, %xmm3
4702; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
4703; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
4704; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
4705; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4706; SSSE3-NEXT:    pand %xmm6, %xmm2
4707; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4708; SSSE3-NEXT:    por %xmm2, %xmm3
4709; SSSE3-NEXT:    pand %xmm15, %xmm3
4710; SSSE3-NEXT:    movdqa %xmm7, %xmm2
4711; SSSE3-NEXT:    pxor %xmm1, %xmm2
4712; SSSE3-NEXT:    movdqa %xmm2, %xmm6
4713; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm6
4714; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
4715; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
4716; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4717; SSSE3-NEXT:    pand %xmm8, %xmm2
4718; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
4719; SSSE3-NEXT:    por %xmm2, %xmm6
4720; SSSE3-NEXT:    pand %xmm7, %xmm6
4721; SSSE3-NEXT:    packuswb %xmm3, %xmm6
4722; SSSE3-NEXT:    packuswb %xmm6, %xmm0
4723; SSSE3-NEXT:    movdqa %xmm5, %xmm2
4724; SSSE3-NEXT:    pxor %xmm1, %xmm2
4725; SSSE3-NEXT:    movdqa %xmm2, %xmm3
4726; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
4727; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
4728; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
4729; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
4730; SSSE3-NEXT:    pand %xmm6, %xmm2
4731; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4732; SSSE3-NEXT:    por %xmm2, %xmm3
4733; SSSE3-NEXT:    pand %xmm5, %xmm3
4734; SSSE3-NEXT:    movdqa %xmm4, %xmm2
4735; SSSE3-NEXT:    pxor %xmm1, %xmm2
4736; SSSE3-NEXT:    movdqa %xmm2, %xmm5
4737; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
4738; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
4739; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm2
4740; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
4741; SSSE3-NEXT:    pand %xmm6, %xmm7
4742; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
4743; SSSE3-NEXT:    por %xmm7, %xmm2
4744; SSSE3-NEXT:    pand %xmm4, %xmm2
4745; SSSE3-NEXT:    packuswb %xmm3, %xmm2
4746; SSSE3-NEXT:    movdqa %xmm14, %xmm3
4747; SSSE3-NEXT:    pxor %xmm1, %xmm3
4748; SSSE3-NEXT:    movdqa %xmm3, %xmm4
4749; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
4750; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
4751; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
4752; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
4753; SSSE3-NEXT:    pand %xmm5, %xmm3
4754; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
4755; SSSE3-NEXT:    por %xmm3, %xmm4
4756; SSSE3-NEXT:    pand %xmm14, %xmm4
4757; SSSE3-NEXT:    movdqa %xmm13, %xmm3
4758; SSSE3-NEXT:    pxor %xmm1, %xmm3
4759; SSSE3-NEXT:    movdqa %xmm3, %xmm5
4760; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
4761; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
4762; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
4763; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
4764; SSSE3-NEXT:    pand %xmm6, %xmm1
4765; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
4766; SSSE3-NEXT:    por %xmm1, %xmm3
4767; SSSE3-NEXT:    pand %xmm13, %xmm3
4768; SSSE3-NEXT:    packuswb %xmm4, %xmm3
4769; SSSE3-NEXT:    packuswb %xmm3, %xmm2
4770; SSSE3-NEXT:    packuswb %xmm2, %xmm0
4771; SSSE3-NEXT:    retq
4772;
4773; SSE41-LABEL: trunc_packus_v16i64_v16i8:
4774; SSE41:       # %bb.0:
4775; SSE41-NEXT:    movdqa (%rdi), %xmm10
4776; SSE41-NEXT:    movdqa 16(%rdi), %xmm9
4777; SSE41-NEXT:    movdqa 32(%rdi), %xmm14
4778; SSE41-NEXT:    movdqa 48(%rdi), %xmm12
4779; SSE41-NEXT:    movdqa 80(%rdi), %xmm15
4780; SSE41-NEXT:    movdqa 64(%rdi), %xmm6
4781; SSE41-NEXT:    movdqa 112(%rdi), %xmm13
4782; SSE41-NEXT:    movdqa 96(%rdi), %xmm4
4783; SSE41-NEXT:    movapd {{.*#+}} xmm1 = [255,255]
4784; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
4785; SSE41-NEXT:    movdqa %xmm4, %xmm0
4786; SSE41-NEXT:    pxor %xmm2, %xmm0
4787; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
4788; SSE41-NEXT:    movdqa %xmm7, %xmm3
4789; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
4790; SSE41-NEXT:    movdqa %xmm7, %xmm5
4791; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
4792; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
4793; SSE41-NEXT:    pand %xmm3, %xmm0
4794; SSE41-NEXT:    por %xmm5, %xmm0
4795; SSE41-NEXT:    movapd %xmm1, %xmm8
4796; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
4797; SSE41-NEXT:    movdqa %xmm13, %xmm0
4798; SSE41-NEXT:    pxor %xmm2, %xmm0
4799; SSE41-NEXT:    movdqa %xmm7, %xmm3
4800; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
4801; SSE41-NEXT:    movdqa %xmm7, %xmm4
4802; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
4803; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4804; SSE41-NEXT:    pand %xmm3, %xmm0
4805; SSE41-NEXT:    por %xmm4, %xmm0
4806; SSE41-NEXT:    movapd %xmm1, %xmm11
4807; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm11
4808; SSE41-NEXT:    movdqa %xmm6, %xmm0
4809; SSE41-NEXT:    pxor %xmm2, %xmm0
4810; SSE41-NEXT:    movdqa %xmm7, %xmm3
4811; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
4812; SSE41-NEXT:    movdqa %xmm7, %xmm4
4813; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
4814; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4815; SSE41-NEXT:    pand %xmm3, %xmm0
4816; SSE41-NEXT:    por %xmm4, %xmm0
4817; SSE41-NEXT:    movapd %xmm1, %xmm13
4818; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm13
4819; SSE41-NEXT:    movdqa %xmm15, %xmm0
4820; SSE41-NEXT:    pxor %xmm2, %xmm0
4821; SSE41-NEXT:    movdqa %xmm7, %xmm3
4822; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
4823; SSE41-NEXT:    movdqa %xmm7, %xmm4
4824; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
4825; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4826; SSE41-NEXT:    pand %xmm3, %xmm0
4827; SSE41-NEXT:    por %xmm4, %xmm0
4828; SSE41-NEXT:    movapd %xmm1, %xmm6
4829; SSE41-NEXT:    blendvpd %xmm0, %xmm15, %xmm6
4830; SSE41-NEXT:    movdqa %xmm14, %xmm0
4831; SSE41-NEXT:    pxor %xmm2, %xmm0
4832; SSE41-NEXT:    movdqa %xmm7, %xmm3
4833; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
4834; SSE41-NEXT:    movdqa %xmm7, %xmm4
4835; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
4836; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4837; SSE41-NEXT:    pand %xmm3, %xmm0
4838; SSE41-NEXT:    por %xmm4, %xmm0
4839; SSE41-NEXT:    movapd %xmm1, %xmm15
4840; SSE41-NEXT:    blendvpd %xmm0, %xmm14, %xmm15
4841; SSE41-NEXT:    movdqa %xmm12, %xmm0
4842; SSE41-NEXT:    pxor %xmm2, %xmm0
4843; SSE41-NEXT:    movdqa %xmm7, %xmm4
4844; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
4845; SSE41-NEXT:    movdqa %xmm7, %xmm5
4846; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
4847; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
4848; SSE41-NEXT:    pand %xmm4, %xmm0
4849; SSE41-NEXT:    por %xmm5, %xmm0
4850; SSE41-NEXT:    movapd %xmm1, %xmm4
4851; SSE41-NEXT:    blendvpd %xmm0, %xmm12, %xmm4
4852; SSE41-NEXT:    movdqa %xmm10, %xmm0
4853; SSE41-NEXT:    pxor %xmm2, %xmm0
4854; SSE41-NEXT:    movdqa %xmm7, %xmm5
4855; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
4856; SSE41-NEXT:    movdqa %xmm7, %xmm3
4857; SSE41-NEXT:    pcmpgtd %xmm0, %xmm3
4858; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4859; SSE41-NEXT:    pand %xmm5, %xmm0
4860; SSE41-NEXT:    por %xmm3, %xmm0
4861; SSE41-NEXT:    movapd %xmm1, %xmm5
4862; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
4863; SSE41-NEXT:    movdqa %xmm9, %xmm0
4864; SSE41-NEXT:    pxor %xmm2, %xmm0
4865; SSE41-NEXT:    movdqa %xmm7, %xmm3
4866; SSE41-NEXT:    pcmpeqd %xmm0, %xmm3
4867; SSE41-NEXT:    pcmpgtd %xmm0, %xmm7
4868; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
4869; SSE41-NEXT:    pand %xmm3, %xmm0
4870; SSE41-NEXT:    por %xmm7, %xmm0
4871; SSE41-NEXT:    blendvpd %xmm0, %xmm9, %xmm1
4872; SSE41-NEXT:    xorpd %xmm9, %xmm9
4873; SSE41-NEXT:    movapd %xmm1, %xmm3
4874; SSE41-NEXT:    xorpd %xmm2, %xmm3
4875; SSE41-NEXT:    movapd %xmm3, %xmm7
4876; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
4877; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
4878; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4879; SSE41-NEXT:    pand %xmm7, %xmm0
4880; SSE41-NEXT:    por %xmm3, %xmm0
4881; SSE41-NEXT:    pxor %xmm3, %xmm3
4882; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
4883; SSE41-NEXT:    movapd %xmm5, %xmm1
4884; SSE41-NEXT:    xorpd %xmm2, %xmm1
4885; SSE41-NEXT:    movapd %xmm1, %xmm7
4886; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
4887; SSE41-NEXT:    pcmpgtd %xmm2, %xmm1
4888; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
4889; SSE41-NEXT:    pand %xmm7, %xmm0
4890; SSE41-NEXT:    por %xmm1, %xmm0
4891; SSE41-NEXT:    pxor %xmm1, %xmm1
4892; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
4893; SSE41-NEXT:    packusdw %xmm3, %xmm1
4894; SSE41-NEXT:    movapd %xmm4, %xmm3
4895; SSE41-NEXT:    xorpd %xmm2, %xmm3
4896; SSE41-NEXT:    movapd %xmm3, %xmm5
4897; SSE41-NEXT:    pcmpeqd %xmm2, %xmm5
4898; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
4899; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4900; SSE41-NEXT:    pand %xmm5, %xmm0
4901; SSE41-NEXT:    por %xmm3, %xmm0
4902; SSE41-NEXT:    pxor %xmm3, %xmm3
4903; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
4904; SSE41-NEXT:    movapd %xmm15, %xmm4
4905; SSE41-NEXT:    xorpd %xmm2, %xmm4
4906; SSE41-NEXT:    movapd %xmm4, %xmm5
4907; SSE41-NEXT:    pcmpeqd %xmm2, %xmm5
4908; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
4909; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4910; SSE41-NEXT:    pand %xmm5, %xmm0
4911; SSE41-NEXT:    por %xmm4, %xmm0
4912; SSE41-NEXT:    pxor %xmm4, %xmm4
4913; SSE41-NEXT:    blendvpd %xmm0, %xmm15, %xmm4
4914; SSE41-NEXT:    packusdw %xmm3, %xmm4
4915; SSE41-NEXT:    packusdw %xmm4, %xmm1
4916; SSE41-NEXT:    movapd %xmm6, %xmm3
4917; SSE41-NEXT:    xorpd %xmm2, %xmm3
4918; SSE41-NEXT:    movapd %xmm3, %xmm4
4919; SSE41-NEXT:    pcmpeqd %xmm2, %xmm4
4920; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
4921; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4922; SSE41-NEXT:    pand %xmm4, %xmm0
4923; SSE41-NEXT:    por %xmm3, %xmm0
4924; SSE41-NEXT:    pxor %xmm4, %xmm4
4925; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm4
4926; SSE41-NEXT:    movapd %xmm13, %xmm3
4927; SSE41-NEXT:    xorpd %xmm2, %xmm3
4928; SSE41-NEXT:    movapd %xmm3, %xmm5
4929; SSE41-NEXT:    pcmpeqd %xmm2, %xmm5
4930; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
4931; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
4932; SSE41-NEXT:    pand %xmm5, %xmm0
4933; SSE41-NEXT:    por %xmm3, %xmm0
4934; SSE41-NEXT:    pxor %xmm3, %xmm3
4935; SSE41-NEXT:    blendvpd %xmm0, %xmm13, %xmm3
4936; SSE41-NEXT:    packusdw %xmm4, %xmm3
4937; SSE41-NEXT:    movapd %xmm11, %xmm4
4938; SSE41-NEXT:    xorpd %xmm2, %xmm4
4939; SSE41-NEXT:    movapd %xmm4, %xmm5
4940; SSE41-NEXT:    pcmpeqd %xmm2, %xmm5
4941; SSE41-NEXT:    pcmpgtd %xmm2, %xmm4
4942; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
4943; SSE41-NEXT:    pand %xmm5, %xmm0
4944; SSE41-NEXT:    por %xmm4, %xmm0
4945; SSE41-NEXT:    pxor %xmm4, %xmm4
4946; SSE41-NEXT:    blendvpd %xmm0, %xmm11, %xmm4
4947; SSE41-NEXT:    movapd %xmm8, %xmm5
4948; SSE41-NEXT:    xorpd %xmm2, %xmm5
4949; SSE41-NEXT:    movapd %xmm5, %xmm6
4950; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
4951; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
4952; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
4953; SSE41-NEXT:    pand %xmm6, %xmm0
4954; SSE41-NEXT:    por %xmm5, %xmm0
4955; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm9
4956; SSE41-NEXT:    packusdw %xmm4, %xmm9
4957; SSE41-NEXT:    packusdw %xmm9, %xmm3
4958; SSE41-NEXT:    packuswb %xmm3, %xmm1
4959; SSE41-NEXT:    movdqa %xmm1, %xmm0
4960; SSE41-NEXT:    retq
4961;
4962; AVX1-LABEL: trunc_packus_v16i64_v16i8:
4963; AVX1:       # %bb.0:
4964; AVX1-NEXT:    vmovdqa 96(%rdi), %xmm0
4965; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255]
4966; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm1
4967; AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm8
4968; AVX1-NEXT:    vmovdqa 112(%rdi), %xmm1
4969; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
4970; AVX1-NEXT:    vblendvpd %xmm3, %xmm1, %xmm2, %xmm9
4971; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm3
4972; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
4973; AVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm3
4974; AVX1-NEXT:    vmovdqa 80(%rdi), %xmm4
4975; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm5
4976; AVX1-NEXT:    vblendvpd %xmm5, %xmm4, %xmm2, %xmm4
4977; AVX1-NEXT:    vmovdqa (%rdi), %xmm5
4978; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm6
4979; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm7
4980; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm0
4981; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm2, %xmm1
4982; AVX1-NEXT:    vblendvpd %xmm1, %xmm7, %xmm2, %xmm1
4983; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm7
4984; AVX1-NEXT:    vblendvpd %xmm7, %xmm0, %xmm2, %xmm0
4985; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm2, %xmm7
4986; AVX1-NEXT:    vblendvpd %xmm7, %xmm5, %xmm2, %xmm5
4987; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm2, %xmm7
4988; AVX1-NEXT:    vblendvpd %xmm7, %xmm6, %xmm2, %xmm2
4989; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
4990; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm2, %xmm7
4991; AVX1-NEXT:    vpand %xmm2, %xmm7, %xmm2
4992; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm5, %xmm7
4993; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm5
4994; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
4995; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm0, %xmm5
4996; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm0
4997; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm1, %xmm5
4998; AVX1-NEXT:    vpand %xmm1, %xmm5, %xmm1
4999; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
5000; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
5001; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm4, %xmm1
5002; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
5003; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm3, %xmm2
5004; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
5005; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
5006; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm9, %xmm2
5007; AVX1-NEXT:    vpand %xmm2, %xmm9, %xmm2
5008; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm8, %xmm3
5009; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
5010; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
5011; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
5012; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5013; AVX1-NEXT:    retq
5014;
5015; AVX2-LABEL: trunc_packus_v16i64_v16i8:
5016; AVX2:       # %bb.0:
5017; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5018; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
5019; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm2
5020; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
5021; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
5022; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm4, %ymm5
5023; AVX2-NEXT:    vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
5024; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm5
5025; AVX2-NEXT:    vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
5026; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm4, %ymm5
5027; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
5028; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm4, %ymm5
5029; AVX2-NEXT:    vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
5030; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
5031; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm1, %ymm5
5032; AVX2-NEXT:    vpand %ymm1, %ymm5, %ymm1
5033; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm5
5034; AVX2-NEXT:    vpand %ymm0, %ymm5, %ymm0
5035; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
5036; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm1
5037; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
5038; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm2, %ymm3
5039; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm2
5040; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
5041; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
5042; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5043; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
5044; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5045; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5046; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
5047; AVX2-NEXT:    vzeroupper
5048; AVX2-NEXT:    retq
5049;
5050; AVX512-LABEL: trunc_packus_v16i64_v16i8:
5051; AVX512:       # %bb.0:
5052; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5053; AVX512-NEXT:    vpmaxsq 64(%rdi), %zmm0, %zmm1
5054; AVX512-NEXT:    vpmovusqb %zmm1, %xmm1
5055; AVX512-NEXT:    vpmaxsq (%rdi), %zmm0, %zmm0
5056; AVX512-NEXT:    vpmovusqb %zmm0, %xmm0
5057; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5058; AVX512-NEXT:    vzeroupper
5059; AVX512-NEXT:    retq
5060;
5061; SKX-LABEL: trunc_packus_v16i64_v16i8:
5062; SKX:       # %bb.0:
5063; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5064; SKX-NEXT:    vpmaxsq 96(%rdi), %ymm0, %ymm1
5065; SKX-NEXT:    vpmovusqb %ymm1, %xmm1
5066; SKX-NEXT:    vpmaxsq 64(%rdi), %ymm0, %ymm2
5067; SKX-NEXT:    vpmovusqb %ymm2, %xmm2
5068; SKX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5069; SKX-NEXT:    vpmaxsq 32(%rdi), %ymm0, %ymm2
5070; SKX-NEXT:    vpmovusqb %ymm2, %xmm2
5071; SKX-NEXT:    vpmaxsq (%rdi), %ymm0, %ymm0
5072; SKX-NEXT:    vpmovusqb %ymm0, %xmm0
5073; SKX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
5074; SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5075; SKX-NEXT:    vzeroupper
5076; SKX-NEXT:    retq
5077  %a0 = load <16 x i64>, ptr %p0
5078  %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
5079  %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
5080  %3 = icmp sgt <16 x i64> %2, zeroinitializer
5081  %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
5082  %5 = trunc <16 x i64> %4 to <16 x i8>
5083  ret <16 x i8> %5
5084}
5085
5086define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" {
5087; SSE2-LABEL: trunc_packus_v4i32_v4i8:
5088; SSE2:       # %bb.0:
5089; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
5090; SSE2-NEXT:    movdqa %xmm1, %xmm2
5091; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
5092; SSE2-NEXT:    pand %xmm2, %xmm0
5093; SSE2-NEXT:    pandn %xmm1, %xmm2
5094; SSE2-NEXT:    por %xmm2, %xmm0
5095; SSE2-NEXT:    pxor %xmm1, %xmm1
5096; SSE2-NEXT:    movdqa %xmm0, %xmm2
5097; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
5098; SSE2-NEXT:    pand %xmm2, %xmm0
5099; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
5100; SSE2-NEXT:    packuswb %xmm0, %xmm0
5101; SSE2-NEXT:    packuswb %xmm0, %xmm0
5102; SSE2-NEXT:    retq
5103;
5104; SSSE3-LABEL: trunc_packus_v4i32_v4i8:
5105; SSSE3:       # %bb.0:
5106; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
5107; SSSE3-NEXT:    movdqa %xmm1, %xmm2
5108; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
5109; SSSE3-NEXT:    pand %xmm2, %xmm0
5110; SSSE3-NEXT:    pandn %xmm1, %xmm2
5111; SSSE3-NEXT:    por %xmm2, %xmm0
5112; SSSE3-NEXT:    pxor %xmm1, %xmm1
5113; SSSE3-NEXT:    movdqa %xmm0, %xmm2
5114; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
5115; SSSE3-NEXT:    pand %xmm2, %xmm0
5116; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
5117; SSSE3-NEXT:    retq
5118;
5119; SSE41-LABEL: trunc_packus_v4i32_v4i8:
5120; SSE41:       # %bb.0:
5121; SSE41-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
5122; SSE41-NEXT:    pxor %xmm1, %xmm1
5123; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
5124; SSE41-NEXT:    packusdw %xmm0, %xmm0
5125; SSE41-NEXT:    packuswb %xmm0, %xmm0
5126; SSE41-NEXT:    retq
5127;
5128; AVX1-LABEL: trunc_packus_v4i32_v4i8:
5129; AVX1:       # %bb.0:
5130; AVX1-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5131; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5132; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5133; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
5134; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5135; AVX1-NEXT:    retq
5136;
5137; AVX2-LABEL: trunc_packus_v4i32_v4i8:
5138; AVX2:       # %bb.0:
5139; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
5140; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
5141; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5142; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5143; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
5144; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5145; AVX2-NEXT:    retq
5146;
5147; AVX512F-LABEL: trunc_packus_v4i32_v4i8:
5148; AVX512F:       # %bb.0:
5149; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5150; AVX512F-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5151; AVX512F-NEXT:    vpmovusdb %zmm0, %xmm0
5152; AVX512F-NEXT:    vzeroupper
5153; AVX512F-NEXT:    retq
5154;
5155; AVX512VL-LABEL: trunc_packus_v4i32_v4i8:
5156; AVX512VL:       # %bb.0:
5157; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5158; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5159; AVX512VL-NEXT:    vpmovusdb %xmm0, %xmm0
5160; AVX512VL-NEXT:    retq
5161;
5162; AVX512BW-LABEL: trunc_packus_v4i32_v4i8:
5163; AVX512BW:       # %bb.0:
5164; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5165; AVX512BW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5166; AVX512BW-NEXT:    vpmovusdb %zmm0, %xmm0
5167; AVX512BW-NEXT:    vzeroupper
5168; AVX512BW-NEXT:    retq
5169;
5170; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8:
5171; AVX512BWVL:       # %bb.0:
5172; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5173; AVX512BWVL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5174; AVX512BWVL-NEXT:    vpmovusdb %xmm0, %xmm0
5175; AVX512BWVL-NEXT:    retq
5176;
5177; SKX-LABEL: trunc_packus_v4i32_v4i8:
5178; SKX:       # %bb.0:
5179; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5180; SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5181; SKX-NEXT:    vpmovusdb %xmm0, %xmm0
5182; SKX-NEXT:    retq
5183  %1 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
5184  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
5185  %3 = icmp sgt <4 x i32> %2, zeroinitializer
5186  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
5187  %5 = trunc <4 x i32> %4 to <4 x i8>
5188  ret <4 x i8> %5
5189}
5190
5191define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
5192; SSE2-LABEL: trunc_packus_v4i32_v4i8_store:
5193; SSE2:       # %bb.0:
5194; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
5195; SSE2-NEXT:    movdqa %xmm1, %xmm2
5196; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
5197; SSE2-NEXT:    pand %xmm2, %xmm0
5198; SSE2-NEXT:    pandn %xmm1, %xmm2
5199; SSE2-NEXT:    por %xmm0, %xmm2
5200; SSE2-NEXT:    pxor %xmm0, %xmm0
5201; SSE2-NEXT:    movdqa %xmm2, %xmm1
5202; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
5203; SSE2-NEXT:    pand %xmm2, %xmm1
5204; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
5205; SSE2-NEXT:    packuswb %xmm1, %xmm1
5206; SSE2-NEXT:    packuswb %xmm1, %xmm1
5207; SSE2-NEXT:    movd %xmm1, (%rdi)
5208; SSE2-NEXT:    retq
5209;
5210; SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
5211; SSSE3:       # %bb.0:
5212; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
5213; SSSE3-NEXT:    movdqa %xmm1, %xmm2
5214; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
5215; SSSE3-NEXT:    pand %xmm2, %xmm0
5216; SSSE3-NEXT:    pandn %xmm1, %xmm2
5217; SSSE3-NEXT:    por %xmm0, %xmm2
5218; SSSE3-NEXT:    pxor %xmm0, %xmm0
5219; SSSE3-NEXT:    movdqa %xmm2, %xmm1
5220; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
5221; SSSE3-NEXT:    pand %xmm2, %xmm1
5222; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
5223; SSSE3-NEXT:    movd %xmm1, (%rdi)
5224; SSSE3-NEXT:    retq
5225;
5226; SSE41-LABEL: trunc_packus_v4i32_v4i8_store:
5227; SSE41:       # %bb.0:
5228; SSE41-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
5229; SSE41-NEXT:    pxor %xmm1, %xmm1
5230; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
5231; SSE41-NEXT:    packusdw %xmm1, %xmm1
5232; SSE41-NEXT:    packuswb %xmm1, %xmm1
5233; SSE41-NEXT:    movd %xmm1, (%rdi)
5234; SSE41-NEXT:    retq
5235;
5236; AVX1-LABEL: trunc_packus_v4i32_v4i8_store:
5237; AVX1:       # %bb.0:
5238; AVX1-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5239; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5240; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5241; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
5242; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5243; AVX1-NEXT:    vmovd %xmm0, (%rdi)
5244; AVX1-NEXT:    retq
5245;
5246; AVX2-LABEL: trunc_packus_v4i32_v4i8_store:
5247; AVX2:       # %bb.0:
5248; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
5249; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
5250; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5251; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5252; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
5253; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5254; AVX2-NEXT:    vmovd %xmm0, (%rdi)
5255; AVX2-NEXT:    retq
5256;
5257; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store:
5258; AVX512F:       # %bb.0:
5259; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5260; AVX512F-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5261; AVX512F-NEXT:    vpmovusdb %zmm0, %xmm0
5262; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
5263; AVX512F-NEXT:    vzeroupper
5264; AVX512F-NEXT:    retq
5265;
5266; AVX512VL-LABEL: trunc_packus_v4i32_v4i8_store:
5267; AVX512VL:       # %bb.0:
5268; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5269; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5270; AVX512VL-NEXT:    vpmovusdb %xmm0, (%rdi)
5271; AVX512VL-NEXT:    retq
5272;
5273; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store:
5274; AVX512BW:       # %bb.0:
5275; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5276; AVX512BW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5277; AVX512BW-NEXT:    vpmovusdb %zmm0, %xmm0
5278; AVX512BW-NEXT:    vmovd %xmm0, (%rdi)
5279; AVX512BW-NEXT:    vzeroupper
5280; AVX512BW-NEXT:    retq
5281;
5282; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8_store:
5283; AVX512BWVL:       # %bb.0:
5284; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5285; AVX512BWVL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5286; AVX512BWVL-NEXT:    vpmovusdb %xmm0, (%rdi)
5287; AVX512BWVL-NEXT:    retq
5288;
5289; SKX-LABEL: trunc_packus_v4i32_v4i8_store:
5290; SKX:       # %bb.0:
5291; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5292; SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
5293; SKX-NEXT:    vpmovusdb %xmm0, (%rdi)
5294; SKX-NEXT:    retq
5295  %1 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
5296  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
5297  %3 = icmp sgt <4 x i32> %2, zeroinitializer
5298  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
5299  %5 = trunc <4 x i32> %4 to <4 x i8>
5300  store <4 x i8> %5, ptr%p1
5301  ret void
5302}
5303
5304define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
5305; SSE-LABEL: trunc_packus_v8i32_v8i8:
5306; SSE:       # %bb.0:
5307; SSE-NEXT:    packssdw %xmm1, %xmm0
5308; SSE-NEXT:    packuswb %xmm0, %xmm0
5309; SSE-NEXT:    retq
5310;
5311; AVX1-LABEL: trunc_packus_v8i32_v8i8:
5312; AVX1:       # %bb.0:
5313; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
5314; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5315; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5316; AVX1-NEXT:    vzeroupper
5317; AVX1-NEXT:    retq
5318;
5319; AVX2-LABEL: trunc_packus_v8i32_v8i8:
5320; AVX2:       # %bb.0:
5321; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5322; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5323; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5324; AVX2-NEXT:    vzeroupper
5325; AVX2-NEXT:    retq
5326;
5327; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
5328; AVX512F:       # %bb.0:
5329; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
5330; AVX512F-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5331; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5332; AVX512F-NEXT:    vzeroupper
5333; AVX512F-NEXT:    retq
5334;
5335; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
5336; AVX512VL:       # %bb.0:
5337; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5338; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
5339; AVX512VL-NEXT:    vpmovusdb %ymm0, %xmm0
5340; AVX512VL-NEXT:    vzeroupper
5341; AVX512VL-NEXT:    retq
5342;
5343; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
5344; AVX512BW:       # %bb.0:
5345; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
5346; AVX512BW-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5347; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5348; AVX512BW-NEXT:    vzeroupper
5349; AVX512BW-NEXT:    retq
5350;
5351; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
5352; AVX512BWVL:       # %bb.0:
5353; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5354; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
5355; AVX512BWVL-NEXT:    vpmovusdb %ymm0, %xmm0
5356; AVX512BWVL-NEXT:    vzeroupper
5357; AVX512BWVL-NEXT:    retq
5358;
5359; SKX-LABEL: trunc_packus_v8i32_v8i8:
5360; SKX:       # %bb.0:
5361; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5362; SKX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
5363; SKX-NEXT:    vpmovusdb %ymm0, %xmm0
5364; SKX-NEXT:    vzeroupper
5365; SKX-NEXT:    retq
5366  %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5367  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5368  %3 = icmp sgt <8 x i32> %2, zeroinitializer
5369  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
5370  %5 = trunc <8 x i32> %4 to <8 x i8>
5371  ret <8 x i8> %5
5372}
5373
5374define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) {
5375; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
5376; SSE:       # %bb.0:
5377; SSE-NEXT:    packssdw %xmm1, %xmm0
5378; SSE-NEXT:    packuswb %xmm0, %xmm0
5379; SSE-NEXT:    movq %xmm0, (%rdi)
5380; SSE-NEXT:    retq
5381;
5382; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
5383; AVX1:       # %bb.0:
5384; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
5385; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5386; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5387; AVX1-NEXT:    vmovq %xmm0, (%rdi)
5388; AVX1-NEXT:    vzeroupper
5389; AVX1-NEXT:    retq
5390;
5391; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
5392; AVX2:       # %bb.0:
5393; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5394; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5395; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5396; AVX2-NEXT:    vmovq %xmm0, (%rdi)
5397; AVX2-NEXT:    vzeroupper
5398; AVX2-NEXT:    retq
5399;
5400; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store:
5401; AVX512F:       # %bb.0:
5402; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
5403; AVX512F-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5404; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5405; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
5406; AVX512F-NEXT:    vzeroupper
5407; AVX512F-NEXT:    retq
5408;
5409; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
5410; AVX512VL:       # %bb.0:
5411; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5412; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
5413; AVX512VL-NEXT:    vpmovusdb %ymm0, (%rdi)
5414; AVX512VL-NEXT:    vzeroupper
5415; AVX512VL-NEXT:    retq
5416;
5417; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store:
5418; AVX512BW:       # %bb.0:
5419; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
5420; AVX512BW-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
5421; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5422; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
5423; AVX512BW-NEXT:    vzeroupper
5424; AVX512BW-NEXT:    retq
5425;
5426; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
5427; AVX512BWVL:       # %bb.0:
5428; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5429; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
5430; AVX512BWVL-NEXT:    vpmovusdb %ymm0, (%rdi)
5431; AVX512BWVL-NEXT:    vzeroupper
5432; AVX512BWVL-NEXT:    retq
5433;
5434; SKX-LABEL: trunc_packus_v8i32_v8i8_store:
5435; SKX:       # %bb.0:
5436; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5437; SKX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
5438; SKX-NEXT:    vpmovusdb %ymm0, (%rdi)
5439; SKX-NEXT:    vzeroupper
5440; SKX-NEXT:    retq
5441  %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5442  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5443  %3 = icmp sgt <8 x i32> %2, zeroinitializer
5444  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
5445  %5 = trunc <8 x i32> %4 to <8 x i8>
5446  store <8 x i8> %5, ptr%p1
5447  ret void
5448}
5449
5450define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="256" {
5451; SSE-LABEL: trunc_packus_v16i32_v16i8:
5452; SSE:       # %bb.0:
5453; SSE-NEXT:    movdqa (%rdi), %xmm0
5454; SSE-NEXT:    movdqa 32(%rdi), %xmm1
5455; SSE-NEXT:    packssdw 48(%rdi), %xmm1
5456; SSE-NEXT:    packssdw 16(%rdi), %xmm0
5457; SSE-NEXT:    packuswb %xmm1, %xmm0
5458; SSE-NEXT:    retq
5459;
5460; AVX1-LABEL: trunc_packus_v16i32_v16i8:
5461; AVX1:       # %bb.0:
5462; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
5463; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
5464; AVX1-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
5465; AVX1-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
5466; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5467; AVX1-NEXT:    retq
5468;
5469; AVX2-LABEL: trunc_packus_v16i32_v16i8:
5470; AVX2:       # %bb.0:
5471; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5472; AVX2-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
5473; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5474; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5475; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
5476; AVX2-NEXT:    vzeroupper
5477; AVX2-NEXT:    retq
5478;
5479; AVX512-LABEL: trunc_packus_v16i32_v16i8:
5480; AVX512:       # %bb.0:
5481; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5482; AVX512-NEXT:    vpmaxsd (%rdi), %zmm0, %zmm0
5483; AVX512-NEXT:    vpmovusdb %zmm0, %xmm0
5484; AVX512-NEXT:    vzeroupper
5485; AVX512-NEXT:    retq
5486;
5487; SKX-LABEL: trunc_packus_v16i32_v16i8:
5488; SKX:       # %bb.0:
5489; SKX-NEXT:    vmovdqa (%rdi), %ymm0
5490; SKX-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
5491; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5492; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
5493; SKX-NEXT:    vzeroupper
5494; SKX-NEXT:    retq
5495  %a0 = load <16 x i32>, ptr %p0
5496  %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5497  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5498  %3 = icmp sgt <16 x i32> %2, zeroinitializer
5499  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5500  %5 = trunc <16 x i32> %4 to <16 x i8>
5501  ret <16 x i8> %5
5502}
5503
5504define void @trunc_packus_v16i32_v16i8_store(ptr %p0, ptr %p1) "min-legal-vector-width"="256" {
5505; SSE-LABEL: trunc_packus_v16i32_v16i8_store:
5506; SSE:       # %bb.0:
5507; SSE-NEXT:    movdqa (%rdi), %xmm0
5508; SSE-NEXT:    movdqa 32(%rdi), %xmm1
5509; SSE-NEXT:    packssdw 48(%rdi), %xmm1
5510; SSE-NEXT:    packssdw 16(%rdi), %xmm0
5511; SSE-NEXT:    packuswb %xmm1, %xmm0
5512; SSE-NEXT:    movdqa %xmm0, (%rsi)
5513; SSE-NEXT:    retq
5514;
5515; AVX1-LABEL: trunc_packus_v16i32_v16i8_store:
5516; AVX1:       # %bb.0:
5517; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
5518; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
5519; AVX1-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
5520; AVX1-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
5521; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5522; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
5523; AVX1-NEXT:    retq
5524;
5525; AVX2-LABEL: trunc_packus_v16i32_v16i8_store:
5526; AVX2:       # %bb.0:
5527; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5528; AVX2-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
5529; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5530; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5531; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
5532; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
5533; AVX2-NEXT:    vzeroupper
5534; AVX2-NEXT:    retq
5535;
5536; AVX512-LABEL: trunc_packus_v16i32_v16i8_store:
5537; AVX512:       # %bb.0:
5538; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5539; AVX512-NEXT:    vpmaxsd (%rdi), %zmm0, %zmm0
5540; AVX512-NEXT:    vpmovusdb %zmm0, (%rsi)
5541; AVX512-NEXT:    vzeroupper
5542; AVX512-NEXT:    retq
5543;
5544; SKX-LABEL: trunc_packus_v16i32_v16i8_store:
5545; SKX:       # %bb.0:
5546; SKX-NEXT:    vmovdqa (%rdi), %ymm0
5547; SKX-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
5548; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5549; SKX-NEXT:    vpmovuswb %ymm0, (%rsi)
5550; SKX-NEXT:    vzeroupper
5551; SKX-NEXT:    retq
5552  %a = load <16 x i32>, ptr %p0
5553  %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5554  %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5555  %d = icmp sgt <16 x i32> %c, zeroinitializer
5556  %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
5557  %f = trunc <16 x i32> %e to <16 x i8>
5558  store <16 x i8> %f, ptr %p1
5559  ret void
5560}
5561
5562define <8 x i8> @trunc_packus_v8i16_v8i8(<8 x i16> %a0) {
5563; SSE-LABEL: trunc_packus_v8i16_v8i8:
5564; SSE:       # %bb.0:
5565; SSE-NEXT:    packuswb %xmm0, %xmm0
5566; SSE-NEXT:    retq
5567;
5568; AVX-LABEL: trunc_packus_v8i16_v8i8:
5569; AVX:       # %bb.0:
5570; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5571; AVX-NEXT:    retq
5572;
5573; AVX512-LABEL: trunc_packus_v8i16_v8i8:
5574; AVX512:       # %bb.0:
5575; AVX512-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5576; AVX512-NEXT:    retq
5577;
5578; SKX-LABEL: trunc_packus_v8i16_v8i8:
5579; SKX:       # %bb.0:
5580; SKX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5581; SKX-NEXT:    retq
5582  %1 = icmp slt <8 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5583  %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5584  %3 = icmp sgt <8 x i16> %2, zeroinitializer
5585  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
5586  %5 = trunc <8 x i16> %4 to <8 x i8>
5587  ret <8 x i8> %5
5588}
5589
5590define void @trunc_packus_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) {
5591; SSE-LABEL: trunc_packus_v8i16_v8i8_store:
5592; SSE:       # %bb.0:
5593; SSE-NEXT:    packuswb %xmm0, %xmm0
5594; SSE-NEXT:    movq %xmm0, (%rdi)
5595; SSE-NEXT:    retq
5596;
5597; AVX-LABEL: trunc_packus_v8i16_v8i8_store:
5598; AVX:       # %bb.0:
5599; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5600; AVX-NEXT:    vmovq %xmm0, (%rdi)
5601; AVX-NEXT:    retq
5602;
5603; AVX512F-LABEL: trunc_packus_v8i16_v8i8_store:
5604; AVX512F:       # %bb.0:
5605; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5606; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
5607; AVX512F-NEXT:    retq
5608;
5609; AVX512VL-LABEL: trunc_packus_v8i16_v8i8_store:
5610; AVX512VL:       # %bb.0:
5611; AVX512VL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5612; AVX512VL-NEXT:    vmovq %xmm0, (%rdi)
5613; AVX512VL-NEXT:    retq
5614;
5615; AVX512BW-LABEL: trunc_packus_v8i16_v8i8_store:
5616; AVX512BW:       # %bb.0:
5617; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
5618; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
5619; AVX512BW-NEXT:    retq
5620;
5621; AVX512BWVL-LABEL: trunc_packus_v8i16_v8i8_store:
5622; AVX512BWVL:       # %bb.0:
5623; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5624; AVX512BWVL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
5625; AVX512BWVL-NEXT:    vpmovuswb %xmm0, (%rdi)
5626; AVX512BWVL-NEXT:    retq
5627;
5628; SKX-LABEL: trunc_packus_v8i16_v8i8_store:
5629; SKX:       # %bb.0:
5630; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5631; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
5632; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
5633; SKX-NEXT:    retq
5634  %1 = icmp slt <8 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5635  %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5636  %3 = icmp sgt <8 x i16> %2, zeroinitializer
5637  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
5638  %5 = trunc <8 x i16> %4 to <8 x i8>
5639  store <8 x i8> %5, ptr%p1
5640  ret void
5641}
5642
5643define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
5644; SSE-LABEL: trunc_packus_v16i16_v16i8:
5645; SSE:       # %bb.0:
5646; SSE-NEXT:    packuswb %xmm1, %xmm0
5647; SSE-NEXT:    retq
5648;
5649; AVX1-LABEL: trunc_packus_v16i16_v16i8:
5650; AVX1:       # %bb.0:
5651; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
5652; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5653; AVX1-NEXT:    vzeroupper
5654; AVX1-NEXT:    retq
5655;
5656; AVX2-LABEL: trunc_packus_v16i16_v16i8:
5657; AVX2:       # %bb.0:
5658; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5659; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5660; AVX2-NEXT:    vzeroupper
5661; AVX2-NEXT:    retq
5662;
5663; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
5664; AVX512F:       # %bb.0:
5665; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
5666; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5667; AVX512F-NEXT:    vzeroupper
5668; AVX512F-NEXT:    retq
5669;
5670; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
5671; AVX512VL:       # %bb.0:
5672; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
5673; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5674; AVX512VL-NEXT:    vzeroupper
5675; AVX512VL-NEXT:    retq
5676;
5677; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
5678; AVX512BW:       # %bb.0:
5679; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
5680; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5681; AVX512BW-NEXT:    vzeroupper
5682; AVX512BW-NEXT:    retq
5683;
5684; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
5685; AVX512BWVL:       # %bb.0:
5686; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5687; AVX512BWVL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
5688; AVX512BWVL-NEXT:    vpmovuswb %ymm0, %xmm0
5689; AVX512BWVL-NEXT:    vzeroupper
5690; AVX512BWVL-NEXT:    retq
5691;
5692; SKX-LABEL: trunc_packus_v16i16_v16i8:
5693; SKX:       # %bb.0:
5694; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
5695; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
5696; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
5697; SKX-NEXT:    vzeroupper
5698; SKX-NEXT:    retq
5699  %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5700  %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5701  %3 = icmp sgt <16 x i16> %2, zeroinitializer
5702  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
5703  %5 = trunc <16 x i16> %4 to <16 x i8>
5704  ret <16 x i8> %5
5705}
5706
5707define <32 x i8> @trunc_packus_v32i16_v32i8(ptr %p0) "min-legal-vector-width"="256" {
5708; SSE-LABEL: trunc_packus_v32i16_v32i8:
5709; SSE:       # %bb.0:
5710; SSE-NEXT:    movdqa (%rdi), %xmm0
5711; SSE-NEXT:    movdqa 32(%rdi), %xmm1
5712; SSE-NEXT:    packuswb 16(%rdi), %xmm0
5713; SSE-NEXT:    packuswb 48(%rdi), %xmm1
5714; SSE-NEXT:    retq
5715;
5716; AVX1-LABEL: trunc_packus_v32i16_v32i8:
5717; AVX1:       # %bb.0:
5718; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
5719; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
5720; AVX1-NEXT:    vpackuswb 48(%rdi), %xmm1, %xmm1
5721; AVX1-NEXT:    vpackuswb 16(%rdi), %xmm0, %xmm0
5722; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
5723; AVX1-NEXT:    retq
5724;
5725; AVX2-LABEL: trunc_packus_v32i16_v32i8:
5726; AVX2:       # %bb.0:
5727; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5728; AVX2-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
5729; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5730; AVX2-NEXT:    retq
5731;
5732; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
5733; AVX512F:       # %bb.0:
5734; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
5735; AVX512F-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
5736; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5737; AVX512F-NEXT:    retq
5738;
5739; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
5740; AVX512VL:       # %bb.0:
5741; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
5742; AVX512VL-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
5743; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5744; AVX512VL-NEXT:    retq
5745;
5746; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
5747; AVX512BW:       # %bb.0:
5748; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5749; AVX512BW-NEXT:    vpmaxsw (%rdi), %zmm0, %zmm0
5750; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
5751; AVX512BW-NEXT:    retq
5752;
5753; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
5754; AVX512BWVL:       # %bb.0:
5755; AVX512BWVL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5756; AVX512BWVL-NEXT:    vpmaxsw (%rdi), %zmm0, %zmm0
5757; AVX512BWVL-NEXT:    vpmovuswb %zmm0, %ymm0
5758; AVX512BWVL-NEXT:    retq
5759;
5760; SKX-LABEL: trunc_packus_v32i16_v32i8:
5761; SKX:       # %bb.0:
5762; SKX-NEXT:    vmovdqa (%rdi), %ymm0
5763; SKX-NEXT:    vpackuswb 32(%rdi), %ymm0, %ymm0
5764; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5765; SKX-NEXT:    retq
5766  %a0 = load <32 x i16>, ptr %p0
5767  %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5768  %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
5769  %3 = icmp sgt <32 x i16> %2, zeroinitializer
5770  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5771  %5 = trunc <32 x i16> %4 to <32 x i8>
5772  ret <32 x i8> %5
5773}
5774
5775define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256" {
5776; SSE-LABEL: trunc_packus_v32i32_v32i8:
5777; SSE:       # %bb.0:
5778; SSE-NEXT:    movdqa (%rdi), %xmm0
5779; SSE-NEXT:    movdqa 32(%rdi), %xmm2
5780; SSE-NEXT:    movdqa 64(%rdi), %xmm1
5781; SSE-NEXT:    movdqa 96(%rdi), %xmm3
5782; SSE-NEXT:    packssdw 48(%rdi), %xmm2
5783; SSE-NEXT:    packssdw 16(%rdi), %xmm0
5784; SSE-NEXT:    packuswb %xmm2, %xmm0
5785; SSE-NEXT:    packssdw 112(%rdi), %xmm3
5786; SSE-NEXT:    packssdw 80(%rdi), %xmm1
5787; SSE-NEXT:    packuswb %xmm3, %xmm1
5788; SSE-NEXT:    retq
5789;
5790; AVX1-LABEL: trunc_packus_v32i32_v32i8:
5791; AVX1:       # %bb.0:
5792; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
5793; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm1
5794; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm2
5795; AVX1-NEXT:    vmovdqa 96(%rdi), %xmm3
5796; AVX1-NEXT:    vpackssdw 112(%rdi), %xmm3, %xmm3
5797; AVX1-NEXT:    vpackssdw 80(%rdi), %xmm2, %xmm2
5798; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
5799; AVX1-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
5800; AVX1-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
5801; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5802; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
5803; AVX1-NEXT:    retq
5804;
5805; AVX2-LABEL: trunc_packus_v32i32_v32i8:
5806; AVX2:       # %bb.0:
5807; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
5808; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm1
5809; AVX2-NEXT:    vpackssdw 96(%rdi), %ymm1, %ymm1
5810; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
5811; AVX2-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
5812; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5813; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
5814; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5815; AVX2-NEXT:    retq
5816;
5817; AVX512-LABEL: trunc_packus_v32i32_v32i8:
5818; AVX512:       # %bb.0:
5819; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
5820; AVX512-NEXT:    vpmaxsd (%rdi), %zmm0, %zmm1
5821; AVX512-NEXT:    vpmovusdb %zmm1, %xmm1
5822; AVX512-NEXT:    vpmaxsd 64(%rdi), %zmm0, %zmm0
5823; AVX512-NEXT:    vpmovusdb %zmm0, %xmm0
5824; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
5825; AVX512-NEXT:    retq
5826;
5827; SKX-LABEL: trunc_packus_v32i32_v32i8:
5828; SKX:       # %bb.0:
5829; SKX-NEXT:    vmovdqa (%rdi), %ymm0
5830; SKX-NEXT:    vmovdqa 64(%rdi), %ymm1
5831; SKX-NEXT:    vpackssdw 96(%rdi), %ymm1, %ymm1
5832; SKX-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
5833; SKX-NEXT:    vpackssdw 32(%rdi), %ymm0, %ymm0
5834; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5835; SKX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
5836; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
5837; SKX-NEXT:    retq
5838  %a0 = load <32 x i32>, ptr %p0
5839  %1 = icmp slt <32 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5840  %2 = select <32 x i1> %1, <32 x i32> %a0, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
5841  %3 = icmp sgt <32 x i32> %2, zeroinitializer
5842  %4 = select <32 x i1> %3, <32 x i32> %2, <32 x i32> zeroinitializer
5843  %5 = trunc <32 x i32> %4 to <32 x i8>
5844  ret <32 x i8> %5
5845}
5846