1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
9;
10; Just two 32-bit runs to make sure we do reasonable things there.
11; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE2
12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
13
; sext_16i8_to_8i16 - sign-extend elements 0-7 of a <16 x i8> to <8 x i16>.
; The shufflevector keeps only the low eight bytes, so the result is a single
; 128-bit vector. The checks below expect punpcklbw + psraw $8 in the SSE2 and
; SSSE3 runs, and a single (v)pmovsxbw in the SSE4.1 and AVX family runs.
14define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
15; SSE2-LABEL: sext_16i8_to_8i16:
16; SSE2:       # %bb.0: # %entry
17; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
18; SSE2-NEXT:    psraw $8, %xmm0
19; SSE2-NEXT:    retq
20;
21; SSSE3-LABEL: sext_16i8_to_8i16:
22; SSSE3:       # %bb.0: # %entry
23; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
24; SSSE3-NEXT:    psraw $8, %xmm0
25; SSSE3-NEXT:    retq
26;
27; SSE41-LABEL: sext_16i8_to_8i16:
28; SSE41:       # %bb.0: # %entry
29; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
30; SSE41-NEXT:    retq
31;
32; AVX-LABEL: sext_16i8_to_8i16:
33; AVX:       # %bb.0: # %entry
34; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
35; AVX-NEXT:    retq
36;
37; X86-SSE2-LABEL: sext_16i8_to_8i16:
38; X86-SSE2:       # %bb.0: # %entry
39; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
40; X86-SSE2-NEXT:    psraw $8, %xmm0
41; X86-SSE2-NEXT:    retl
42;
43; X86-SSE41-LABEL: sext_16i8_to_8i16:
44; X86-SSE41:       # %bb.0: # %entry
45; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
46; X86-SSE41-NEXT:    retl
47entry:
48  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
49  %C = sext <8 x i8> %B to <8 x i16>
50  ret <8 x i16> %C
51}
52
; sext_16i8_to_16i16 - sign-extend all sixteen bytes of a <16 x i8> to
; <16 x i16> (a 256-bit result). The checks expect the 128-bit runs to produce
; the low and high halves separately (punpcklbw/punpckhbw + psraw $8, or
; pmovsxbw plus a pshufd that brings the upper eight bytes down). The AVX1 run
; joins two 128-bit halves with vinsertf128, while the AVX2 and AVX512 runs
; expect a single vpmovsxbw from xmm to ymm.
53define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
54; SSE2-LABEL: sext_16i8_to_16i16:
55; SSE2:       # %bb.0: # %entry
56; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
57; SSE2-NEXT:    psraw $8, %xmm2
58; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
59; SSE2-NEXT:    psraw $8, %xmm1
60; SSE2-NEXT:    movdqa %xmm2, %xmm0
61; SSE2-NEXT:    retq
62;
63; SSSE3-LABEL: sext_16i8_to_16i16:
64; SSSE3:       # %bb.0: # %entry
65; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
66; SSSE3-NEXT:    psraw $8, %xmm2
67; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
68; SSSE3-NEXT:    psraw $8, %xmm1
69; SSSE3-NEXT:    movdqa %xmm2, %xmm0
70; SSSE3-NEXT:    retq
71;
72; SSE41-LABEL: sext_16i8_to_16i16:
73; SSE41:       # %bb.0: # %entry
74; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
75; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
76; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
77; SSE41-NEXT:    movdqa %xmm2, %xmm0
78; SSE41-NEXT:    retq
79;
80; AVX1-LABEL: sext_16i8_to_16i16:
81; AVX1:       # %bb.0: # %entry
82; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
83; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
84; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
85; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
86; AVX1-NEXT:    retq
87;
88; AVX2-LABEL: sext_16i8_to_16i16:
89; AVX2:       # %bb.0: # %entry
90; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
91; AVX2-NEXT:    retq
92;
93; AVX512-LABEL: sext_16i8_to_16i16:
94; AVX512:       # %bb.0: # %entry
95; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
96; AVX512-NEXT:    retq
97;
98; X86-SSE2-LABEL: sext_16i8_to_16i16:
99; X86-SSE2:       # %bb.0: # %entry
100; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
101; X86-SSE2-NEXT:    psraw $8, %xmm2
102; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
103; X86-SSE2-NEXT:    psraw $8, %xmm1
104; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
105; X86-SSE2-NEXT:    retl
106;
107; X86-SSE41-LABEL: sext_16i8_to_16i16:
108; X86-SSE41:       # %bb.0: # %entry
109; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
110; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
111; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
112; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
113; X86-SSE41-NEXT:    retl
114entry:
115  %B = sext <16 x i8> %A to <16 x i16>
116  ret <16 x i16> %B
117}
118
; sext_32i8_to_32i16 - sign-extend all thirty-two bytes of a <32 x i8> to
; <32 x i16> (a 512-bit result). Only the run with avx512bw enabled expects a
; single vpmovsxbw from ymm to zmm. The avx512f-only run extends each 128-bit
; half to a ymm and joins them with vinserti64x4. The AVX2 run expects two ymm
; results, the AVX1 run builds each ymm from two xmm halves with vinsertf128,
; and the 128-bit runs produce four xmm pieces.
119define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
120; SSE2-LABEL: sext_32i8_to_32i16:
121; SSE2:       # %bb.0: # %entry
122; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
123; SSE2-NEXT:    psraw $8, %xmm4
124; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
125; SSE2-NEXT:    psraw $8, %xmm5
126; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
127; SSE2-NEXT:    psraw $8, %xmm2
128; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
129; SSE2-NEXT:    psraw $8, %xmm3
130; SSE2-NEXT:    movdqa %xmm4, %xmm0
131; SSE2-NEXT:    movdqa %xmm5, %xmm1
132; SSE2-NEXT:    retq
133;
134; SSSE3-LABEL: sext_32i8_to_32i16:
135; SSSE3:       # %bb.0: # %entry
136; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
137; SSSE3-NEXT:    psraw $8, %xmm4
138; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
139; SSSE3-NEXT:    psraw $8, %xmm5
140; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
141; SSSE3-NEXT:    psraw $8, %xmm2
142; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
143; SSSE3-NEXT:    psraw $8, %xmm3
144; SSSE3-NEXT:    movdqa %xmm4, %xmm0
145; SSSE3-NEXT:    movdqa %xmm5, %xmm1
146; SSSE3-NEXT:    retq
147;
148; SSE41-LABEL: sext_32i8_to_32i16:
149; SSE41:       # %bb.0: # %entry
150; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
151; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
152; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
153; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
154; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
155; SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
156; SSE41-NEXT:    movdqa %xmm5, %xmm0
157; SSE41-NEXT:    movdqa %xmm4, %xmm1
158; SSE41-NEXT:    retq
159;
160; AVX1-LABEL: sext_32i8_to_32i16:
161; AVX1:       # %bb.0: # %entry
162; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
163; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
164; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
165; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
166; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
167; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
168; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
169; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
170; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
171; AVX1-NEXT:    vmovaps %ymm2, %ymm0
172; AVX1-NEXT:    retq
173;
174; AVX2-LABEL: sext_32i8_to_32i16:
175; AVX2:       # %bb.0: # %entry
176; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
177; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
178; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
179; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
180; AVX2-NEXT:    retq
181;
182; AVX512F-LABEL: sext_32i8_to_32i16:
183; AVX512F:       # %bb.0: # %entry
184; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
185; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
186; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm0
187; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
188; AVX512F-NEXT:    retq
189;
190; AVX512BW-LABEL: sext_32i8_to_32i16:
191; AVX512BW:       # %bb.0: # %entry
192; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
193; AVX512BW-NEXT:    retq
194;
195; X86-SSE2-LABEL: sext_32i8_to_32i16:
196; X86-SSE2:       # %bb.0: # %entry
197; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
198; X86-SSE2-NEXT:    psraw $8, %xmm4
199; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
200; X86-SSE2-NEXT:    psraw $8, %xmm5
201; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
202; X86-SSE2-NEXT:    psraw $8, %xmm2
203; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
204; X86-SSE2-NEXT:    psraw $8, %xmm3
205; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
206; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
207; X86-SSE2-NEXT:    retl
208;
209; X86-SSE41-LABEL: sext_32i8_to_32i16:
210; X86-SSE41:       # %bb.0: # %entry
211; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
212; X86-SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
213; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
214; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
215; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
216; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
217; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
218; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
219; X86-SSE41-NEXT:    retl
220entry:
221  %B = sext <32 x i8> %A to <32 x i16>
222  ret <32 x i16> %B
223}
224
; sext_16i8_to_4i32 - sign-extend elements 0-3 of a <16 x i8> to <4 x i32>.
; The shufflevector keeps only the low four bytes. The checks expect the SSE2
; and SSSE3 runs to widen by duplicating each byte (punpcklbw, punpcklwd) and
; then shifting right arithmetically by 24, and the SSE4.1 and AVX family runs
; to use a single (v)pmovsxbd.
225define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
226; SSE2-LABEL: sext_16i8_to_4i32:
227; SSE2:       # %bb.0: # %entry
228; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
229; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
230; SSE2-NEXT:    psrad $24, %xmm0
231; SSE2-NEXT:    retq
232;
233; SSSE3-LABEL: sext_16i8_to_4i32:
234; SSSE3:       # %bb.0: # %entry
235; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
236; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
237; SSSE3-NEXT:    psrad $24, %xmm0
238; SSSE3-NEXT:    retq
239;
240; SSE41-LABEL: sext_16i8_to_4i32:
241; SSE41:       # %bb.0: # %entry
242; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
243; SSE41-NEXT:    retq
244;
245; AVX-LABEL: sext_16i8_to_4i32:
246; AVX:       # %bb.0: # %entry
247; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
248; AVX-NEXT:    retq
249;
250; X86-SSE2-LABEL: sext_16i8_to_4i32:
251; X86-SSE2:       # %bb.0: # %entry
252; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
253; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
254; X86-SSE2-NEXT:    psrad $24, %xmm0
255; X86-SSE2-NEXT:    retl
256;
257; X86-SSE41-LABEL: sext_16i8_to_4i32:
258; X86-SSE41:       # %bb.0: # %entry
259; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
260; X86-SSE41-NEXT:    retl
261entry:
262  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
263  %C = sext <4 x i8> %B to <4 x i32>
264  ret <4 x i32> %C
265}
266
; sext_16i8_to_8i32 - sign-extend elements 0-7 of a <16 x i8> to <8 x i32>
; (a 256-bit result). The checks expect the SSE2 and SSSE3 runs to unpack the
; bytes into the top of each dword and shift right arithmetically by 24, once
; for each group of four elements. The SSE4.1 and AVX1 runs use two
; (v)pmovsxbd with a (v)pshufd that moves bytes 4-7 into position, and the
; AVX2 and AVX512 runs use a single vpmovsxbd from xmm to ymm.
267define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
268; SSE2-LABEL: sext_16i8_to_8i32:
269; SSE2:       # %bb.0: # %entry
270; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
271; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
272; SSE2-NEXT:    psrad $24, %xmm0
273; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
274; SSE2-NEXT:    psrad $24, %xmm1
275; SSE2-NEXT:    retq
276;
277; SSSE3-LABEL: sext_16i8_to_8i32:
278; SSSE3:       # %bb.0: # %entry
279; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
280; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
281; SSSE3-NEXT:    psrad $24, %xmm0
282; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
283; SSSE3-NEXT:    psrad $24, %xmm1
284; SSSE3-NEXT:    retq
285;
286; SSE41-LABEL: sext_16i8_to_8i32:
287; SSE41:       # %bb.0: # %entry
288; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
289; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
290; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
291; SSE41-NEXT:    movdqa %xmm2, %xmm0
292; SSE41-NEXT:    retq
293;
294; AVX1-LABEL: sext_16i8_to_8i32:
295; AVX1:       # %bb.0: # %entry
296; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
297; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
298; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
299; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
300; AVX1-NEXT:    retq
301;
302; AVX2-LABEL: sext_16i8_to_8i32:
303; AVX2:       # %bb.0: # %entry
304; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
305; AVX2-NEXT:    retq
306;
307; AVX512-LABEL: sext_16i8_to_8i32:
308; AVX512:       # %bb.0: # %entry
309; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
310; AVX512-NEXT:    retq
311;
312; X86-SSE2-LABEL: sext_16i8_to_8i32:
313; X86-SSE2:       # %bb.0: # %entry
314; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
315; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
316; X86-SSE2-NEXT:    psrad $24, %xmm0
317; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
318; X86-SSE2-NEXT:    psrad $24, %xmm1
319; X86-SSE2-NEXT:    retl
320;
321; X86-SSE41-LABEL: sext_16i8_to_8i32:
322; X86-SSE41:       # %bb.0: # %entry
323; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
324; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
325; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
326; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
327; X86-SSE41-NEXT:    retl
328entry:
329  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
330  %C = sext <8 x i8> %B to <8 x i32>
331  ret <8 x i32> %C
332}
333
; sext_16i8_to_16i32 - sign-extend all sixteen bytes of a <16 x i8> to
; <16 x i32> (a 512-bit result). The checks expect four 128-bit pieces in the
; SSE runs (unpack + psrad $24, or pshufd + pmovsxbd per group of four bytes).
; The AVX1 run expects two ymm values assembled with vinsertf128, the AVX2 run
; two vpmovsxbd to ymm, and the AVX512 runs a single vpmovsxbd from xmm to zmm.
334define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
335; SSE2-LABEL: sext_16i8_to_16i32:
336; SSE2:       # %bb.0: # %entry
337; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
338; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
339; SSE2-NEXT:    psrad $24, %xmm4
340; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
341; SSE2-NEXT:    psrad $24, %xmm1
342; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
343; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
344; SSE2-NEXT:    psrad $24, %xmm2
345; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
346; SSE2-NEXT:    psrad $24, %xmm3
347; SSE2-NEXT:    movdqa %xmm4, %xmm0
348; SSE2-NEXT:    retq
349;
350; SSSE3-LABEL: sext_16i8_to_16i32:
351; SSSE3:       # %bb.0: # %entry
352; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
353; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
354; SSSE3-NEXT:    psrad $24, %xmm4
355; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
356; SSSE3-NEXT:    psrad $24, %xmm1
357; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
358; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
359; SSSE3-NEXT:    psrad $24, %xmm2
360; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
361; SSSE3-NEXT:    psrad $24, %xmm3
362; SSSE3-NEXT:    movdqa %xmm4, %xmm0
363; SSSE3-NEXT:    retq
364;
365; SSE41-LABEL: sext_16i8_to_16i32:
366; SSE41:       # %bb.0: # %entry
367; SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
368; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
369; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
370; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
371; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
372; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
373; SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
374; SSE41-NEXT:    movdqa %xmm4, %xmm0
375; SSE41-NEXT:    retq
376;
377; AVX1-LABEL: sext_16i8_to_16i32:
378; AVX1:       # %bb.0: # %entry
379; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
380; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
381; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
382; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
383; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
384; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
385; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
386; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
387; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
388; AVX1-NEXT:    vmovaps %ymm2, %ymm0
389; AVX1-NEXT:    retq
390;
391; AVX2-LABEL: sext_16i8_to_16i32:
392; AVX2:       # %bb.0: # %entry
393; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm2
394; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
395; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
396; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
397; AVX2-NEXT:    retq
398;
399; AVX512-LABEL: sext_16i8_to_16i32:
400; AVX512:       # %bb.0: # %entry
401; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
402; AVX512-NEXT:    retq
403;
404; X86-SSE2-LABEL: sext_16i8_to_16i32:
405; X86-SSE2:       # %bb.0: # %entry
406; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
407; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
408; X86-SSE2-NEXT:    psrad $24, %xmm4
409; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
410; X86-SSE2-NEXT:    psrad $24, %xmm1
411; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
412; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
413; X86-SSE2-NEXT:    psrad $24, %xmm2
414; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
415; X86-SSE2-NEXT:    psrad $24, %xmm3
416; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
417; X86-SSE2-NEXT:    retl
418;
419; X86-SSE41-LABEL: sext_16i8_to_16i32:
420; X86-SSE41:       # %bb.0: # %entry
421; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
422; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
423; X86-SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
424; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
425; X86-SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
426; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
427; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
428; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
429; X86-SSE41-NEXT:    retl
430entry:
431  %B = sext <16 x i8> %A to <16 x i32>
432  ret <16 x i32> %B
433}
434
; sext_16i8_to_2i64 - sign-extend elements 0-1 of a <16 x i8> to <2 x i64>.
; The shufflevector keeps only the low two bytes. The checks expect the SSE2
; and SSSE3 runs to build each 64-bit lane from two dwords. The low dword comes
; from unpack + psrad $24, and the high dword is a sign mask produced by
; comparing zero against the unpacked value (pxor + pcmpgtd). The two are then
; interleaved with punpckldq. The SSE4.1 and AVX family runs expect a single
; (v)pmovsxbq.
435define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
436; SSE2-LABEL: sext_16i8_to_2i64:
437; SSE2:       # %bb.0: # %entry
438; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
439; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
440; SSE2-NEXT:    pxor %xmm1, %xmm1
441; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
442; SSE2-NEXT:    psrad $24, %xmm0
443; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
444; SSE2-NEXT:    retq
445;
446; SSSE3-LABEL: sext_16i8_to_2i64:
447; SSSE3:       # %bb.0: # %entry
448; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
449; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
450; SSSE3-NEXT:    pxor %xmm1, %xmm1
451; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
452; SSSE3-NEXT:    psrad $24, %xmm0
453; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
454; SSSE3-NEXT:    retq
455;
456; SSE41-LABEL: sext_16i8_to_2i64:
457; SSE41:       # %bb.0: # %entry
458; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
459; SSE41-NEXT:    retq
460;
461; AVX-LABEL: sext_16i8_to_2i64:
462; AVX:       # %bb.0: # %entry
463; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
464; AVX-NEXT:    retq
465;
466; X86-SSE2-LABEL: sext_16i8_to_2i64:
467; X86-SSE2:       # %bb.0: # %entry
468; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
469; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
470; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
471; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
472; X86-SSE2-NEXT:    psrad $24, %xmm0
473; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
474; X86-SSE2-NEXT:    retl
475;
476; X86-SSE41-LABEL: sext_16i8_to_2i64:
477; X86-SSE41:       # %bb.0: # %entry
478; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
479; X86-SSE41-NEXT:    retl
480entry:
481  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
482  %C = sext <2 x i8> %B to <2 x i64>
483  ret <2 x i64> %C
484}
485
; sext_16i8_to_4i64 - sign-extend elements 0-3 of a <16 x i8> to <4 x i64>
; (a 256-bit result). The checks expect the SSE2 and SSSE3 runs to first
; produce four sign-extended dwords (unpack + psrad $24), derive a sign mask
; with pxor + pcmpgtd, and interleave value and mask dwords with punpckldq and
; punpckhdq. The SSE4.1 and AVX1 runs expect two (v)pmovsxbq, using a 16-bit
; right shift to bring bytes 2-3 down. The AVX2 and AVX512 runs expect a single
; vpmovsxbq from xmm to ymm.
486define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
487; SSE2-LABEL: sext_16i8_to_4i64:
488; SSE2:       # %bb.0: # %entry
489; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
490; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
491; SSE2-NEXT:    psrad $24, %xmm1
492; SSE2-NEXT:    pxor %xmm2, %xmm2
493; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
494; SSE2-NEXT:    movdqa %xmm1, %xmm0
495; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
496; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
497; SSE2-NEXT:    retq
498;
499; SSSE3-LABEL: sext_16i8_to_4i64:
500; SSSE3:       # %bb.0: # %entry
501; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
502; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
503; SSSE3-NEXT:    psrad $24, %xmm1
504; SSSE3-NEXT:    pxor %xmm2, %xmm2
505; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
506; SSSE3-NEXT:    movdqa %xmm1, %xmm0
507; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
508; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
509; SSSE3-NEXT:    retq
510;
511; SSE41-LABEL: sext_16i8_to_4i64:
512; SSE41:       # %bb.0: # %entry
513; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
514; SSE41-NEXT:    psrld $16, %xmm0
515; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
516; SSE41-NEXT:    movdqa %xmm2, %xmm0
517; SSE41-NEXT:    retq
518;
519; AVX1-LABEL: sext_16i8_to_4i64:
520; AVX1:       # %bb.0: # %entry
521; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
522; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
523; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
524; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
525; AVX1-NEXT:    retq
526;
527; AVX2-LABEL: sext_16i8_to_4i64:
528; AVX2:       # %bb.0: # %entry
529; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
530; AVX2-NEXT:    retq
531;
532; AVX512-LABEL: sext_16i8_to_4i64:
533; AVX512:       # %bb.0: # %entry
534; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
535; AVX512-NEXT:    retq
536;
537; X86-SSE2-LABEL: sext_16i8_to_4i64:
538; X86-SSE2:       # %bb.0: # %entry
539; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
540; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
541; X86-SSE2-NEXT:    psrad $24, %xmm1
542; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
543; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
544; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
545; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
546; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
547; X86-SSE2-NEXT:    retl
548;
549; X86-SSE41-LABEL: sext_16i8_to_4i64:
550; X86-SSE41:       # %bb.0: # %entry
551; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
552; X86-SSE41-NEXT:    psrld $16, %xmm0
553; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
554; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
555; X86-SSE41-NEXT:    retl
556entry:
557  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
558  %C = sext <4 x i8> %B to <4 x i64>
559  ret <4 x i64> %C
560}
561
; sext_16i8_to_8i64 - sign-extend elements 0-7 of a <16 x i8> to <8 x i64>
; (a 512-bit result). The checks expect the SSE2 and SSSE3 runs to produce
; sign-extended dwords (unpack + psrad $24) for each group of four bytes and
; widen them to qwords by interleaving with a pxor + pcmpgtd sign mask. The
; SSE4.1 run expects four pmovsxbq, each fed by a shift or pshufd that moves
; the next byte pair into the low position. The AVX1 run pairs such results
; into two ymm values with vinsertf128, the AVX2 run expects two vpmovsxbq to
; ymm, and the AVX512 runs a single vpmovsxbq from xmm to zmm.
562define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
563; SSE2-LABEL: sext_16i8_to_8i64:
564; SSE2:       # %bb.0: # %entry
565; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
566; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
567; SSE2-NEXT:    psrad $24, %xmm1
568; SSE2-NEXT:    pxor %xmm4, %xmm4
569; SSE2-NEXT:    pxor %xmm3, %xmm3
570; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
571; SSE2-NEXT:    movdqa %xmm1, %xmm0
572; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
573; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
574; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
575; SSE2-NEXT:    psrad $24, %xmm3
576; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
577; SSE2-NEXT:    movdqa %xmm3, %xmm2
578; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
579; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
580; SSE2-NEXT:    retq
581;
582; SSSE3-LABEL: sext_16i8_to_8i64:
583; SSSE3:       # %bb.0: # %entry
584; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
585; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
586; SSSE3-NEXT:    psrad $24, %xmm1
587; SSSE3-NEXT:    pxor %xmm4, %xmm4
588; SSSE3-NEXT:    pxor %xmm3, %xmm3
589; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
590; SSSE3-NEXT:    movdqa %xmm1, %xmm0
591; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
592; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
593; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
594; SSSE3-NEXT:    psrad $24, %xmm3
595; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
596; SSSE3-NEXT:    movdqa %xmm3, %xmm2
597; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
598; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
599; SSSE3-NEXT:    retq
600;
601; SSE41-LABEL: sext_16i8_to_8i64:
602; SSE41:       # %bb.0: # %entry
603; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
604; SSE41-NEXT:    movdqa %xmm0, %xmm1
605; SSE41-NEXT:    psrld $16, %xmm1
606; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
607; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
608; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
609; SSE41-NEXT:    psrlq $48, %xmm0
610; SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
611; SSE41-NEXT:    movdqa %xmm4, %xmm0
612; SSE41-NEXT:    retq
613;
614; AVX1-LABEL: sext_16i8_to_8i64:
615; AVX1:       # %bb.0: # %entry
616; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
617; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
618; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
619; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
620; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
621; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
622; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
623; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
624; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
625; AVX1-NEXT:    vmovaps %ymm2, %ymm0
626; AVX1-NEXT:    retq
627;
628; AVX2-LABEL: sext_16i8_to_8i64:
629; AVX2:       # %bb.0: # %entry
630; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm2
631; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
632; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm1
633; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
634; AVX2-NEXT:    retq
635;
636; AVX512-LABEL: sext_16i8_to_8i64:
637; AVX512:       # %bb.0: # %entry
638; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
639; AVX512-NEXT:    retq
640;
641; X86-SSE2-LABEL: sext_16i8_to_8i64:
642; X86-SSE2:       # %bb.0: # %entry
643; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
644; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
645; X86-SSE2-NEXT:    psrad $24, %xmm1
646; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
647; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
648; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
649; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
650; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
651; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
652; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
653; X86-SSE2-NEXT:    psrad $24, %xmm3
654; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
655; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
656; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
657; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
658; X86-SSE2-NEXT:    retl
659;
660; X86-SSE41-LABEL: sext_16i8_to_8i64:
661; X86-SSE41:       # %bb.0: # %entry
662; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
663; X86-SSE41-NEXT:    movdqa %xmm0, %xmm1
664; X86-SSE41-NEXT:    psrld $16, %xmm1
665; X86-SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
666; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
667; X86-SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
668; X86-SSE41-NEXT:    psrlq $48, %xmm0
669; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
670; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
671; X86-SSE41-NEXT:    retl
672entry:
673  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
674  %C = sext <8 x i8> %B to <8 x i64>
675  ret <8 x i64> %C
676}
677
; sext_8i16_to_4i32 - sign-extend elements 0-3 of an <8 x i16> to <4 x i32>.
; The shufflevector keeps only the low four words. The checks expect
; punpcklwd + psrad $16 in the SSE2 and SSSE3 runs, and a single (v)pmovsxwd
; in the SSE4.1 and AVX family runs.
678define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
679; SSE2-LABEL: sext_8i16_to_4i32:
680; SSE2:       # %bb.0: # %entry
681; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
682; SSE2-NEXT:    psrad $16, %xmm0
683; SSE2-NEXT:    retq
684;
685; SSSE3-LABEL: sext_8i16_to_4i32:
686; SSSE3:       # %bb.0: # %entry
687; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
688; SSSE3-NEXT:    psrad $16, %xmm0
689; SSSE3-NEXT:    retq
690;
691; SSE41-LABEL: sext_8i16_to_4i32:
692; SSE41:       # %bb.0: # %entry
693; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
694; SSE41-NEXT:    retq
695;
696; AVX-LABEL: sext_8i16_to_4i32:
697; AVX:       # %bb.0: # %entry
698; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
699; AVX-NEXT:    retq
700;
701; X86-SSE2-LABEL: sext_8i16_to_4i32:
702; X86-SSE2:       # %bb.0: # %entry
703; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
704; X86-SSE2-NEXT:    psrad $16, %xmm0
705; X86-SSE2-NEXT:    retl
706;
707; X86-SSE41-LABEL: sext_8i16_to_4i32:
708; X86-SSE41:       # %bb.0: # %entry
709; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
710; X86-SSE41-NEXT:    retl
711entry:
712  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
713  %C = sext <4 x i16> %B to <4 x i32>
714  ret <4 x i32> %C
715}
716
; Sign-extends all eight i16 elements of %A to <8 x i32>.
; The assertions below expect the 128-bit runs to produce the result as two
; halves in xmm0 and xmm1, the AVX1 run to extend each half and join them with
; vinsertf128, and the AVX2 and AVX512 runs to use one vpmovsxwd into ymm0.
717define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
718; SSE2-LABEL: sext_8i16_to_8i32:
719; SSE2:       # %bb.0: # %entry
720; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
721; SSE2-NEXT:    psrad $16, %xmm2
722; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
723; SSE2-NEXT:    psrad $16, %xmm1
724; SSE2-NEXT:    movdqa %xmm2, %xmm0
725; SSE2-NEXT:    retq
726;
727; SSSE3-LABEL: sext_8i16_to_8i32:
728; SSSE3:       # %bb.0: # %entry
729; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
730; SSSE3-NEXT:    psrad $16, %xmm2
731; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
732; SSSE3-NEXT:    psrad $16, %xmm1
733; SSSE3-NEXT:    movdqa %xmm2, %xmm0
734; SSSE3-NEXT:    retq
735;
736; SSE41-LABEL: sext_8i16_to_8i32:
737; SSE41:       # %bb.0: # %entry
738; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
739; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
740; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
741; SSE41-NEXT:    movdqa %xmm2, %xmm0
742; SSE41-NEXT:    retq
743;
744; AVX1-LABEL: sext_8i16_to_8i32:
745; AVX1:       # %bb.0: # %entry
746; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
747; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
748; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
749; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
750; AVX1-NEXT:    retq
751;
752; AVX2-LABEL: sext_8i16_to_8i32:
753; AVX2:       # %bb.0: # %entry
754; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
755; AVX2-NEXT:    retq
756;
757; AVX512-LABEL: sext_8i16_to_8i32:
758; AVX512:       # %bb.0: # %entry
759; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
760; AVX512-NEXT:    retq
761;
762; X86-SSE2-LABEL: sext_8i16_to_8i32:
763; X86-SSE2:       # %bb.0: # %entry
764; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
765; X86-SSE2-NEXT:    psrad $16, %xmm2
766; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
767; X86-SSE2-NEXT:    psrad $16, %xmm1
768; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
769; X86-SSE2-NEXT:    retl
770;
771; X86-SSE41-LABEL: sext_8i16_to_8i32:
772; X86-SSE41:       # %bb.0: # %entry
773; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
774; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
775; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
776; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
777; X86-SSE41-NEXT:    retl
778entry:
779  %B = sext <8 x i16> %A to <8 x i32>
780  ret <8 x i32> %B
781}
782
; Sign-extends all sixteen i16 elements of %A to <16 x i32>.
; The assertions below expect the 128-bit runs to fill xmm0 through xmm3, the
; AVX1 and AVX2 runs to fill ymm0 and ymm1, and the AVX512 runs to use a single
; vpmovsxwd from ymm0 into zmm0.
783define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
784; SSE2-LABEL: sext_16i16_to_16i32:
785; SSE2:       # %bb.0: # %entry
786; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
787; SSE2-NEXT:    psrad $16, %xmm4
788; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
789; SSE2-NEXT:    psrad $16, %xmm5
790; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
791; SSE2-NEXT:    psrad $16, %xmm2
792; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
793; SSE2-NEXT:    psrad $16, %xmm3
794; SSE2-NEXT:    movdqa %xmm4, %xmm0
795; SSE2-NEXT:    movdqa %xmm5, %xmm1
796; SSE2-NEXT:    retq
797;
798; SSSE3-LABEL: sext_16i16_to_16i32:
799; SSSE3:       # %bb.0: # %entry
800; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
801; SSSE3-NEXT:    psrad $16, %xmm4
802; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
803; SSSE3-NEXT:    psrad $16, %xmm5
804; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
805; SSSE3-NEXT:    psrad $16, %xmm2
806; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
807; SSSE3-NEXT:    psrad $16, %xmm3
808; SSSE3-NEXT:    movdqa %xmm4, %xmm0
809; SSSE3-NEXT:    movdqa %xmm5, %xmm1
810; SSSE3-NEXT:    retq
811;
812; SSE41-LABEL: sext_16i16_to_16i32:
813; SSE41:       # %bb.0: # %entry
814; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
815; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
816; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
817; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
818; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
819; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
820; SSE41-NEXT:    movdqa %xmm5, %xmm0
821; SSE41-NEXT:    movdqa %xmm4, %xmm1
822; SSE41-NEXT:    retq
823;
824; AVX1-LABEL: sext_16i16_to_16i32:
825; AVX1:       # %bb.0: # %entry
826; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
827; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
828; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
829; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
830; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
831; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
832; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
833; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
834; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
835; AVX1-NEXT:    vmovaps %ymm2, %ymm0
836; AVX1-NEXT:    retq
837;
838; AVX2-LABEL: sext_16i16_to_16i32:
839; AVX2:       # %bb.0: # %entry
840; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
841; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
842; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm1
843; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
844; AVX2-NEXT:    retq
845;
846; AVX512-LABEL: sext_16i16_to_16i32:
847; AVX512:       # %bb.0: # %entry
848; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
849; AVX512-NEXT:    retq
850;
851; X86-SSE2-LABEL: sext_16i16_to_16i32:
852; X86-SSE2:       # %bb.0: # %entry
853; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
854; X86-SSE2-NEXT:    psrad $16, %xmm4
855; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
856; X86-SSE2-NEXT:    psrad $16, %xmm5
857; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
858; X86-SSE2-NEXT:    psrad $16, %xmm2
859; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
860; X86-SSE2-NEXT:    psrad $16, %xmm3
861; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
862; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
863; X86-SSE2-NEXT:    retl
864;
865; X86-SSE41-LABEL: sext_16i16_to_16i32:
866; X86-SSE41:       # %bb.0: # %entry
867; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
868; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
869; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
870; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
871; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
872; X86-SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
873; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
874; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
875; X86-SSE41-NEXT:    retl
876entry:
877  %B = sext <16 x i16> %A to <16 x i32>
878  ret <16 x i32> %B
879}
880
; Sign-extends the low two i16 elements of %A (shuffle mask 0-1) to <2 x i64>.
; The assertions below expect the SSE2 and SSSE3 runs to build the result from
; punpcklwd, a pxor/pcmpgtd sign mask, psrad $16 and punpckldq, and the SSE4.1
; and AVX runs to use a single (v)pmovsxwq.
881define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
882; SSE2-LABEL: sext_8i16_to_2i64:
883; SSE2:       # %bb.0: # %entry
884; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
885; SSE2-NEXT:    pxor %xmm1, %xmm1
886; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
887; SSE2-NEXT:    psrad $16, %xmm0
888; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
889; SSE2-NEXT:    retq
890;
891; SSSE3-LABEL: sext_8i16_to_2i64:
892; SSSE3:       # %bb.0: # %entry
893; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
894; SSSE3-NEXT:    pxor %xmm1, %xmm1
895; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
896; SSSE3-NEXT:    psrad $16, %xmm0
897; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
898; SSSE3-NEXT:    retq
899;
900; SSE41-LABEL: sext_8i16_to_2i64:
901; SSE41:       # %bb.0: # %entry
902; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
903; SSE41-NEXT:    retq
904;
905; AVX-LABEL: sext_8i16_to_2i64:
906; AVX:       # %bb.0: # %entry
907; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
908; AVX-NEXT:    retq
909;
910; X86-SSE2-LABEL: sext_8i16_to_2i64:
911; X86-SSE2:       # %bb.0: # %entry
912; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
913; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
914; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
915; X86-SSE2-NEXT:    psrad $16, %xmm0
916; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
917; X86-SSE2-NEXT:    retl
918;
919; X86-SSE41-LABEL: sext_8i16_to_2i64:
920; X86-SSE41:       # %bb.0: # %entry
921; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
922; X86-SSE41-NEXT:    retl
923entry:
924  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
925  %C = sext <2 x i16> %B to <2 x i64>
926  ret <2 x i64> %C
927}
928
; Sign-extends the low four i16 elements of %A (shuffle mask 0-3) to <4 x i64>.
; The assertions below expect the 128-bit runs to return the result in xmm0
; and xmm1, the AVX1 run to join two vpmovsxwq halves with vinsertf128, and the
; AVX2 and AVX512 runs to use one vpmovsxwq into ymm0.
929define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
930; SSE2-LABEL: sext_8i16_to_4i64:
931; SSE2:       # %bb.0: # %entry
932; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
933; SSE2-NEXT:    psrad $16, %xmm1
934; SSE2-NEXT:    pxor %xmm2, %xmm2
935; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
936; SSE2-NEXT:    movdqa %xmm1, %xmm0
937; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
938; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
939; SSE2-NEXT:    retq
940;
941; SSSE3-LABEL: sext_8i16_to_4i64:
942; SSSE3:       # %bb.0: # %entry
943; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
944; SSSE3-NEXT:    psrad $16, %xmm1
945; SSSE3-NEXT:    pxor %xmm2, %xmm2
946; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
947; SSSE3-NEXT:    movdqa %xmm1, %xmm0
948; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
949; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
950; SSSE3-NEXT:    retq
951;
952; SSE41-LABEL: sext_8i16_to_4i64:
953; SSE41:       # %bb.0: # %entry
954; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
955; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
956; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
957; SSE41-NEXT:    movdqa %xmm2, %xmm0
958; SSE41-NEXT:    retq
959;
960; AVX1-LABEL: sext_8i16_to_4i64:
961; AVX1:       # %bb.0: # %entry
962; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
963; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
964; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
965; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
966; AVX1-NEXT:    retq
967;
968; AVX2-LABEL: sext_8i16_to_4i64:
969; AVX2:       # %bb.0: # %entry
970; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
971; AVX2-NEXT:    retq
972;
973; AVX512-LABEL: sext_8i16_to_4i64:
974; AVX512:       # %bb.0: # %entry
975; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
976; AVX512-NEXT:    retq
977;
978; X86-SSE2-LABEL: sext_8i16_to_4i64:
979; X86-SSE2:       # %bb.0: # %entry
980; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
981; X86-SSE2-NEXT:    psrad $16, %xmm1
982; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
983; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
984; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
985; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
986; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
987; X86-SSE2-NEXT:    retl
988;
989; X86-SSE41-LABEL: sext_8i16_to_4i64:
990; X86-SSE41:       # %bb.0: # %entry
991; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
992; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
993; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
994; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
995; X86-SSE41-NEXT:    retl
996entry:
997  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
998  %C = sext <4 x i16> %B to <4 x i64>
999  ret <4 x i64> %C
1000}
1001
; Sign-extends all eight i16 elements of %A to <8 x i64>.
; The assertions below expect the 128-bit runs to fill xmm0 through xmm3, the
; AVX1 and AVX2 runs to fill ymm0 and ymm1, and the AVX512 runs to use a single
; vpmovsxwq from xmm0 into zmm0.
1002define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
1003; SSE2-LABEL: sext_8i16_to_8i64:
1004; SSE2:       # %bb.0: # %entry
1005; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1006; SSE2-NEXT:    psrad $16, %xmm1
1007; SSE2-NEXT:    pxor %xmm5, %xmm5
1008; SSE2-NEXT:    pxor %xmm2, %xmm2
1009; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1010; SSE2-NEXT:    movdqa %xmm1, %xmm4
1011; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1012; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1013; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1014; SSE2-NEXT:    psrad $16, %xmm3
1015; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
1016; SSE2-NEXT:    movdqa %xmm3, %xmm2
1017; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1018; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1019; SSE2-NEXT:    movdqa %xmm4, %xmm0
1020; SSE2-NEXT:    retq
1021;
1022; SSSE3-LABEL: sext_8i16_to_8i64:
1023; SSSE3:       # %bb.0: # %entry
1024; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1025; SSSE3-NEXT:    psrad $16, %xmm1
1026; SSSE3-NEXT:    pxor %xmm5, %xmm5
1027; SSSE3-NEXT:    pxor %xmm2, %xmm2
1028; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1029; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1030; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1031; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1032; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1033; SSSE3-NEXT:    psrad $16, %xmm3
1034; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
1035; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1036; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1037; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1038; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1039; SSSE3-NEXT:    retq
1040;
1041; SSE41-LABEL: sext_8i16_to_8i64:
1042; SSE41:       # %bb.0: # %entry
1043; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1044; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1045; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1046; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1047; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1048; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1049; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1050; SSE41-NEXT:    movdqa %xmm4, %xmm0
1051; SSE41-NEXT:    retq
1052;
1053; AVX1-LABEL: sext_8i16_to_8i64:
1054; AVX1:       # %bb.0: # %entry
1055; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
1056; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
1057; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
1058; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1059; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1060; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
1061; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1062; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
1063; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1064; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1065; AVX1-NEXT:    retq
1066;
1067; AVX2-LABEL: sext_8i16_to_8i64:
1068; AVX2:       # %bb.0: # %entry
1069; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm2
1070; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1071; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm1
1072; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1073; AVX2-NEXT:    retq
1074;
1075; AVX512-LABEL: sext_8i16_to_8i64:
1076; AVX512:       # %bb.0: # %entry
1077; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
1078; AVX512-NEXT:    retq
1079;
1080; X86-SSE2-LABEL: sext_8i16_to_8i64:
1081; X86-SSE2:       # %bb.0: # %entry
1082; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1083; X86-SSE2-NEXT:    psrad $16, %xmm1
1084; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
1085; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
1086; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1087; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
1088; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
1089; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1090; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1091; X86-SSE2-NEXT:    psrad $16, %xmm3
1092; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
1093; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
1094; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1095; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1096; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
1097; X86-SSE2-NEXT:    retl
1098;
1099; X86-SSE41-LABEL: sext_8i16_to_8i64:
1100; X86-SSE41:       # %bb.0: # %entry
1101; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1102; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1103; X86-SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1104; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1105; X86-SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1106; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1107; X86-SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1108; X86-SSE41-NEXT:    movdqa %xmm4, %xmm0
1109; X86-SSE41-NEXT:    retl
1110entry:
1111  %B = sext <8 x i16> %A to <8 x i64>
1112  ret <8 x i64> %B
1113}
1114
; Sign-extends the low two i32 elements of %A (shuffle mask 0-1) to <2 x i64>.
; The assertions below expect the SSE2 and SSSE3 runs to interleave the input
; with a pxor/pcmpgtd sign mask via punpckldq, and the SSE4.1 and AVX runs to
; use a single (v)pmovsxdq.
1115define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1116; SSE2-LABEL: sext_4i32_to_2i64:
1117; SSE2:       # %bb.0: # %entry
1118; SSE2-NEXT:    pxor %xmm1, %xmm1
1119; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1120; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1121; SSE2-NEXT:    retq
1122;
1123; SSSE3-LABEL: sext_4i32_to_2i64:
1124; SSSE3:       # %bb.0: # %entry
1125; SSSE3-NEXT:    pxor %xmm1, %xmm1
1126; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
1127; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1128; SSSE3-NEXT:    retq
1129;
1130; SSE41-LABEL: sext_4i32_to_2i64:
1131; SSE41:       # %bb.0: # %entry
1132; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1133; SSE41-NEXT:    retq
1134;
1135; AVX-LABEL: sext_4i32_to_2i64:
1136; AVX:       # %bb.0: # %entry
1137; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
1138; AVX-NEXT:    retq
1139;
1140; X86-SSE2-LABEL: sext_4i32_to_2i64:
1141; X86-SSE2:       # %bb.0: # %entry
1142; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
1143; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1144; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1145; X86-SSE2-NEXT:    retl
1146;
1147; X86-SSE41-LABEL: sext_4i32_to_2i64:
1148; X86-SSE41:       # %bb.0: # %entry
1149; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1150; X86-SSE41-NEXT:    retl
1151entry:
1152  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1153  %C = sext <2 x i32> %B to <2 x i64>
1154  ret <2 x i64> %C
1155}
1156
; Sign-extends all four i32 elements of %A to <4 x i64>.
; The assertions below expect the 128-bit runs to return the result in xmm0
; and xmm1, the AVX1 run to join two vpmovsxdq halves with vinsertf128, and the
; AVX2 and AVX512 runs to use one vpmovsxdq into ymm0.
1157define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1158; SSE2-LABEL: sext_4i32_to_4i64:
1159; SSE2:       # %bb.0: # %entry
1160; SSE2-NEXT:    pxor %xmm2, %xmm2
1161; SSE2-NEXT:    pxor %xmm3, %xmm3
1162; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1163; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1164; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1165; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1166; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1167; SSE2-NEXT:    retq
1168;
1169; SSSE3-LABEL: sext_4i32_to_4i64:
1170; SSSE3:       # %bb.0: # %entry
1171; SSSE3-NEXT:    pxor %xmm2, %xmm2
1172; SSSE3-NEXT:    pxor %xmm3, %xmm3
1173; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
1174; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1175; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1176; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
1177; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1178; SSSE3-NEXT:    retq
1179;
1180; SSE41-LABEL: sext_4i32_to_4i64:
1181; SSE41:       # %bb.0: # %entry
1182; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1183; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1184; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1185; SSE41-NEXT:    movdqa %xmm2, %xmm0
1186; SSE41-NEXT:    retq
1187;
1188; AVX1-LABEL: sext_4i32_to_4i64:
1189; AVX1:       # %bb.0: # %entry
1190; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1191; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1192; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1193; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1194; AVX1-NEXT:    retq
1195;
1196; AVX2-LABEL: sext_4i32_to_4i64:
1197; AVX2:       # %bb.0: # %entry
1198; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
1199; AVX2-NEXT:    retq
1200;
1201; AVX512-LABEL: sext_4i32_to_4i64:
1202; AVX512:       # %bb.0: # %entry
1203; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
1204; AVX512-NEXT:    retq
1205;
1206; X86-SSE2-LABEL: sext_4i32_to_4i64:
1207; X86-SSE2:       # %bb.0: # %entry
1208; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
1209; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1210; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1211; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1212; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1213; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1214; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1215; X86-SSE2-NEXT:    retl
1216;
1217; X86-SSE41-LABEL: sext_4i32_to_4i64:
1218; X86-SSE41:       # %bb.0: # %entry
1219; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1220; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1221; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1222; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
1223; X86-SSE41-NEXT:    retl
1224entry:
1225  %B = sext <4 x i32> %A to <4 x i64>
1226  ret <4 x i64> %B
1227}
1228
; Sign-extends all eight i32 elements of %A to <8 x i64>.
; The assertions below expect the 128-bit runs to fill xmm0 through xmm3, the
; AVX1 and AVX2 runs to fill ymm0 and ymm1, and the AVX512 runs to use a single
; vpmovsxdq from ymm0 into zmm0.
1229define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
1230; SSE2-LABEL: sext_8i32_to_8i64:
1231; SSE2:       # %bb.0: # %entry
1232; SSE2-NEXT:    movdqa %xmm1, %xmm2
1233; SSE2-NEXT:    pxor %xmm4, %xmm4
1234; SSE2-NEXT:    pxor %xmm3, %xmm3
1235; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1236; SSE2-NEXT:    pxor %xmm5, %xmm5
1237; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
1238; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1239; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1240; SSE2-NEXT:    pxor %xmm3, %xmm3
1241; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1242; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1243; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1244; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1245; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
1246; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1247; SSE2-NEXT:    retq
1248;
1249; SSSE3-LABEL: sext_8i32_to_8i64:
1250; SSSE3:       # %bb.0: # %entry
1251; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1252; SSSE3-NEXT:    pxor %xmm4, %xmm4
1253; SSSE3-NEXT:    pxor %xmm3, %xmm3
1254; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
1255; SSSE3-NEXT:    pxor %xmm5, %xmm5
1256; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
1257; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1258; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1259; SSSE3-NEXT:    pxor %xmm3, %xmm3
1260; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
1261; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1262; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1263; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1264; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
1265; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1266; SSSE3-NEXT:    retq
1267;
1268; SSE41-LABEL: sext_8i32_to_8i64:
1269; SSE41:       # %bb.0: # %entry
1270; SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1271; SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1272; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1273; SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1274; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1275; SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1276; SSE41-NEXT:    movdqa %xmm5, %xmm0
1277; SSE41-NEXT:    movdqa %xmm4, %xmm1
1278; SSE41-NEXT:    retq
1279;
1280; AVX1-LABEL: sext_8i32_to_8i64:
1281; AVX1:       # %bb.0: # %entry
1282; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1283; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1284; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
1285; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1286; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1287; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1288; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1289; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1290; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1291; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1292; AVX1-NEXT:    retq
1293;
1294; AVX2-LABEL: sext_8i32_to_8i64:
1295; AVX2:       # %bb.0: # %entry
1296; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
1297; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1298; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
1299; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1300; AVX2-NEXT:    retq
1301;
1302; AVX512-LABEL: sext_8i32_to_8i64:
1303; AVX512:       # %bb.0: # %entry
1304; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
1305; AVX512-NEXT:    retq
1306;
1307; X86-SSE2-LABEL: sext_8i32_to_8i64:
1308; X86-SSE2:       # %bb.0: # %entry
1309; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1310; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
1311; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1312; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
1313; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
1314; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
1315; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1316; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1317; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
1318; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1319; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1320; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
1321; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1322; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
1323; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1324; X86-SSE2-NEXT:    retl
1325;
1326; X86-SSE41-LABEL: sext_8i32_to_8i64:
1327; X86-SSE41:       # %bb.0: # %entry
1328; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1329; X86-SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1330; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1331; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1332; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1333; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1334; X86-SSE41-NEXT:    movdqa %xmm5, %xmm0
1335; X86-SSE41-NEXT:    movdqa %xmm4, %xmm1
1336; X86-SSE41-NEXT:    retl
1337entry:
1338  %B = sext <8 x i32> %A to <8 x i64>
1339  ret <8 x i64> %B
1340}
1341
; Loads a <2 x i1> vector from %ptr and sign-extends it to <2 x i64>.
; The assertions below expect the SSE, AVX1 and AVX2 runs to load one byte and
; turn each of its two low bits into a lane with scalar shrb / andl $1 and neg,
; and the AVX512F and AVX512BW runs to move the byte into mask register k1 and
; use a zero-masked vpternlogq $255.
1342define <2 x i64> @load_sext_2i1_to_2i64(ptr%ptr) {
1343; SSE-LABEL: load_sext_2i1_to_2i64:
1344; SSE:       # %bb.0: # %entry
1345; SSE-NEXT:    movzbl (%rdi), %eax
1346; SSE-NEXT:    movzbl %al, %ecx
1347; SSE-NEXT:    shrb %al
1348; SSE-NEXT:    movzbl %al, %eax
1349; SSE-NEXT:    negq %rax
1350; SSE-NEXT:    movq %rax, %xmm1
1351; SSE-NEXT:    andl $1, %ecx
1352; SSE-NEXT:    negq %rcx
1353; SSE-NEXT:    movq %rcx, %xmm0
1354; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1355; SSE-NEXT:    retq
1356;
1357; AVX1-LABEL: load_sext_2i1_to_2i64:
1358; AVX1:       # %bb.0: # %entry
1359; AVX1-NEXT:    movzbl (%rdi), %eax
1360; AVX1-NEXT:    movzbl %al, %ecx
1361; AVX1-NEXT:    shrb %al
1362; AVX1-NEXT:    movzbl %al, %eax
1363; AVX1-NEXT:    negq %rax
1364; AVX1-NEXT:    vmovq %rax, %xmm0
1365; AVX1-NEXT:    andl $1, %ecx
1366; AVX1-NEXT:    negq %rcx
1367; AVX1-NEXT:    vmovq %rcx, %xmm1
1368; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1369; AVX1-NEXT:    retq
1370;
1371; AVX2-LABEL: load_sext_2i1_to_2i64:
1372; AVX2:       # %bb.0: # %entry
1373; AVX2-NEXT:    movzbl (%rdi), %eax
1374; AVX2-NEXT:    movzbl %al, %ecx
1375; AVX2-NEXT:    shrb %al
1376; AVX2-NEXT:    movzbl %al, %eax
1377; AVX2-NEXT:    negq %rax
1378; AVX2-NEXT:    vmovq %rax, %xmm0
1379; AVX2-NEXT:    andl $1, %ecx
1380; AVX2-NEXT:    negq %rcx
1381; AVX2-NEXT:    vmovq %rcx, %xmm1
1382; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1383; AVX2-NEXT:    retq
1384;
1385; AVX512F-LABEL: load_sext_2i1_to_2i64:
1386; AVX512F:       # %bb.0: # %entry
1387; AVX512F-NEXT:    movzbl (%rdi), %eax
1388; AVX512F-NEXT:    kmovw %eax, %k1
1389; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1390; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1391; AVX512F-NEXT:    vzeroupper
1392; AVX512F-NEXT:    retq
1393;
1394; AVX512BW-LABEL: load_sext_2i1_to_2i64:
1395; AVX512BW:       # %bb.0: # %entry
1396; AVX512BW-NEXT:    movzbl (%rdi), %eax
1397; AVX512BW-NEXT:    kmovd %eax, %k1
1398; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1399; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1400; AVX512BW-NEXT:    vzeroupper
1401; AVX512BW-NEXT:    retq
1402;
1403; X86-SSE2-LABEL: load_sext_2i1_to_2i64:
1404; X86-SSE2:       # %bb.0: # %entry
1405; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1406; X86-SSE2-NEXT:    movzbl (%eax), %eax
1407; X86-SSE2-NEXT:    movzbl %al, %ecx
1408; X86-SSE2-NEXT:    shrb %al
1409; X86-SSE2-NEXT:    movzbl %al, %eax
1410; X86-SSE2-NEXT:    negl %eax
1411; X86-SSE2-NEXT:    movd %eax, %xmm0
1412; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
1413; X86-SSE2-NEXT:    andl $1, %ecx
1414; X86-SSE2-NEXT:    negl %ecx
1415; X86-SSE2-NEXT:    movd %ecx, %xmm0
1416; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1417; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1418; X86-SSE2-NEXT:    retl
1419;
1420; X86-SSE41-LABEL: load_sext_2i1_to_2i64:
1421; X86-SSE41:       # %bb.0: # %entry
1422; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1423; X86-SSE41-NEXT:    movzbl (%eax), %eax
1424; X86-SSE41-NEXT:    movzbl %al, %ecx
1425; X86-SSE41-NEXT:    andl $1, %ecx
1426; X86-SSE41-NEXT:    negl %ecx
1427; X86-SSE41-NEXT:    movd %ecx, %xmm0
1428; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1429; X86-SSE41-NEXT:    shrb %al
1430; X86-SSE41-NEXT:    movzbl %al, %eax
1431; X86-SSE41-NEXT:    negl %eax
1432; X86-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1433; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1434; X86-SSE41-NEXT:    retl
1435entry:
1436 %X = load <2 x i1>, ptr %ptr
1437 %Y = sext <2 x i1> %X to <2 x i64>
1438 ret <2 x i64> %Y
1439}
1440
; Loads a <2 x i8> vector from %ptr and sign-extends it to <2 x i64>.
; The assertions below expect the SSE2 and SSSE3 runs to do a 16-bit movzwl
; load followed by unpacks, a pxor/pcmpgtd sign mask and psrad $24, and the
; SSE4.1 and AVX runs to use (v)pmovsxbq with the load folded into its memory
; operand.
1441define <2 x i64> @load_sext_2i8_to_2i64(ptr%ptr) {
1442; SSE2-LABEL: load_sext_2i8_to_2i64:
1443; SSE2:       # %bb.0: # %entry
1444; SSE2-NEXT:    movzwl (%rdi), %eax
1445; SSE2-NEXT:    movd %eax, %xmm0
1446; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1447; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1448; SSE2-NEXT:    pxor %xmm1, %xmm1
1449; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1450; SSE2-NEXT:    psrad $24, %xmm0
1451; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1452; SSE2-NEXT:    retq
1453;
1454; SSSE3-LABEL: load_sext_2i8_to_2i64:
1455; SSSE3:       # %bb.0: # %entry
1456; SSSE3-NEXT:    movzwl (%rdi), %eax
1457; SSSE3-NEXT:    movd %eax, %xmm0
1458; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1459; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1460; SSSE3-NEXT:    pxor %xmm1, %xmm1
1461; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
1462; SSSE3-NEXT:    psrad $24, %xmm0
1463; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1464; SSSE3-NEXT:    retq
1465;
1466; SSE41-LABEL: load_sext_2i8_to_2i64:
1467; SSE41:       # %bb.0: # %entry
1468; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1469; SSE41-NEXT:    retq
1470;
1471; AVX-LABEL: load_sext_2i8_to_2i64:
1472; AVX:       # %bb.0: # %entry
1473; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
1474; AVX-NEXT:    retq
1475;
1476; X86-SSE2-LABEL: load_sext_2i8_to_2i64:
1477; X86-SSE2:       # %bb.0: # %entry
1478; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1479; X86-SSE2-NEXT:    movzwl (%eax), %eax
1480; X86-SSE2-NEXT:    movd %eax, %xmm0
1481; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1482; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1483; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
1484; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
1485; X86-SSE2-NEXT:    psrad $24, %xmm0
1486; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1487; X86-SSE2-NEXT:    retl
1488;
1489; X86-SSE41-LABEL: load_sext_2i8_to_2i64:
1490; X86-SSE41:       # %bb.0: # %entry
1491; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1492; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1493; X86-SSE41-NEXT:    retl
1494entry:
1495 %X = load <2 x i8>, ptr %ptr
1496 %Y = sext <2 x i8> %X to <2 x i64>
1497 ret <2 x i64> %Y
1498}
1499
; Test: load a <4 x i1> mask from %ptr and sign-extend every lane to i32,
; so each result lane is either 0 or -1.
; Without AVX-512 the expected code reads one byte, isolates each bit in a
; scalar register (shrb / andl $1), negates it (negl) and then assembles the
; vector (movd + punpck* on SSE2/SSSE3, pinsrd on SSE4.1/AVX1/AVX2).
; With AVX-512 the loaded byte is moved into a mask register and the lanes
; are materialised by a zero-masked vpternlogd $255.
; The check lines below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the script rather than editing by hand.
1500define <4 x i32> @load_sext_4i1_to_4i32(ptr%ptr) {
1501; SSE2-LABEL: load_sext_4i1_to_4i32:
1502; SSE2:       # %bb.0: # %entry
1503; SSE2-NEXT:    movzbl (%rdi), %eax
1504; SSE2-NEXT:    movl %eax, %ecx
1505; SSE2-NEXT:    shrb $3, %cl
1506; SSE2-NEXT:    movzbl %cl, %ecx
1507; SSE2-NEXT:    negl %ecx
1508; SSE2-NEXT:    movd %ecx, %xmm0
1509; SSE2-NEXT:    movzbl %al, %ecx
1510; SSE2-NEXT:    shrb $2, %al
1511; SSE2-NEXT:    movzbl %al, %eax
1512; SSE2-NEXT:    andl $1, %eax
1513; SSE2-NEXT:    negl %eax
1514; SSE2-NEXT:    movd %eax, %xmm1
1515; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1516; SSE2-NEXT:    movl %ecx, %eax
1517; SSE2-NEXT:    andl $1, %eax
1518; SSE2-NEXT:    negl %eax
1519; SSE2-NEXT:    movd %eax, %xmm0
1520; SSE2-NEXT:    shrb %cl
1521; SSE2-NEXT:    movzbl %cl, %eax
1522; SSE2-NEXT:    andl $1, %eax
1523; SSE2-NEXT:    negl %eax
1524; SSE2-NEXT:    movd %eax, %xmm2
1525; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1526; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1527; SSE2-NEXT:    retq
1528;
1529; SSSE3-LABEL: load_sext_4i1_to_4i32:
1530; SSSE3:       # %bb.0: # %entry
1531; SSSE3-NEXT:    movzbl (%rdi), %eax
1532; SSSE3-NEXT:    movl %eax, %ecx
1533; SSSE3-NEXT:    shrb $3, %cl
1534; SSSE3-NEXT:    movzbl %cl, %ecx
1535; SSSE3-NEXT:    negl %ecx
1536; SSSE3-NEXT:    movd %ecx, %xmm0
1537; SSSE3-NEXT:    movzbl %al, %ecx
1538; SSSE3-NEXT:    shrb $2, %al
1539; SSSE3-NEXT:    movzbl %al, %eax
1540; SSSE3-NEXT:    andl $1, %eax
1541; SSSE3-NEXT:    negl %eax
1542; SSSE3-NEXT:    movd %eax, %xmm1
1543; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1544; SSSE3-NEXT:    movl %ecx, %eax
1545; SSSE3-NEXT:    andl $1, %eax
1546; SSSE3-NEXT:    negl %eax
1547; SSSE3-NEXT:    movd %eax, %xmm0
1548; SSSE3-NEXT:    shrb %cl
1549; SSSE3-NEXT:    movzbl %cl, %eax
1550; SSSE3-NEXT:    andl $1, %eax
1551; SSSE3-NEXT:    negl %eax
1552; SSSE3-NEXT:    movd %eax, %xmm2
1553; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1554; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1555; SSSE3-NEXT:    retq
1556;
1557; SSE41-LABEL: load_sext_4i1_to_4i32:
1558; SSE41:       # %bb.0: # %entry
1559; SSE41-NEXT:    movzbl (%rdi), %eax
1560; SSE41-NEXT:    movzbl %al, %ecx
1561; SSE41-NEXT:    shrb %al
1562; SSE41-NEXT:    movzbl %al, %eax
1563; SSE41-NEXT:    andl $1, %eax
1564; SSE41-NEXT:    negl %eax
1565; SSE41-NEXT:    movl %ecx, %edx
1566; SSE41-NEXT:    andl $1, %edx
1567; SSE41-NEXT:    negl %edx
1568; SSE41-NEXT:    movd %edx, %xmm0
1569; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
1570; SSE41-NEXT:    movl %ecx, %eax
1571; SSE41-NEXT:    shrb $2, %al
1572; SSE41-NEXT:    movzbl %al, %eax
1573; SSE41-NEXT:    andl $1, %eax
1574; SSE41-NEXT:    negl %eax
1575; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1576; SSE41-NEXT:    shrb $3, %cl
1577; SSE41-NEXT:    movzbl %cl, %eax
1578; SSE41-NEXT:    negl %eax
1579; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1580; SSE41-NEXT:    retq
1581;
1582; AVX1-LABEL: load_sext_4i1_to_4i32:
1583; AVX1:       # %bb.0: # %entry
1584; AVX1-NEXT:    movzbl (%rdi), %eax
1585; AVX1-NEXT:    movzbl %al, %ecx
1586; AVX1-NEXT:    shrb %al
1587; AVX1-NEXT:    movzbl %al, %eax
1588; AVX1-NEXT:    andl $1, %eax
1589; AVX1-NEXT:    negl %eax
1590; AVX1-NEXT:    movl %ecx, %edx
1591; AVX1-NEXT:    andl $1, %edx
1592; AVX1-NEXT:    negl %edx
1593; AVX1-NEXT:    vmovd %edx, %xmm0
1594; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1595; AVX1-NEXT:    movl %ecx, %eax
1596; AVX1-NEXT:    shrb $2, %al
1597; AVX1-NEXT:    movzbl %al, %eax
1598; AVX1-NEXT:    andl $1, %eax
1599; AVX1-NEXT:    negl %eax
1600; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1601; AVX1-NEXT:    shrb $3, %cl
1602; AVX1-NEXT:    movzbl %cl, %eax
1603; AVX1-NEXT:    negl %eax
1604; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1605; AVX1-NEXT:    retq
1606;
1607; AVX2-LABEL: load_sext_4i1_to_4i32:
1608; AVX2:       # %bb.0: # %entry
1609; AVX2-NEXT:    movzbl (%rdi), %eax
1610; AVX2-NEXT:    movzbl %al, %ecx
1611; AVX2-NEXT:    shrb %al
1612; AVX2-NEXT:    movzbl %al, %eax
1613; AVX2-NEXT:    andl $1, %eax
1614; AVX2-NEXT:    negl %eax
1615; AVX2-NEXT:    movl %ecx, %edx
1616; AVX2-NEXT:    andl $1, %edx
1617; AVX2-NEXT:    negl %edx
1618; AVX2-NEXT:    vmovd %edx, %xmm0
1619; AVX2-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1620; AVX2-NEXT:    movl %ecx, %eax
1621; AVX2-NEXT:    shrb $2, %al
1622; AVX2-NEXT:    movzbl %al, %eax
1623; AVX2-NEXT:    andl $1, %eax
1624; AVX2-NEXT:    negl %eax
1625; AVX2-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1626; AVX2-NEXT:    shrb $3, %cl
1627; AVX2-NEXT:    movzbl %cl, %eax
1628; AVX2-NEXT:    negl %eax
1629; AVX2-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1630; AVX2-NEXT:    retq
1631;
1632; AVX512F-LABEL: load_sext_4i1_to_4i32:
1633; AVX512F:       # %bb.0: # %entry
1634; AVX512F-NEXT:    movzbl (%rdi), %eax
1635; AVX512F-NEXT:    kmovw %eax, %k1
1636; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1637; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1638; AVX512F-NEXT:    vzeroupper
1639; AVX512F-NEXT:    retq
1640;
1641; AVX512BW-LABEL: load_sext_4i1_to_4i32:
1642; AVX512BW:       # %bb.0: # %entry
1643; AVX512BW-NEXT:    movzbl (%rdi), %eax
1644; AVX512BW-NEXT:    kmovd %eax, %k1
1645; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1646; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1647; AVX512BW-NEXT:    vzeroupper
1648; AVX512BW-NEXT:    retq
1649;
1650; X86-SSE2-LABEL: load_sext_4i1_to_4i32:
1651; X86-SSE2:       # %bb.0: # %entry
1652; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1653; X86-SSE2-NEXT:    movzbl (%eax), %eax
1654; X86-SSE2-NEXT:    movl %eax, %ecx
1655; X86-SSE2-NEXT:    shrb $3, %cl
1656; X86-SSE2-NEXT:    movzbl %cl, %ecx
1657; X86-SSE2-NEXT:    negl %ecx
1658; X86-SSE2-NEXT:    movd %ecx, %xmm0
1659; X86-SSE2-NEXT:    movl %eax, %ecx
1660; X86-SSE2-NEXT:    shrb $2, %cl
1661; X86-SSE2-NEXT:    movzbl %cl, %ecx
1662; X86-SSE2-NEXT:    andl $1, %ecx
1663; X86-SSE2-NEXT:    negl %ecx
1664; X86-SSE2-NEXT:    movd %ecx, %xmm1
1665; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1666; X86-SSE2-NEXT:    movzbl %al, %ecx
1667; X86-SSE2-NEXT:    andl $1, %ecx
1668; X86-SSE2-NEXT:    negl %ecx
1669; X86-SSE2-NEXT:    movd %ecx, %xmm0
1670; X86-SSE2-NEXT:    shrb %al
1671; X86-SSE2-NEXT:    movzbl %al, %eax
1672; X86-SSE2-NEXT:    andl $1, %eax
1673; X86-SSE2-NEXT:    negl %eax
1674; X86-SSE2-NEXT:    movd %eax, %xmm2
1675; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1676; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1677; X86-SSE2-NEXT:    retl
1678;
1679; X86-SSE41-LABEL: load_sext_4i1_to_4i32:
1680; X86-SSE41:       # %bb.0: # %entry
1681; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1682; X86-SSE41-NEXT:    movzbl (%eax), %eax
1683; X86-SSE41-NEXT:    movl %eax, %ecx
1684; X86-SSE41-NEXT:    shrb %cl
1685; X86-SSE41-NEXT:    movzbl %cl, %ecx
1686; X86-SSE41-NEXT:    andl $1, %ecx
1687; X86-SSE41-NEXT:    negl %ecx
1688; X86-SSE41-NEXT:    movzbl %al, %edx
1689; X86-SSE41-NEXT:    andl $1, %edx
1690; X86-SSE41-NEXT:    negl %edx
1691; X86-SSE41-NEXT:    movd %edx, %xmm0
1692; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1693; X86-SSE41-NEXT:    movl %eax, %ecx
1694; X86-SSE41-NEXT:    shrb $2, %cl
1695; X86-SSE41-NEXT:    movzbl %cl, %ecx
1696; X86-SSE41-NEXT:    andl $1, %ecx
1697; X86-SSE41-NEXT:    negl %ecx
1698; X86-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
1699; X86-SSE41-NEXT:    shrb $3, %al
1700; X86-SSE41-NEXT:    movzbl %al, %eax
1701; X86-SSE41-NEXT:    negl %eax
1702; X86-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1703; X86-SSE41-NEXT:    retl
1704entry:
1705 %X = load <4 x i1>, ptr %ptr
1706 %Y = sext <4 x i1> %X to <4 x i32>
1707 ret <4 x i32> %Y
1708}
1709
; Test: load a <4 x i8> vector from %ptr and sign-extend every lane to i32.
; The SSE4.1 and AVX runs are expected to fold the load into one
; (v)pmovsxbd; the SSE2 and SSSE3 runs load 32 bits with movd, duplicate the
; bytes upward with punpcklbw/punpcklwd and finish with psrad $24.
; The check lines below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the script rather than editing by hand.
1710define <4 x i32> @load_sext_4i8_to_4i32(ptr%ptr) {
1711; SSE2-LABEL: load_sext_4i8_to_4i32:
1712; SSE2:       # %bb.0: # %entry
1713; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1714; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1715; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1716; SSE2-NEXT:    psrad $24, %xmm0
1717; SSE2-NEXT:    retq
1718;
1719; SSSE3-LABEL: load_sext_4i8_to_4i32:
1720; SSSE3:       # %bb.0: # %entry
1721; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1722; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1723; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1724; SSSE3-NEXT:    psrad $24, %xmm0
1725; SSSE3-NEXT:    retq
1726;
1727; SSE41-LABEL: load_sext_4i8_to_4i32:
1728; SSE41:       # %bb.0: # %entry
1729; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1730; SSE41-NEXT:    retq
1731;
1732; AVX-LABEL: load_sext_4i8_to_4i32:
1733; AVX:       # %bb.0: # %entry
1734; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
1735; AVX-NEXT:    retq
1736;
1737; X86-SSE2-LABEL: load_sext_4i8_to_4i32:
1738; X86-SSE2:       # %bb.0: # %entry
1739; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1740; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1741; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1742; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1743; X86-SSE2-NEXT:    psrad $24, %xmm0
1744; X86-SSE2-NEXT:    retl
1745;
1746; X86-SSE41-LABEL: load_sext_4i8_to_4i32:
1747; X86-SSE41:       # %bb.0: # %entry
1748; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1749; X86-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1750; X86-SSE41-NEXT:    retl
1751entry:
1752 %X = load <4 x i8>, ptr %ptr
1753 %Y = sext <4 x i8> %X to <4 x i32>
1754 ret <4 x i32> %Y
1755}
1756
; Test: load a <4 x i1> mask from %ptr and sign-extend every lane to i64,
; so each result lane is either 0 or -1.
; Expected lowering per run, as pinned by the check lines below:
;  - SSE runs (64-bit and 32-bit) insert the individual bits into a vector
;    (pinsrw on SSE2/SSSE3, pinsrb on SSE4.1), then use psllq $63, psrad $31
;    and pshufd to splat the sign across each 64-bit lane; the result is
;    produced in the xmm0/xmm1 pair.
;  - The AVX1 run builds a 4 x i32 vector of 0/-1 with vpinsrd, widens both
;    halves with vpmovsxdq and joins them with vinsertf128.
;  - The AVX2 run negates each bit as a 64-bit scalar (negq) and assembles
;    the lanes with vmovq, vpunpcklqdq and vinserti128.
;  - The AVX-512 runs move the loaded byte into a mask register and use a
;    zero-masked vpternlogq $255.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script rather than editing by hand.
1757define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
1758; SSE2-LABEL: load_sext_4i1_to_4i64:
1759; SSE2:       # %bb.0: # %entry
1760; SSE2-NEXT:    movzbl (%rdi), %eax
1761; SSE2-NEXT:    movl %eax, %ecx
1762; SSE2-NEXT:    shrb %cl
1763; SSE2-NEXT:    andb $1, %cl
1764; SSE2-NEXT:    movzbl %cl, %ecx
1765; SSE2-NEXT:    movl %eax, %edx
1766; SSE2-NEXT:    andb $1, %dl
1767; SSE2-NEXT:    movzbl %dl, %edx
1768; SSE2-NEXT:    movd %edx, %xmm1
1769; SSE2-NEXT:    pinsrw $2, %ecx, %xmm1
1770; SSE2-NEXT:    movl %eax, %ecx
1771; SSE2-NEXT:    shrb $2, %cl
1772; SSE2-NEXT:    andb $1, %cl
1773; SSE2-NEXT:    movzbl %cl, %ecx
1774; SSE2-NEXT:    pinsrw $4, %ecx, %xmm1
1775; SSE2-NEXT:    shrb $3, %al
1776; SSE2-NEXT:    movzbl %al, %eax
1777; SSE2-NEXT:    pinsrw $6, %eax, %xmm1
1778; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1779; SSE2-NEXT:    psllq $63, %xmm0
1780; SSE2-NEXT:    psrad $31, %xmm0
1781; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1782; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1783; SSE2-NEXT:    psllq $63, %xmm1
1784; SSE2-NEXT:    psrad $31, %xmm1
1785; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1786; SSE2-NEXT:    retq
1787;
1788; SSSE3-LABEL: load_sext_4i1_to_4i64:
1789; SSSE3:       # %bb.0: # %entry
1790; SSSE3-NEXT:    movzbl (%rdi), %eax
1791; SSSE3-NEXT:    movl %eax, %ecx
1792; SSSE3-NEXT:    shrb %cl
1793; SSSE3-NEXT:    andb $1, %cl
1794; SSSE3-NEXT:    movzbl %cl, %ecx
1795; SSSE3-NEXT:    movl %eax, %edx
1796; SSSE3-NEXT:    andb $1, %dl
1797; SSSE3-NEXT:    movzbl %dl, %edx
1798; SSSE3-NEXT:    movd %edx, %xmm1
1799; SSSE3-NEXT:    pinsrw $2, %ecx, %xmm1
1800; SSSE3-NEXT:    movl %eax, %ecx
1801; SSSE3-NEXT:    shrb $2, %cl
1802; SSSE3-NEXT:    andb $1, %cl
1803; SSSE3-NEXT:    movzbl %cl, %ecx
1804; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm1
1805; SSSE3-NEXT:    shrb $3, %al
1806; SSSE3-NEXT:    movzbl %al, %eax
1807; SSSE3-NEXT:    pinsrw $6, %eax, %xmm1
1808; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1809; SSSE3-NEXT:    psllq $63, %xmm0
1810; SSSE3-NEXT:    psrad $31, %xmm0
1811; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1812; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1813; SSSE3-NEXT:    psllq $63, %xmm1
1814; SSSE3-NEXT:    psrad $31, %xmm1
1815; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1816; SSSE3-NEXT:    retq
1817;
1818; SSE41-LABEL: load_sext_4i1_to_4i64:
1819; SSE41:       # %bb.0: # %entry
1820; SSE41-NEXT:    movzbl (%rdi), %eax
1821; SSE41-NEXT:    movl %eax, %ecx
1822; SSE41-NEXT:    shrb %cl
1823; SSE41-NEXT:    andb $1, %cl
1824; SSE41-NEXT:    movzbl %cl, %ecx
1825; SSE41-NEXT:    movl %eax, %edx
1826; SSE41-NEXT:    andb $1, %dl
1827; SSE41-NEXT:    movzbl %dl, %edx
1828; SSE41-NEXT:    movd %edx, %xmm1
1829; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
1830; SSE41-NEXT:    movl %eax, %ecx
1831; SSE41-NEXT:    shrb $2, %cl
1832; SSE41-NEXT:    andb $1, %cl
1833; SSE41-NEXT:    movzbl %cl, %ecx
1834; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1835; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
1836; SSE41-NEXT:    shrb $3, %al
1837; SSE41-NEXT:    movzbl %al, %eax
1838; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
1839; SSE41-NEXT:    psllq $63, %xmm0
1840; SSE41-NEXT:    psrad $31, %xmm0
1841; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1842; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1843; SSE41-NEXT:    psllq $63, %xmm1
1844; SSE41-NEXT:    psrad $31, %xmm1
1845; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1846; SSE41-NEXT:    retq
1847;
1848; AVX1-LABEL: load_sext_4i1_to_4i64:
1849; AVX1:       # %bb.0: # %entry
1850; AVX1-NEXT:    movzbl (%rdi), %eax
1851; AVX1-NEXT:    movzbl %al, %ecx
1852; AVX1-NEXT:    shrb %al
1853; AVX1-NEXT:    movzbl %al, %eax
1854; AVX1-NEXT:    andl $1, %eax
1855; AVX1-NEXT:    negl %eax
1856; AVX1-NEXT:    movl %ecx, %edx
1857; AVX1-NEXT:    andl $1, %edx
1858; AVX1-NEXT:    negl %edx
1859; AVX1-NEXT:    vmovd %edx, %xmm0
1860; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
1861; AVX1-NEXT:    movl %ecx, %eax
1862; AVX1-NEXT:    shrb $2, %al
1863; AVX1-NEXT:    movzbl %al, %eax
1864; AVX1-NEXT:    andl $1, %eax
1865; AVX1-NEXT:    negl %eax
1866; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
1867; AVX1-NEXT:    shrb $3, %cl
1868; AVX1-NEXT:    movzbl %cl, %eax
1869; AVX1-NEXT:    negl %eax
1870; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1871; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1872; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1873; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1874; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1875; AVX1-NEXT:    retq
1876;
1877; AVX2-LABEL: load_sext_4i1_to_4i64:
1878; AVX2:       # %bb.0: # %entry
1879; AVX2-NEXT:    movzbl (%rdi), %eax
1880; AVX2-NEXT:    movl %eax, %ecx
1881; AVX2-NEXT:    shrb $3, %cl
1882; AVX2-NEXT:    movzbl %cl, %ecx
1883; AVX2-NEXT:    negq %rcx
1884; AVX2-NEXT:    vmovq %rcx, %xmm0
1885; AVX2-NEXT:    movzbl %al, %ecx
1886; AVX2-NEXT:    shrb $2, %al
1887; AVX2-NEXT:    movzbl %al, %eax
1888; AVX2-NEXT:    andl $1, %eax
1889; AVX2-NEXT:    negq %rax
1890; AVX2-NEXT:    vmovq %rax, %xmm1
1891; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1892; AVX2-NEXT:    movl %ecx, %eax
1893; AVX2-NEXT:    andl $1, %eax
1894; AVX2-NEXT:    negq %rax
1895; AVX2-NEXT:    vmovq %rax, %xmm1
1896; AVX2-NEXT:    shrb %cl
1897; AVX2-NEXT:    movzbl %cl, %eax
1898; AVX2-NEXT:    andl $1, %eax
1899; AVX2-NEXT:    negq %rax
1900; AVX2-NEXT:    vmovq %rax, %xmm2
1901; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1902; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1903; AVX2-NEXT:    retq
1904;
1905; AVX512F-LABEL: load_sext_4i1_to_4i64:
1906; AVX512F:       # %bb.0: # %entry
1907; AVX512F-NEXT:    movzbl (%rdi), %eax
1908; AVX512F-NEXT:    kmovw %eax, %k1
1909; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1910; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1911; AVX512F-NEXT:    retq
1912;
1913; AVX512BW-LABEL: load_sext_4i1_to_4i64:
1914; AVX512BW:       # %bb.0: # %entry
1915; AVX512BW-NEXT:    movzbl (%rdi), %eax
1916; AVX512BW-NEXT:    kmovd %eax, %k1
1917; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1918; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1919; AVX512BW-NEXT:    retq
1920;
1921; X86-SSE2-LABEL: load_sext_4i1_to_4i64:
1922; X86-SSE2:       # %bb.0: # %entry
1923; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1924; X86-SSE2-NEXT:    movzbl (%eax), %eax
1925; X86-SSE2-NEXT:    movl %eax, %ecx
1926; X86-SSE2-NEXT:    shrb %cl
1927; X86-SSE2-NEXT:    andb $1, %cl
1928; X86-SSE2-NEXT:    movzbl %cl, %ecx
1929; X86-SSE2-NEXT:    movl %eax, %edx
1930; X86-SSE2-NEXT:    andb $1, %dl
1931; X86-SSE2-NEXT:    movzbl %dl, %edx
1932; X86-SSE2-NEXT:    movd %edx, %xmm1
1933; X86-SSE2-NEXT:    pinsrw $2, %ecx, %xmm1
1934; X86-SSE2-NEXT:    movl %eax, %ecx
1935; X86-SSE2-NEXT:    shrb $2, %cl
1936; X86-SSE2-NEXT:    andb $1, %cl
1937; X86-SSE2-NEXT:    movzbl %cl, %ecx
1938; X86-SSE2-NEXT:    pinsrw $4, %ecx, %xmm1
1939; X86-SSE2-NEXT:    shrb $3, %al
1940; X86-SSE2-NEXT:    movzbl %al, %eax
1941; X86-SSE2-NEXT:    pinsrw $6, %eax, %xmm1
1942; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
1943; X86-SSE2-NEXT:    psllq $63, %xmm0
1944; X86-SSE2-NEXT:    psrad $31, %xmm0
1945; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1946; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
1947; X86-SSE2-NEXT:    psllq $63, %xmm1
1948; X86-SSE2-NEXT:    psrad $31, %xmm1
1949; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1950; X86-SSE2-NEXT:    retl
1951;
1952; X86-SSE41-LABEL: load_sext_4i1_to_4i64:
1953; X86-SSE41:       # %bb.0: # %entry
1954; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1955; X86-SSE41-NEXT:    movzbl (%eax), %eax
1956; X86-SSE41-NEXT:    movl %eax, %ecx
1957; X86-SSE41-NEXT:    shrb %cl
1958; X86-SSE41-NEXT:    andb $1, %cl
1959; X86-SSE41-NEXT:    movzbl %cl, %ecx
1960; X86-SSE41-NEXT:    movl %eax, %edx
1961; X86-SSE41-NEXT:    andb $1, %dl
1962; X86-SSE41-NEXT:    movzbl %dl, %edx
1963; X86-SSE41-NEXT:    movd %edx, %xmm1
1964; X86-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
1965; X86-SSE41-NEXT:    movl %eax, %ecx
1966; X86-SSE41-NEXT:    shrb $2, %cl
1967; X86-SSE41-NEXT:    andb $1, %cl
1968; X86-SSE41-NEXT:    movzbl %cl, %ecx
1969; X86-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1970; X86-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
1971; X86-SSE41-NEXT:    shrb $3, %al
1972; X86-SSE41-NEXT:    movzbl %al, %eax
1973; X86-SSE41-NEXT:    pinsrb $12, %eax, %xmm1
1974; X86-SSE41-NEXT:    psllq $63, %xmm0
1975; X86-SSE41-NEXT:    psrad $31, %xmm0
1976; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1977; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1978; X86-SSE41-NEXT:    psllq $63, %xmm1
1979; X86-SSE41-NEXT:    psrad $31, %xmm1
1980; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1981; X86-SSE41-NEXT:    retl
1982entry:
1983 %X = load <4 x i1>, ptr %ptr
1984 %Y = sext <4 x i1> %X to <4 x i64>
1985 ret <4 x i64> %Y
1986}
1987
; Test: load a <4 x i8> vector from %ptr and sign-extend every lane to i64.
; Expected lowering per run, as pinned by the check lines below:
;  - SSE2/SSSE3 (64-bit and 32-bit) load 32 bits with movd, unpack to dwords,
;    apply psrad $24, then interleave with a pcmpgtd-against-zero sign mask
;    to form the low (xmm0) and high (xmm1) pairs of i64 lanes.
;  - SSE4.1 uses two pmovsxbq loads, at offsets 0 and 2 from the pointer.
;  - AVX1 uses the same two loads and joins them with vinsertf128.
;  - AVX2 and AVX-512 use a single 256-bit vpmovsxbq from memory.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script rather than editing by hand.
1988define <4 x i64> @load_sext_4i8_to_4i64(ptr%ptr) {
1989; SSE2-LABEL: load_sext_4i8_to_4i64:
1990; SSE2:       # %bb.0: # %entry
1991; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1992; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1993; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1994; SSE2-NEXT:    psrad $24, %xmm1
1995; SSE2-NEXT:    pxor %xmm2, %xmm2
1996; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
1997; SSE2-NEXT:    movdqa %xmm1, %xmm0
1998; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1999; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2000; SSE2-NEXT:    retq
2001;
2002; SSSE3-LABEL: load_sext_4i8_to_4i64:
2003; SSSE3:       # %bb.0: # %entry
2004; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2005; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2006; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2007; SSSE3-NEXT:    psrad $24, %xmm1
2008; SSSE3-NEXT:    pxor %xmm2, %xmm2
2009; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2010; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2011; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2012; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2013; SSSE3-NEXT:    retq
2014;
2015; SSE41-LABEL: load_sext_4i8_to_4i64:
2016; SSE41:       # %bb.0: # %entry
2017; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
2018; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
2019; SSE41-NEXT:    retq
2020;
2021; AVX1-LABEL: load_sext_4i8_to_4i64:
2022; AVX1:       # %bb.0: # %entry
2023; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
2024; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm1
2025; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2026; AVX1-NEXT:    retq
2027;
2028; AVX2-LABEL: load_sext_4i8_to_4i64:
2029; AVX2:       # %bb.0: # %entry
2030; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2031; AVX2-NEXT:    retq
2032;
2033; AVX512-LABEL: load_sext_4i8_to_4i64:
2034; AVX512:       # %bb.0: # %entry
2035; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
2036; AVX512-NEXT:    retq
2037;
2038; X86-SSE2-LABEL: load_sext_4i8_to_4i64:
2039; X86-SSE2:       # %bb.0: # %entry
2040; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2041; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2042; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2043; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2044; X86-SSE2-NEXT:    psrad $24, %xmm1
2045; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2046; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2047; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2048; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2049; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2050; X86-SSE2-NEXT:    retl
2051;
2052; X86-SSE41-LABEL: load_sext_4i8_to_4i64:
2053; X86-SSE41:       # %bb.0: # %entry
2054; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2055; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2056; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2057; X86-SSE41-NEXT:    retl
2058entry:
2059 %X = load <4 x i8>, ptr %ptr
2060 %Y = sext <4 x i8> %X to <4 x i64>
2061 ret <4 x i64> %Y
2062}
2063
; Test: load a <4 x i8> vector from %ptr, sign-extend it to <4 x i64>, and
; return only the upper two lanes (elements 2 and 3) as a <2 x i64>.
; Expected lowering per run, as pinned by the check lines below:
;  - SSE4.1 and AVX1 narrow the whole sequence to one (v)pmovsxbq from
;    offset 2 of the pointer.
;  - AVX2 and AVX-512 extend all four lanes into ymm0 and then take the
;    upper half with vextracti128.
;  - SSE2/SSSE3 unpack to dwords, apply psrad $24, and interleave the high
;    dwords with a pcmpgtd-against-zero sign mask via punpckhdq.
; The second shufflevector operand is never selected by the <2, 3> mask, so
; it is spelled as poison (the deprecated undef placeholder is avoided).
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script rather than editing by hand.
2064define <2 x i64> @load_sext_4i8_to_4i64_extract(ptr%ptr) {
2065; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
2066; SSE2:       # %bb.0:
2067; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2068; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2069; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2070; SSE2-NEXT:    psrad $24, %xmm0
2071; SSE2-NEXT:    pxor %xmm1, %xmm1
2072; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2073; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2074; SSE2-NEXT:    retq
2075;
2076; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
2077; SSSE3:       # %bb.0:
2078; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2079; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2080; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2081; SSSE3-NEXT:    psrad $24, %xmm0
2082; SSSE3-NEXT:    pxor %xmm1, %xmm1
2083; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2084; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2085; SSSE3-NEXT:    retq
2086;
2087; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
2088; SSE41:       # %bb.0:
2089; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm0
2090; SSE41-NEXT:    retq
2091;
2092; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
2093; AVX1:       # %bb.0:
2094; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
2095; AVX1-NEXT:    retq
2096;
2097; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
2098; AVX2:       # %bb.0:
2099; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2100; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2101; AVX2-NEXT:    vzeroupper
2102; AVX2-NEXT:    retq
2103;
2104; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
2105; AVX512:       # %bb.0:
2106; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
2107; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
2108; AVX512-NEXT:    vzeroupper
2109; AVX512-NEXT:    retq
2110;
2111; X86-SSE2-LABEL: load_sext_4i8_to_4i64_extract:
2112; X86-SSE2:       # %bb.0:
2113; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2114; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2115; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2116; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2117; X86-SSE2-NEXT:    psrad $24, %xmm0
2118; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
2119; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2120; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2121; X86-SSE2-NEXT:    retl
2122;
2123; X86-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
2124; X86-SSE41:       # %bb.0:
2125; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2126; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm0
2127; X86-SSE41-NEXT:    retl
2128 %ld = load <4 x i8>, ptr %ptr
2129 %sext = sext <4 x i8> %ld to <4 x i64>
2130 %extract = shufflevector <4 x i64> %sext, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
2131 ret <2 x i64> %extract
2132}
2133
; Test: load an <8 x i1> mask from %ptr and sign-extend every lane to i16,
; so each result lane is either 0 or -1.
; Expected lowering per run, as pinned by the check lines below:
;  - SSE and AVX1 runs load the byte, splat it across the vector
;    (pshuflw + pshufd), AND it with the per-lane bit constants
;    [1,2,4,8,16,32,64,128] and compare for equality against that same
;    constant with pcmpeqw.
;  - The AVX2 run does the same test but splats with vpbroadcastb straight
;    from memory.
;  - The AVX512F run goes through a mask register, a zero-masked
;    vpternlogd $255 and a vpmovdw truncation.
;  - The AVX512BW run converts the mask register directly with vpmovm2w.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script rather than editing by hand.
2134define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) {
2135; SSE-LABEL: load_sext_8i1_to_8i16:
2136; SSE:       # %bb.0: # %entry
2137; SSE-NEXT:    movzbl (%rdi), %eax
2138; SSE-NEXT:    movd %eax, %xmm0
2139; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2140; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2141; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2142; SSE-NEXT:    pand %xmm1, %xmm0
2143; SSE-NEXT:    pcmpeqw %xmm1, %xmm0
2144; SSE-NEXT:    retq
2145;
2146; AVX1-LABEL: load_sext_8i1_to_8i16:
2147; AVX1:       # %bb.0: # %entry
2148; AVX1-NEXT:    movzbl (%rdi), %eax
2149; AVX1-NEXT:    vmovd %eax, %xmm0
2150; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2151; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2152; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2153; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2154; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2155; AVX1-NEXT:    retq
2156;
2157; AVX2-LABEL: load_sext_8i1_to_8i16:
2158; AVX2:       # %bb.0: # %entry
2159; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
2160; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2161; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
2162; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2163; AVX2-NEXT:    retq
2164;
2165; AVX512F-LABEL: load_sext_8i1_to_8i16:
2166; AVX512F:       # %bb.0: # %entry
2167; AVX512F-NEXT:    movzbl (%rdi), %eax
2168; AVX512F-NEXT:    kmovw %eax, %k1
2169; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2170; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
2171; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2172; AVX512F-NEXT:    vzeroupper
2173; AVX512F-NEXT:    retq
2174;
2175; AVX512BW-LABEL: load_sext_8i1_to_8i16:
2176; AVX512BW:       # %bb.0: # %entry
2177; AVX512BW-NEXT:    movzbl (%rdi), %eax
2178; AVX512BW-NEXT:    kmovd %eax, %k0
2179; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
2180; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2181; AVX512BW-NEXT:    vzeroupper
2182; AVX512BW-NEXT:    retq
2183;
2184; X86-SSE-LABEL: load_sext_8i1_to_8i16:
2185; X86-SSE:       # %bb.0: # %entry
2186; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2187; X86-SSE-NEXT:    movzbl (%eax), %eax
2188; X86-SSE-NEXT:    movd %eax, %xmm0
2189; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2190; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2191; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
2192; X86-SSE-NEXT:    pand %xmm1, %xmm0
2193; X86-SSE-NEXT:    pcmpeqw %xmm1, %xmm0
2194; X86-SSE-NEXT:    retl
2195entry:
2196 %X = load <8 x i1>, ptr %ptr
2197 %Y = sext <8 x i1> %X to <8 x i16>
2198 ret <8 x i16> %Y
2199}
2200
; Test: load an <8 x i8> vector from %ptr and sign-extend every lane to i16.
; The SSE4.1 and AVX runs are expected to fold the load into one
; (v)pmovsxbw; the SSE2 and SSSE3 runs load 64 bits with movq, duplicate each
; byte into a word with punpcklbw and finish with psraw $8.
; The check lines below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the script rather than editing by hand.
2201define <8 x i16> @load_sext_8i8_to_8i16(ptr%ptr) {
2202; SSE2-LABEL: load_sext_8i8_to_8i16:
2203; SSE2:       # %bb.0: # %entry
2204; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2205; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2206; SSE2-NEXT:    psraw $8, %xmm0
2207; SSE2-NEXT:    retq
2208;
2209; SSSE3-LABEL: load_sext_8i8_to_8i16:
2210; SSSE3:       # %bb.0: # %entry
2211; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2212; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2213; SSSE3-NEXT:    psraw $8, %xmm0
2214; SSSE3-NEXT:    retq
2215;
2216; SSE41-LABEL: load_sext_8i8_to_8i16:
2217; SSE41:       # %bb.0: # %entry
2218; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2219; SSE41-NEXT:    retq
2220;
2221; AVX-LABEL: load_sext_8i8_to_8i16:
2222; AVX:       # %bb.0: # %entry
2223; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
2224; AVX-NEXT:    retq
2225;
2226; X86-SSE2-LABEL: load_sext_8i8_to_8i16:
2227; X86-SSE2:       # %bb.0: # %entry
2228; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2229; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2230; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2231; X86-SSE2-NEXT:    psraw $8, %xmm0
2232; X86-SSE2-NEXT:    retl
2233;
2234; X86-SSE41-LABEL: load_sext_8i8_to_8i16:
2235; X86-SSE41:       # %bb.0: # %entry
2236; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2237; X86-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2238; X86-SSE41-NEXT:    retl
2239entry:
2240 %X = load <8 x i8>, ptr %ptr
2241 %Y = sext <8 x i8> %X to <8 x i16>
2242 ret <8 x i16> %Y
2243}
2244
; Loads an <8 x i8> vector from %ptr and sign-extends every lane to i64,
; returning <8 x i64>. The autogenerated assertions inside the body record the
; expected llc output for each RUN configuration listed at the top of the file;
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2245define <8 x i64> @load_sext_8i8_to_8i64(ptr%ptr) {
2246; SSE2-LABEL: load_sext_8i8_to_8i64:
2247; SSE2:       # %bb.0: # %entry
2248; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2249; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2250; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2251; SSE2-NEXT:    psrad $24, %xmm1
2252; SSE2-NEXT:    pxor %xmm4, %xmm4
2253; SSE2-NEXT:    pxor %xmm3, %xmm3
2254; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2255; SSE2-NEXT:    movdqa %xmm1, %xmm0
2256; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2257; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2258; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2259; SSE2-NEXT:    psrad $24, %xmm3
2260; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
2261; SSE2-NEXT:    movdqa %xmm3, %xmm2
2262; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2263; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2264; SSE2-NEXT:    retq
2265;
2266; SSSE3-LABEL: load_sext_8i8_to_8i64:
2267; SSSE3:       # %bb.0: # %entry
2268; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2269; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2270; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2271; SSSE3-NEXT:    psrad $24, %xmm1
2272; SSSE3-NEXT:    pxor %xmm4, %xmm4
2273; SSSE3-NEXT:    pxor %xmm3, %xmm3
2274; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm3
2275; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2276; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2277; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2278; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2279; SSSE3-NEXT:    psrad $24, %xmm3
2280; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
2281; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2282; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2283; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2284; SSSE3-NEXT:    retq
2285;
2286; SSE41-LABEL: load_sext_8i8_to_8i64:
2287; SSE41:       # %bb.0: # %entry
2288; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
2289; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
2290; SSE41-NEXT:    pmovsxbq 4(%rdi), %xmm2
2291; SSE41-NEXT:    pmovsxbq 6(%rdi), %xmm3
2292; SSE41-NEXT:    retq
2293;
2294; AVX1-LABEL: load_sext_8i8_to_8i64:
2295; AVX1:       # %bb.0: # %entry
2296; AVX1-NEXT:    vpmovsxbq 6(%rdi), %xmm1
2297; AVX1-NEXT:    vpmovsxbq 4(%rdi), %xmm2
2298; AVX1-NEXT:    vpmovsxbq 2(%rdi), %xmm0
2299; AVX1-NEXT:    vpmovsxbq (%rdi), %xmm3
2300; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
2301; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
2302; AVX1-NEXT:    retq
2303;
2304; AVX2-LABEL: load_sext_8i8_to_8i64:
2305; AVX2:       # %bb.0: # %entry
2306; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2307; AVX2-NEXT:    vpmovsxbq 4(%rdi), %ymm1
2308; AVX2-NEXT:    retq
2309;
2310; AVX512-LABEL: load_sext_8i8_to_8i64:
2311; AVX512:       # %bb.0: # %entry
2312; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
2313; AVX512-NEXT:    retq
2314;
2315; X86-SSE2-LABEL: load_sext_8i8_to_8i64:
2316; X86-SSE2:       # %bb.0: # %entry
2317; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2318; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2319; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2320; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2321; X86-SSE2-NEXT:    psrad $24, %xmm1
2322; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
2323; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
2324; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
2325; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2326; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2327; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2328; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2329; X86-SSE2-NEXT:    psrad $24, %xmm3
2330; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
2331; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
2332; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2333; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2334; X86-SSE2-NEXT:    retl
2335;
2336; X86-SSE41-LABEL: load_sext_8i8_to_8i64:
2337; X86-SSE41:       # %bb.0: # %entry
2338; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2339; X86-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2340; X86-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2341; X86-SSE41-NEXT:    pmovsxbq 4(%eax), %xmm2
2342; X86-SSE41-NEXT:    pmovsxbq 6(%eax), %xmm3
2343; X86-SSE41-NEXT:    retl
2344entry:
2345 %X = load <8 x i8>, ptr %ptr
2346 %Y = sext <8 x i8> %X to <8 x i64>
2347 ret <8 x i64> %Y
2348}
2349
; Loads an <8 x i1> mask vector from %ptr and sign-extends every lane to i32,
; returning <8 x i32>. In the non-AVX512 expectations the loaded byte is
; broadcast, masked with per-lane bit constants (1,2,4,...,128) and compared for
; equality; the AVX512 expectations move the byte into a mask register instead.
; Assertions are autogenerated; regenerate with utils/update_llc_test_checks.py.
2350define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) {
2351; SSE-LABEL: load_sext_8i1_to_8i32:
2352; SSE:       # %bb.0: # %entry
2353; SSE-NEXT:    movzbl (%rdi), %eax
2354; SSE-NEXT:    movd %eax, %xmm0
2355; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2356; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8]
2357; SSE-NEXT:    movdqa %xmm1, %xmm0
2358; SSE-NEXT:    pand %xmm2, %xmm0
2359; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
2360; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
2361; SSE-NEXT:    pand %xmm2, %xmm1
2362; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
2363; SSE-NEXT:    retq
2364;
2365; AVX1-LABEL: load_sext_8i1_to_8i32:
2366; AVX1:       # %bb.0: # %entry
2367; AVX1-NEXT:    movzbl (%rdi), %eax
2368; AVX1-NEXT:    vmovd %eax, %xmm0
2369; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2370; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2371; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2372; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2373; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2374; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2375; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2376; AVX1-NEXT:    retq
2377;
2378; AVX2-LABEL: load_sext_8i1_to_8i32:
2379; AVX2:       # %bb.0: # %entry
2380; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
2381; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
2382; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2383; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
2384; AVX2-NEXT:    retq
2385;
2386; AVX512F-LABEL: load_sext_8i1_to_8i32:
2387; AVX512F:       # %bb.0: # %entry
2388; AVX512F-NEXT:    movzbl (%rdi), %eax
2389; AVX512F-NEXT:    kmovw %eax, %k1
2390; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2391; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2392; AVX512F-NEXT:    retq
2393;
2394; AVX512BW-LABEL: load_sext_8i1_to_8i32:
2395; AVX512BW:       # %bb.0: # %entry
2396; AVX512BW-NEXT:    movzbl (%rdi), %eax
2397; AVX512BW-NEXT:    kmovd %eax, %k1
2398; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2399; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2400; AVX512BW-NEXT:    retq
2401;
2402; X86-SSE-LABEL: load_sext_8i1_to_8i32:
2403; X86-SSE:       # %bb.0: # %entry
2404; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2405; X86-SSE-NEXT:    movzbl (%eax), %eax
2406; X86-SSE-NEXT:    movd %eax, %xmm0
2407; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2408; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8]
2409; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
2410; X86-SSE-NEXT:    pand %xmm2, %xmm0
2411; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm0
2412; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16,32,64,128]
2413; X86-SSE-NEXT:    pand %xmm2, %xmm1
2414; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm1
2415; X86-SSE-NEXT:    retl
2416entry:
2417 %X = load <8 x i1>, ptr %ptr
2418 %Y = sext <8 x i1> %X to <8 x i32>
2419 ret <8 x i32> %Y
2420}
2421
; Loads an <8 x i8> vector from %ptr and sign-extends every lane to i32,
; returning <8 x i32>. The autogenerated assertions inside the body record the
; expected llc output for each RUN configuration listed at the top of the file;
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2422define <8 x i32> @load_sext_8i8_to_8i32(ptr%ptr) {
2423; SSE2-LABEL: load_sext_8i8_to_8i32:
2424; SSE2:       # %bb.0: # %entry
2425; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2426; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2427; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2428; SSE2-NEXT:    psrad $24, %xmm0
2429; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2430; SSE2-NEXT:    psrad $24, %xmm1
2431; SSE2-NEXT:    retq
2432;
2433; SSSE3-LABEL: load_sext_8i8_to_8i32:
2434; SSSE3:       # %bb.0: # %entry
2435; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2436; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2437; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2438; SSSE3-NEXT:    psrad $24, %xmm0
2439; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2440; SSSE3-NEXT:    psrad $24, %xmm1
2441; SSSE3-NEXT:    retq
2442;
2443; SSE41-LABEL: load_sext_8i8_to_8i32:
2444; SSE41:       # %bb.0: # %entry
2445; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
2446; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
2447; SSE41-NEXT:    retq
2448;
2449; AVX1-LABEL: load_sext_8i8_to_8i32:
2450; AVX1:       # %bb.0: # %entry
2451; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm0
2452; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm1
2453; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2454; AVX1-NEXT:    retq
2455;
2456; AVX2-LABEL: load_sext_8i8_to_8i32:
2457; AVX2:       # %bb.0: # %entry
2458; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
2459; AVX2-NEXT:    retq
2460;
2461; AVX512-LABEL: load_sext_8i8_to_8i32:
2462; AVX512:       # %bb.0: # %entry
2463; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
2464; AVX512-NEXT:    retq
2465;
2466; X86-SSE2-LABEL: load_sext_8i8_to_8i32:
2467; X86-SSE2:       # %bb.0: # %entry
2468; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2469; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2470; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2471; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2472; X86-SSE2-NEXT:    psrad $24, %xmm0
2473; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2474; X86-SSE2-NEXT:    psrad $24, %xmm1
2475; X86-SSE2-NEXT:    retl
2476;
2477; X86-SSE41-LABEL: load_sext_8i8_to_8i32:
2478; X86-SSE41:       # %bb.0: # %entry
2479; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2480; X86-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
2481; X86-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
2482; X86-SSE41-NEXT:    retl
2483entry:
2484 %X = load <8 x i8>, ptr %ptr
2485 %Y = sext <8 x i8> %X to <8 x i32>
2486 ret <8 x i32> %Y
2487}
2488
; Loads a <16 x i1> mask vector from %ptr and sign-extends every lane to i8,
; returning <16 x i8>. In the non-AVX512 expectations the two loaded mask bytes
; are replicated across the lanes, masked with the repeating bit pattern
; 1,2,4,...,128 and compared for equality.
; NOTE(review): the function carries the readnone attribute although its body
; loads from %ptr -- confirm that this attribute is intentional for the test.
; Assertions are autogenerated; regenerate with utils/update_llc_test_checks.py.
2489define <16 x i8> @load_sext_16i1_to_16i8(ptr%ptr) nounwind readnone {
2490; SSE2-LABEL: load_sext_16i1_to_16i8:
2491; SSE2:       # %bb.0: # %entry
2492; SSE2-NEXT:    movzwl (%rdi), %eax
2493; SSE2-NEXT:    movd %eax, %xmm0
2494; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2495; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
2496; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2497; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2498; SSE2-NEXT:    pand %xmm1, %xmm0
2499; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
2500; SSE2-NEXT:    retq
2501;
2502; SSSE3-LABEL: load_sext_16i1_to_16i8:
2503; SSSE3:       # %bb.0: # %entry
2504; SSSE3-NEXT:    movzwl (%rdi), %eax
2505; SSSE3-NEXT:    movd %eax, %xmm0
2506; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2507; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2508; SSSE3-NEXT:    pand %xmm1, %xmm0
2509; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm0
2510; SSSE3-NEXT:    retq
2511;
2512; SSE41-LABEL: load_sext_16i1_to_16i8:
2513; SSE41:       # %bb.0: # %entry
2514; SSE41-NEXT:    movzwl (%rdi), %eax
2515; SSE41-NEXT:    movd %eax, %xmm0
2516; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2517; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2518; SSE41-NEXT:    pand %xmm1, %xmm0
2519; SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
2520; SSE41-NEXT:    retq
2521;
2522; AVX1-LABEL: load_sext_16i1_to_16i8:
2523; AVX1:       # %bb.0: # %entry
2524; AVX1-NEXT:    movzwl (%rdi), %eax
2525; AVX1-NEXT:    vmovd %eax, %xmm0
2526; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2527; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
2528; AVX1-NEXT:    # xmm1 = mem[0,0]
2529; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2530; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
2531; AVX1-NEXT:    retq
2532;
2533; AVX2-LABEL: load_sext_16i1_to_16i8:
2534; AVX2:       # %bb.0: # %entry
2535; AVX2-NEXT:    movzwl (%rdi), %eax
2536; AVX2-NEXT:    vmovd %eax, %xmm0
2537; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2538; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
2539; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
2540; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
2541; AVX2-NEXT:    retq
2542;
2543; AVX512F-LABEL: load_sext_16i1_to_16i8:
2544; AVX512F:       # %bb.0: # %entry
2545; AVX512F-NEXT:    kmovw (%rdi), %k1
2546; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2547; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2548; AVX512F-NEXT:    vzeroupper
2549; AVX512F-NEXT:    retq
2550;
2551; AVX512BW-LABEL: load_sext_16i1_to_16i8:
2552; AVX512BW:       # %bb.0: # %entry
2553; AVX512BW-NEXT:    kmovw (%rdi), %k0
2554; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
2555; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2556; AVX512BW-NEXT:    vzeroupper
2557; AVX512BW-NEXT:    retq
2558;
2559; X86-SSE2-LABEL: load_sext_16i1_to_16i8:
2560; X86-SSE2:       # %bb.0: # %entry
2561; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2562; X86-SSE2-NEXT:    movzwl (%eax), %eax
2563; X86-SSE2-NEXT:    movd %eax, %xmm0
2564; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2565; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
2566; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2567; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2568; X86-SSE2-NEXT:    pand %xmm1, %xmm0
2569; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
2570; X86-SSE2-NEXT:    retl
2571;
2572; X86-SSE41-LABEL: load_sext_16i1_to_16i8:
2573; X86-SSE41:       # %bb.0: # %entry
2574; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2575; X86-SSE41-NEXT:    movzwl (%eax), %eax
2576; X86-SSE41-NEXT:    movd %eax, %xmm0
2577; X86-SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
2578; X86-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2579; X86-SSE41-NEXT:    pand %xmm1, %xmm0
2580; X86-SSE41-NEXT:    pcmpeqb %xmm1, %xmm0
2581; X86-SSE41-NEXT:    retl
2582entry:
2583 %X = load <16 x i1>, ptr %ptr
2584 %Y = sext <16 x i1> %X to <16 x i8>
2585 ret <16 x i8> %Y
2586}
2587
; Loads a <16 x i1> mask vector from %ptr and sign-extends every lane to i16,
; returning <16 x i16>. In the non-AVX512 expectations the loaded 16-bit mask is
; broadcast to every lane, masked with per-lane bit constants (1,2,4,...,32768)
; and compared for equality; the AVX512 expectations load the mask with kmovw.
; Assertions are autogenerated; regenerate with utils/update_llc_test_checks.py.
2588define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) {
2589; SSE-LABEL: load_sext_16i1_to_16i16:
2590; SSE:       # %bb.0: # %entry
2591; SSE-NEXT:    movzwl (%rdi), %eax
2592; SSE-NEXT:    movd %eax, %xmm0
2593; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2594; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2595; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
2596; SSE-NEXT:    movdqa %xmm1, %xmm0
2597; SSE-NEXT:    pand %xmm2, %xmm0
2598; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
2599; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
2600; SSE-NEXT:    pand %xmm2, %xmm1
2601; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
2602; SSE-NEXT:    retq
2603;
2604; AVX1-LABEL: load_sext_16i1_to_16i16:
2605; AVX1:       # %bb.0: # %entry
2606; AVX1-NEXT:    movzwl (%rdi), %eax
2607; AVX1-NEXT:    vmovd %eax, %xmm0
2608; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2609; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2610; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2611; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2612; AVX1-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2613; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2614; AVX1-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2615; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2616; AVX1-NEXT:    retq
2617;
2618; AVX2-LABEL: load_sext_16i1_to_16i16:
2619; AVX2:       # %bb.0: # %entry
2620; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
2621; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
2622; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2623; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
2624; AVX2-NEXT:    retq
2625;
2626; AVX512F-LABEL: load_sext_16i1_to_16i16:
2627; AVX512F:       # %bb.0: # %entry
2628; AVX512F-NEXT:    kmovw (%rdi), %k1
2629; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2630; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
2631; AVX512F-NEXT:    retq
2632;
2633; AVX512BW-LABEL: load_sext_16i1_to_16i16:
2634; AVX512BW:       # %bb.0: # %entry
2635; AVX512BW-NEXT:    kmovw (%rdi), %k0
2636; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
2637; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2638; AVX512BW-NEXT:    retq
2639;
2640; X86-SSE-LABEL: load_sext_16i1_to_16i16:
2641; X86-SSE:       # %bb.0: # %entry
2642; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2643; X86-SSE-NEXT:    movzwl (%eax), %eax
2644; X86-SSE-NEXT:    movd %eax, %xmm0
2645; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2646; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
2647; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
2648; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
2649; X86-SSE-NEXT:    pand %xmm2, %xmm0
2650; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
2651; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
2652; X86-SSE-NEXT:    pand %xmm2, %xmm1
2653; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
2654; X86-SSE-NEXT:    retl
2655entry:
2656 %X = load <16 x i1>, ptr %ptr
2657 %Y = sext <16 x i1> %X to <16 x i16>
2658 ret <16 x i16> %Y
2659}
2660
; Loads a <32 x i1> mask vector from %ptr and sign-extends every lane to i8,
; returning <32 x i8>. In the non-AVX512 expectations the four loaded mask bytes
; are replicated across the lanes, masked with the repeating bit pattern
; 1,2,4,...,128 and compared for equality.
; NOTE(review): the function carries the readnone attribute although its body
; loads from %ptr -- confirm that this attribute is intentional for the test.
; Assertions are autogenerated; regenerate with utils/update_llc_test_checks.py.
2661define <32 x i8> @load_sext_32i1_to_32i8(ptr%ptr) nounwind readnone {
2662; SSE-LABEL: load_sext_32i1_to_32i8:
2663; SSE:       # %bb.0: # %entry
2664; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2665; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2666; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
2667; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2668; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2669; SSE-NEXT:    pand %xmm2, %xmm0
2670; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
2671; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
2672; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2673; SSE-NEXT:    pand %xmm2, %xmm1
2674; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
2675; SSE-NEXT:    retq
2676;
2677; AVX1-LABEL: load_sext_32i1_to_32i8:
2678; AVX1:       # %bb.0: # %entry
2679; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2680; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2681; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
2682; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
2683; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2684; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
2685; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2686; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2687; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
2688; AVX1-NEXT:    # xmm2 = mem[0,0]
2689; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
2690; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
2691; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2692; AVX1-NEXT:    retq
2693;
2694; AVX2-LABEL: load_sext_32i1_to_32i8:
2695; AVX2:       # %bb.0: # %entry
2696; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2697; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2698; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
2699; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2700; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2701; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
2702; AVX2-NEXT:    retq
2703;
2704; AVX512F-LABEL: load_sext_32i1_to_32i8:
2705; AVX512F:       # %bb.0: # %entry
2706; AVX512F-NEXT:    kmovw (%rdi), %k1
2707; AVX512F-NEXT:    kmovw 2(%rdi), %k2
2708; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2709; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2710; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
2711; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
2712; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2713; AVX512F-NEXT:    retq
2714;
2715; AVX512BW-LABEL: load_sext_32i1_to_32i8:
2716; AVX512BW:       # %bb.0: # %entry
2717; AVX512BW-NEXT:    kmovd (%rdi), %k0
2718; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
2719; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2720; AVX512BW-NEXT:    retq
2721;
2722; X86-SSE-LABEL: load_sext_32i1_to_32i8:
2723; X86-SSE:       # %bb.0: # %entry
2724; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
2725; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2726; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2727; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
2728; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2729; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2730; X86-SSE-NEXT:    pand %xmm2, %xmm0
2731; X86-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
2732; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
2733; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2734; X86-SSE-NEXT:    pand %xmm2, %xmm1
2735; X86-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
2736; X86-SSE-NEXT:    retl
2737entry:
2738 %X = load <32 x i1>, ptr %ptr
2739 %Y = sext <32 x i1> %X to <32 x i8>
2740 ret <32 x i8> %Y
2741}
2742
; Loads a <16 x i8> vector from %ptr and sign-extends every lane to i16,
; returning <16 x i16>. The autogenerated assertions inside the body record the
; expected llc output for each RUN configuration listed at the top of the file;
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2743define <16 x i16> @load_sext_16i8_to_16i16(ptr%ptr) {
2744; SSE2-LABEL: load_sext_16i8_to_16i16:
2745; SSE2:       # %bb.0: # %entry
2746; SSE2-NEXT:    movdqa (%rdi), %xmm1
2747; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2748; SSE2-NEXT:    psraw $8, %xmm0
2749; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2750; SSE2-NEXT:    psraw $8, %xmm1
2751; SSE2-NEXT:    retq
2752;
2753; SSSE3-LABEL: load_sext_16i8_to_16i16:
2754; SSSE3:       # %bb.0: # %entry
2755; SSSE3-NEXT:    movdqa (%rdi), %xmm1
2756; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2757; SSSE3-NEXT:    psraw $8, %xmm0
2758; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2759; SSSE3-NEXT:    psraw $8, %xmm1
2760; SSSE3-NEXT:    retq
2761;
2762; SSE41-LABEL: load_sext_16i8_to_16i16:
2763; SSE41:       # %bb.0: # %entry
2764; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2765; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
2766; SSE41-NEXT:    retq
2767;
2768; AVX1-LABEL: load_sext_16i8_to_16i16:
2769; AVX1:       # %bb.0: # %entry
2770; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm0
2771; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm1
2772; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2773; AVX1-NEXT:    retq
2774;
2775; AVX2-LABEL: load_sext_16i8_to_16i16:
2776; AVX2:       # %bb.0: # %entry
2777; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
2778; AVX2-NEXT:    retq
2779;
2780; AVX512-LABEL: load_sext_16i8_to_16i16:
2781; AVX512:       # %bb.0: # %entry
2782; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
2783; AVX512-NEXT:    retq
2784;
2785; X86-SSE2-LABEL: load_sext_16i8_to_16i16:
2786; X86-SSE2:       # %bb.0: # %entry
2787; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2788; X86-SSE2-NEXT:    movdqa (%eax), %xmm1
2789; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2790; X86-SSE2-NEXT:    psraw $8, %xmm0
2791; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2792; X86-SSE2-NEXT:    psraw $8, %xmm1
2793; X86-SSE2-NEXT:    retl
2794;
2795; X86-SSE41-LABEL: load_sext_16i8_to_16i16:
2796; X86-SSE41:       # %bb.0: # %entry
2797; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2798; X86-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2799; X86-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
2800; X86-SSE41-NEXT:    retl
2801entry:
2802 %X = load <16 x i8>, ptr %ptr
2803 %Y = sext <16 x i8> %X to <16 x i16>
2804 ret <16 x i16> %Y
2805}
2806
; Loads a <2 x i16> vector from %ptr and sign-extends every lane to i64,
; returning <2 x i64>. The autogenerated assertions inside the body record the
; expected llc output for each RUN configuration listed at the top of the file;
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2807define <2 x i64> @load_sext_2i16_to_2i64(ptr%ptr) {
2808; SSE2-LABEL: load_sext_2i16_to_2i64:
2809; SSE2:       # %bb.0: # %entry
2810; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2811; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2812; SSE2-NEXT:    pxor %xmm1, %xmm1
2813; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2814; SSE2-NEXT:    psrad $16, %xmm0
2815; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2816; SSE2-NEXT:    retq
2817;
2818; SSSE3-LABEL: load_sext_2i16_to_2i64:
2819; SSSE3:       # %bb.0: # %entry
2820; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2821; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2822; SSSE3-NEXT:    pxor %xmm1, %xmm1
2823; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
2824; SSSE3-NEXT:    psrad $16, %xmm0
2825; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2826; SSSE3-NEXT:    retq
2827;
2828; SSE41-LABEL: load_sext_2i16_to_2i64:
2829; SSE41:       # %bb.0: # %entry
2830; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
2831; SSE41-NEXT:    retq
2832;
2833; AVX-LABEL: load_sext_2i16_to_2i64:
2834; AVX:       # %bb.0: # %entry
2835; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
2836; AVX-NEXT:    retq
2837;
2838; X86-SSE2-LABEL: load_sext_2i16_to_2i64:
2839; X86-SSE2:       # %bb.0: # %entry
2840; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2841; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2842; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2843; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
2844; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
2845; X86-SSE2-NEXT:    psrad $16, %xmm0
2846; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2847; X86-SSE2-NEXT:    retl
2848;
2849; X86-SSE41-LABEL: load_sext_2i16_to_2i64:
2850; X86-SSE41:       # %bb.0: # %entry
2851; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2852; X86-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
2853; X86-SSE41-NEXT:    retl
2854entry:
2855 %X = load <2 x i16>, ptr %ptr
2856 %Y = sext <2 x i16> %X to <2 x i64>
2857 ret <2 x i64> %Y
2858}
2859
; Loads a <4 x i16> vector from %ptr and sign-extends every lane to i32,
; returning <4 x i32>. The autogenerated assertions inside the body record the
; expected llc output for each RUN configuration listed at the top of the file;
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2860define <4 x i32> @load_sext_4i16_to_4i32(ptr%ptr) {
2861; SSE2-LABEL: load_sext_4i16_to_4i32:
2862; SSE2:       # %bb.0: # %entry
2863; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2864; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2865; SSE2-NEXT:    psrad $16, %xmm0
2866; SSE2-NEXT:    retq
2867;
2868; SSSE3-LABEL: load_sext_4i16_to_4i32:
2869; SSSE3:       # %bb.0: # %entry
2870; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2871; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2872; SSSE3-NEXT:    psrad $16, %xmm0
2873; SSSE3-NEXT:    retq
2874;
2875; SSE41-LABEL: load_sext_4i16_to_4i32:
2876; SSE41:       # %bb.0: # %entry
2877; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
2878; SSE41-NEXT:    retq
2879;
2880; AVX-LABEL: load_sext_4i16_to_4i32:
2881; AVX:       # %bb.0: # %entry
2882; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
2883; AVX-NEXT:    retq
2884;
2885; X86-SSE2-LABEL: load_sext_4i16_to_4i32:
2886; X86-SSE2:       # %bb.0: # %entry
2887; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2888; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2889; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2890; X86-SSE2-NEXT:    psrad $16, %xmm0
2891; X86-SSE2-NEXT:    retl
2892;
2893; X86-SSE41-LABEL: load_sext_4i16_to_4i32:
2894; X86-SSE41:       # %bb.0: # %entry
2895; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2896; X86-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
2897; X86-SSE41-NEXT:    retl
2898entry:
2899 %X = load <4 x i16>, ptr %ptr
2900 %Y = sext <4 x i16> %X to <4 x i32>
2901 ret <4 x i32> %Y
2902}
2903
; Loads a <4 x i16> vector from %ptr and sign-extends every lane to i64,
; returning <4 x i64>. The autogenerated assertions inside the body record the
; expected llc output for each RUN configuration listed at the top of the file;
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2904define <4 x i64> @load_sext_4i16_to_4i64(ptr%ptr) {
2905; SSE2-LABEL: load_sext_4i16_to_4i64:
2906; SSE2:       # %bb.0: # %entry
2907; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2908; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2909; SSE2-NEXT:    psrad $16, %xmm1
2910; SSE2-NEXT:    pxor %xmm2, %xmm2
2911; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2912; SSE2-NEXT:    movdqa %xmm1, %xmm0
2913; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2914; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2915; SSE2-NEXT:    retq
2916;
2917; SSSE3-LABEL: load_sext_4i16_to_4i64:
2918; SSSE3:       # %bb.0: # %entry
2919; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2920; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2921; SSSE3-NEXT:    psrad $16, %xmm1
2922; SSSE3-NEXT:    pxor %xmm2, %xmm2
2923; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
2924; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2925; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2926; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2927; SSSE3-NEXT:    retq
2928;
2929; SSE41-LABEL: load_sext_4i16_to_4i64:
2930; SSE41:       # %bb.0: # %entry
2931; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
2932; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
2933; SSE41-NEXT:    retq
2934;
2935; AVX1-LABEL: load_sext_4i16_to_4i64:
2936; AVX1:       # %bb.0: # %entry
2937; AVX1-NEXT:    vpmovsxwq 4(%rdi), %xmm0
2938; AVX1-NEXT:    vpmovsxwq (%rdi), %xmm1
2939; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2940; AVX1-NEXT:    retq
2941;
2942; AVX2-LABEL: load_sext_4i16_to_4i64:
2943; AVX2:       # %bb.0: # %entry
2944; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
2945; AVX2-NEXT:    retq
2946;
2947; AVX512-LABEL: load_sext_4i16_to_4i64:
2948; AVX512:       # %bb.0: # %entry
2949; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
2950; AVX512-NEXT:    retq
2951;
2952; X86-SSE2-LABEL: load_sext_4i16_to_4i64:
2953; X86-SSE2:       # %bb.0: # %entry
2954; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
2955; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2956; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2957; X86-SSE2-NEXT:    psrad $16, %xmm1
2958; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2959; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
2960; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2961; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2962; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2963; X86-SSE2-NEXT:    retl
2964;
2965; X86-SSE41-LABEL: load_sext_4i16_to_4i64:
2966; X86-SSE41:       # %bb.0: # %entry
2967; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2968; X86-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
2969; X86-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
2970; X86-SSE41-NEXT:    retl
2971entry:
2972 %X = load <4 x i16>, ptr %ptr
2973 %Y = sext <4 x i16> %X to <4 x i64>
2974 ret <4 x i64> %Y
2975}
2976
; Sign-extending load: reads a <8 x i16> vector from %ptr and widens it to
; <8 x i32>. The checked output shows the load folded into (v)pmovsxwd when
; SSE4.1 or any AVX level is enabled, and a movdqa + punpcklwd/punpckhwd +
; psrad $16 sequence when only SSE2 or SSSE3 is available. The 256-bit result
; is returned in xmm0/xmm1 without AVX, and in ymm0 with AVX.
2977define <8 x i32> @load_sext_8i16_to_8i32(ptr%ptr) {
2978; SSE2-LABEL: load_sext_8i16_to_8i32:
2979; SSE2:       # %bb.0: # %entry
2980; SSE2-NEXT:    movdqa (%rdi), %xmm1
2981; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2982; SSE2-NEXT:    psrad $16, %xmm0
2983; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2984; SSE2-NEXT:    psrad $16, %xmm1
2985; SSE2-NEXT:    retq
2986;
2987; SSSE3-LABEL: load_sext_8i16_to_8i32:
2988; SSSE3:       # %bb.0: # %entry
2989; SSSE3-NEXT:    movdqa (%rdi), %xmm1
2990; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2991; SSSE3-NEXT:    psrad $16, %xmm0
2992; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2993; SSSE3-NEXT:    psrad $16, %xmm1
2994; SSSE3-NEXT:    retq
2995;
2996; SSE41-LABEL: load_sext_8i16_to_8i32:
2997; SSE41:       # %bb.0: # %entry
2998; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
2999; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
3000; SSE41-NEXT:    retq
3001;
3002; AVX1-LABEL: load_sext_8i16_to_8i32:
3003; AVX1:       # %bb.0: # %entry
3004; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
3005; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
3006; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3007; AVX1-NEXT:    retq
3008;
3009; AVX2-LABEL: load_sext_8i16_to_8i32:
3010; AVX2:       # %bb.0: # %entry
3011; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
3012; AVX2-NEXT:    retq
3013;
3014; AVX512-LABEL: load_sext_8i16_to_8i32:
3015; AVX512:       # %bb.0: # %entry
3016; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
3017; AVX512-NEXT:    retq
3018;
3019; X86-SSE2-LABEL: load_sext_8i16_to_8i32:
3020; X86-SSE2:       # %bb.0: # %entry
3021; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3022; X86-SSE2-NEXT:    movdqa (%eax), %xmm1
3023; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3024; X86-SSE2-NEXT:    psrad $16, %xmm0
3025; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3026; X86-SSE2-NEXT:    psrad $16, %xmm1
3027; X86-SSE2-NEXT:    retl
3028;
3029; X86-SSE41-LABEL: load_sext_8i16_to_8i32:
3030; X86-SSE41:       # %bb.0: # %entry
3031; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3032; X86-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
3033; X86-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
3034; X86-SSE41-NEXT:    retl
3035entry:
3036 %X = load <8 x i16>, ptr %ptr
3037 %Y = sext <8 x i16> %X to <8 x i32>
3038 ret <8 x i32> %Y
3039}
3040
; Sign-extending load: reads a <2 x i32> vector (64 bits) from %ptr and widens
; it to <2 x i64>. With SSE4.1 or AVX the checked output is a single
; (v)pmovsxdq with a memory operand. With only SSE2 or SSSE3 the value is
; loaded with movq, a per-lane sign mask is built with pxor + pcmpgtd, and
; punpckldq interleaves each value with its sign mask.
3041define <2 x i64> @load_sext_2i32_to_2i64(ptr%ptr) {
3042; SSE2-LABEL: load_sext_2i32_to_2i64:
3043; SSE2:       # %bb.0: # %entry
3044; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3045; SSE2-NEXT:    pxor %xmm1, %xmm1
3046; SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
3047; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3048; SSE2-NEXT:    retq
3049;
3050; SSSE3-LABEL: load_sext_2i32_to_2i64:
3051; SSSE3:       # %bb.0: # %entry
3052; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3053; SSSE3-NEXT:    pxor %xmm1, %xmm1
3054; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
3055; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3056; SSSE3-NEXT:    retq
3057;
3058; SSE41-LABEL: load_sext_2i32_to_2i64:
3059; SSE41:       # %bb.0: # %entry
3060; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
3061; SSE41-NEXT:    retq
3062;
3063; AVX-LABEL: load_sext_2i32_to_2i64:
3064; AVX:       # %bb.0: # %entry
3065; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
3066; AVX-NEXT:    retq
3067;
3068; X86-SSE2-LABEL: load_sext_2i32_to_2i64:
3069; X86-SSE2:       # %bb.0: # %entry
3070; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3071; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3072; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
3073; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
3074; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3075; X86-SSE2-NEXT:    retl
3076;
3077; X86-SSE41-LABEL: load_sext_2i32_to_2i64:
3078; X86-SSE41:       # %bb.0: # %entry
3079; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3080; X86-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
3081; X86-SSE41-NEXT:    retl
3082entry:
3083 %X = load <2 x i32>, ptr %ptr
3084 %Y = sext <2 x i32> %X to <2 x i64>
3085 ret <2 x i64> %Y
3086}
3087
; Sign-extending load: reads a <4 x i32> vector from %ptr and widens it to
; <4 x i64>. In the checked output, targets without AVX return the 256-bit
; result as two halves in xmm0/xmm1. SSE4.1 uses two pmovsxdq loads at
; offsets 0 and 8, while SSE2 and SSSE3 build sign masks with pcmpgtd and
; interleave them with punpckldq. AVX1 extends each half separately and
; joins them with vinsertf128; AVX2 and AVX512 use one vpmovsxdq into ymm0.
3088define <4 x i64> @load_sext_4i32_to_4i64(ptr%ptr) {
3089; SSE2-LABEL: load_sext_4i32_to_4i64:
3090; SSE2:       # %bb.0: # %entry
3091; SSE2-NEXT:    movdqa (%rdi), %xmm0
3092; SSE2-NEXT:    pxor %xmm2, %xmm2
3093; SSE2-NEXT:    pxor %xmm3, %xmm3
3094; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3095; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3096; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3097; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3098; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3099; SSE2-NEXT:    retq
3100;
3101; SSSE3-LABEL: load_sext_4i32_to_4i64:
3102; SSSE3:       # %bb.0: # %entry
3103; SSSE3-NEXT:    movdqa (%rdi), %xmm0
3104; SSSE3-NEXT:    pxor %xmm2, %xmm2
3105; SSSE3-NEXT:    pxor %xmm3, %xmm3
3106; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
3107; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3108; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3109; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3110; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3111; SSSE3-NEXT:    retq
3112;
3113; SSE41-LABEL: load_sext_4i32_to_4i64:
3114; SSE41:       # %bb.0: # %entry
3115; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
3116; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
3117; SSE41-NEXT:    retq
3118;
3119; AVX1-LABEL: load_sext_4i32_to_4i64:
3120; AVX1:       # %bb.0: # %entry
3121; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm0
3122; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm1
3123; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3124; AVX1-NEXT:    retq
3125;
3126; AVX2-LABEL: load_sext_4i32_to_4i64:
3127; AVX2:       # %bb.0: # %entry
3128; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
3129; AVX2-NEXT:    retq
3130;
3131; AVX512-LABEL: load_sext_4i32_to_4i64:
3132; AVX512:       # %bb.0: # %entry
3133; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
3134; AVX512-NEXT:    retq
3135;
3136; X86-SSE2-LABEL: load_sext_4i32_to_4i64:
3137; X86-SSE2:       # %bb.0: # %entry
3138; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3139; X86-SSE2-NEXT:    movdqa (%eax), %xmm0
3140; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3141; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
3142; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3143; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3144; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3145; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3146; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3147; X86-SSE2-NEXT:    retl
3148;
3149; X86-SSE41-LABEL: load_sext_4i32_to_4i64:
3150; X86-SSE41:       # %bb.0: # %entry
3151; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3152; X86-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
3153; X86-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
3154; X86-SSE41-NEXT:    retl
3155entry:
3156 %X = load <4 x i32>, ptr %ptr
3157 %Y = sext <4 x i32> %X to <4 x i64>
3158 ret <4 x i64> %Y
3159}
3160
; Takes the two lowest bytes of the <16 x i8> argument (shuffle mask <0, 1>),
; sign-extends them to <2 x i16>, and returns the pair bitcast to a scalar
; i32. The checked output extends bytes to words in a vector register,
; using (v)pmovsxbw with SSE4.1 or AVX and punpcklbw + psraw $8 otherwise,
; then moves the low 32 bits to eax with (v)movd.
3161define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
3162; SSE2-LABEL: sext_2i8_to_i32:
3163; SSE2:       # %bb.0: # %entry
3164; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3165; SSE2-NEXT:    psraw $8, %xmm0
3166; SSE2-NEXT:    movd %xmm0, %eax
3167; SSE2-NEXT:    retq
3168;
3169; SSSE3-LABEL: sext_2i8_to_i32:
3170; SSSE3:       # %bb.0: # %entry
3171; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3172; SSSE3-NEXT:    psraw $8, %xmm0
3173; SSSE3-NEXT:    movd %xmm0, %eax
3174; SSSE3-NEXT:    retq
3175;
3176; SSE41-LABEL: sext_2i8_to_i32:
3177; SSE41:       # %bb.0: # %entry
3178; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
3179; SSE41-NEXT:    movd %xmm0, %eax
3180; SSE41-NEXT:    retq
3181;
3182; AVX-LABEL: sext_2i8_to_i32:
3183; AVX:       # %bb.0: # %entry
3184; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
3185; AVX-NEXT:    vmovd %xmm0, %eax
3186; AVX-NEXT:    retq
3187;
3188; X86-SSE2-LABEL: sext_2i8_to_i32:
3189; X86-SSE2:       # %bb.0: # %entry
3190; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3191; X86-SSE2-NEXT:    psraw $8, %xmm0
3192; X86-SSE2-NEXT:    movd %xmm0, %eax
3193; X86-SSE2-NEXT:    retl
3194;
3195; X86-SSE41-LABEL: sext_2i8_to_i32:
3196; X86-SSE41:       # %bb.0: # %entry
3197; X86-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
3198; X86-SSE41-NEXT:    movd %xmm0, %eax
3199; X86-SSE41-NEXT:    retl
3200entry:
3201  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
3202  %Ex = sext <2 x i8> %Shuf to <2 x i16>
3203  %Bc = bitcast <2 x i16> %Ex to i32
3204  ret i32 %Bc
3205}
3206
; Sign-extends a <4 x i1> mask argument to <4 x i64>, so each result lane is
; all-zeros or all-ones. In every checked configuration the output starts
; with (v)pslld $31 followed by (v)psrad $31, which replicates bit 0 of each
; 32-bit lane across that lane. The 32-bit lanes are then widened to 64 bits
; with (v)pmovsxdq where it is available, or with pcmpgtd + punpckldq on
; SSE2 and SSSE3.
3207define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
3208; SSE2-LABEL: sext_4i1_to_4i64:
3209; SSE2:       # %bb.0:
3210; SSE2-NEXT:    pslld $31, %xmm0
3211; SSE2-NEXT:    psrad $31, %xmm0
3212; SSE2-NEXT:    pxor %xmm2, %xmm2
3213; SSE2-NEXT:    pxor %xmm3, %xmm3
3214; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3215; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3216; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3217; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3218; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3219; SSE2-NEXT:    retq
3220;
3221; SSSE3-LABEL: sext_4i1_to_4i64:
3222; SSSE3:       # %bb.0:
3223; SSSE3-NEXT:    pslld $31, %xmm0
3224; SSSE3-NEXT:    psrad $31, %xmm0
3225; SSSE3-NEXT:    pxor %xmm2, %xmm2
3226; SSSE3-NEXT:    pxor %xmm3, %xmm3
3227; SSSE3-NEXT:    pcmpgtd %xmm0, %xmm3
3228; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3229; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3230; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3231; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3232; SSSE3-NEXT:    retq
3233;
3234; SSE41-LABEL: sext_4i1_to_4i64:
3235; SSE41:       # %bb.0:
3236; SSE41-NEXT:    pslld $31, %xmm0
3237; SSE41-NEXT:    psrad $31, %xmm0
3238; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
3239; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3240; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
3241; SSE41-NEXT:    movdqa %xmm2, %xmm0
3242; SSE41-NEXT:    retq
3243;
3244; AVX1-LABEL: sext_4i1_to_4i64:
3245; AVX1:       # %bb.0:
3246; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
3247; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
3248; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
3249; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3250; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
3251; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3252; AVX1-NEXT:    retq
3253;
3254; AVX2-LABEL: sext_4i1_to_4i64:
3255; AVX2:       # %bb.0:
3256; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
3257; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
3258; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
3259; AVX2-NEXT:    retq
3260;
3261; AVX512-LABEL: sext_4i1_to_4i64:
3262; AVX512:       # %bb.0:
3263; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
3264; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
3265; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
3266; AVX512-NEXT:    retq
3267;
3268; X86-SSE2-LABEL: sext_4i1_to_4i64:
3269; X86-SSE2:       # %bb.0:
3270; X86-SSE2-NEXT:    pslld $31, %xmm0
3271; X86-SSE2-NEXT:    psrad $31, %xmm0
3272; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3273; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
3274; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
3275; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3276; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3277; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3278; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3279; X86-SSE2-NEXT:    retl
3280;
3281; X86-SSE41-LABEL: sext_4i1_to_4i64:
3282; X86-SSE41:       # %bb.0:
3283; X86-SSE41-NEXT:    pslld $31, %xmm0
3284; X86-SSE41-NEXT:    psrad $31, %xmm0
3285; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
3286; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3287; X86-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
3288; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
3289; X86-SSE41-NEXT:    retl
3290  %extmask = sext <4 x i1> %mask to <4 x i64>
3291  ret <4 x i64> %extmask
3292}
3293
; Sign-extends a <4 x i8> argument (named %mask) to <4 x i64>. In the checked
; output, AVX2 and AVX512 use a single vpmovsxbq into ymm0. SSE4.1 and AVX1
; extend two bytes at a time with (v)pmovsxbq, shifting the source right by
; 16 bits to reach the upper pair. SSE2 and SSSE3 first widen bytes to
; 32-bit lanes (punpcklbw, punpcklwd, psrad $24) and then interleave each
; lane with a pcmpgtd sign mask.
3294define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
3295; SSE2-LABEL: sext_4i8_to_4i64:
3296; SSE2:       # %bb.0:
3297; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3298; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3299; SSE2-NEXT:    psrad $24, %xmm1
3300; SSE2-NEXT:    pxor %xmm2, %xmm2
3301; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3302; SSE2-NEXT:    movdqa %xmm1, %xmm0
3303; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3304; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3305; SSE2-NEXT:    retq
3306;
3307; SSSE3-LABEL: sext_4i8_to_4i64:
3308; SSSE3:       # %bb.0:
3309; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3310; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3311; SSSE3-NEXT:    psrad $24, %xmm1
3312; SSSE3-NEXT:    pxor %xmm2, %xmm2
3313; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
3314; SSSE3-NEXT:    movdqa %xmm1, %xmm0
3315; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3316; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3317; SSSE3-NEXT:    retq
3318;
3319; SSE41-LABEL: sext_4i8_to_4i64:
3320; SSE41:       # %bb.0:
3321; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
3322; SSE41-NEXT:    psrld $16, %xmm0
3323; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
3324; SSE41-NEXT:    movdqa %xmm2, %xmm0
3325; SSE41-NEXT:    retq
3326;
3327; AVX1-LABEL: sext_4i8_to_4i64:
3328; AVX1:       # %bb.0:
3329; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
3330; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
3331; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
3332; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3333; AVX1-NEXT:    retq
3334;
3335; AVX2-LABEL: sext_4i8_to_4i64:
3336; AVX2:       # %bb.0:
3337; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
3338; AVX2-NEXT:    retq
3339;
3340; AVX512-LABEL: sext_4i8_to_4i64:
3341; AVX512:       # %bb.0:
3342; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
3343; AVX512-NEXT:    retq
3344;
3345; X86-SSE2-LABEL: sext_4i8_to_4i64:
3346; X86-SSE2:       # %bb.0:
3347; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3348; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3349; X86-SSE2-NEXT:    psrad $24, %xmm1
3350; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
3351; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
3352; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
3353; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3354; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3355; X86-SSE2-NEXT:    retl
3356;
3357; X86-SSE41-LABEL: sext_4i8_to_4i64:
3358; X86-SSE41:       # %bb.0:
3359; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
3360; X86-SSE41-NEXT:    psrld $16, %xmm0
3361; X86-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
3362; X86-SSE41-NEXT:    movdqa %xmm2, %xmm0
3363; X86-SSE41-NEXT:    retl
3364  %extmask = sext <4 x i8> %mask to <4 x i64>
3365  ret <4 x i64> %extmask
3366}
3367
; Compares two <32 x i16> vectors for equality and sign-extends the
; <32 x i1> result to <32 x i8>, i.e. a wide compare narrowed to a byte mask.
; In the checked output, SSE, AVX1 and AVX2 compare with (v)pcmpeqw and
; narrow the 16-bit lanes to bytes with (v)packsswb. The avx512f-only run
; compares in ymm halves and narrows through vpmovzxwd + vpmovdb. With
; avx512bw the compare writes mask register k0, which vpmovm2b expands to
; bytes. On the 32-bit target part of the operands is read from the stack
; through an ebp frame that is realigned to 16 bytes.
3368define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
3369; SSE-LABEL: sext_32xi1_to_32xi8:
3370; SSE:       # %bb.0:
3371; SSE-NEXT:    pcmpeqw %xmm5, %xmm1
3372; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
3373; SSE-NEXT:    packsswb %xmm1, %xmm0
3374; SSE-NEXT:    pcmpeqw %xmm7, %xmm3
3375; SSE-NEXT:    pcmpeqw %xmm6, %xmm2
3376; SSE-NEXT:    packsswb %xmm3, %xmm2
3377; SSE-NEXT:    movdqa %xmm2, %xmm1
3378; SSE-NEXT:    retq
3379;
3380; AVX1-LABEL: sext_32xi1_to_32xi8:
3381; AVX1:       # %bb.0:
3382; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3383; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
3384; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
3385; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
3386; AVX1-NEXT:    vpacksswb %xmm4, %xmm1, %xmm1
3387; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
3388; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
3389; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm3
3390; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
3391; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
3392; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3393; AVX1-NEXT:    retq
3394;
3395; AVX2-LABEL: sext_32xi1_to_32xi8:
3396; AVX2:       # %bb.0:
3397; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
3398; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
3399; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
3400; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
3401; AVX2-NEXT:    retq
3402;
3403; AVX512F-LABEL: sext_32xi1_to_32xi8:
3404; AVX512F:       # %bb.0:
3405; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
3406; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
3407; AVX512F-NEXT:    vpcmpeqw %ymm2, %ymm3, %ymm2
3408; AVX512F-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
3409; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3410; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3411; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
3412; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
3413; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3414; AVX512F-NEXT:    retq
3415;
3416; AVX512BW-LABEL: sext_32xi1_to_32xi8:
3417; AVX512BW:       # %bb.0:
3418; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
3419; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
3420; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3421; AVX512BW-NEXT:    retq
3422;
3423; X86-SSE-LABEL: sext_32xi1_to_32xi8:
3424; X86-SSE:       # %bb.0:
3425; X86-SSE-NEXT:    pushl %ebp
3426; X86-SSE-NEXT:    movl %esp, %ebp
3427; X86-SSE-NEXT:    andl $-16, %esp
3428; X86-SSE-NEXT:    subl $16, %esp
3429; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
3430; X86-SSE-NEXT:    pcmpeqw 40(%ebp), %xmm1
3431; X86-SSE-NEXT:    pcmpeqw 24(%ebp), %xmm0
3432; X86-SSE-NEXT:    packsswb %xmm1, %xmm0
3433; X86-SSE-NEXT:    pcmpeqw 72(%ebp), %xmm3
3434; X86-SSE-NEXT:    pcmpeqw 56(%ebp), %xmm2
3435; X86-SSE-NEXT:    packsswb %xmm3, %xmm2
3436; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
3437; X86-SSE-NEXT:    movl %ebp, %esp
3438; X86-SSE-NEXT:    popl %ebp
3439; X86-SSE-NEXT:    retl
3440  %a = icmp eq <32 x i16> %c1, %c2
3441  %b = sext <32 x i1> %a to <32 x i8>
3442  ret <32 x i8> %b
3443}
3444
; Loads a <2 x i8> vector from %addr with byte alignment, sign-extends it to
; <2 x i32>, and returns the extended vector added to itself. In the checked
; output the two bytes are fetched with one 16-bit scalar load (movzwl) and
; moved into xmm0. They are then extended with (v)pmovsxbd when SSE4.1 or
; AVX is available, or with punpcklbw + punpcklwd + psrad $24 otherwise,
; before the final (v)paddd.
3445define <2 x i32> @sext_2i8_to_2i32(ptr %addr) {
3446; SSE2-LABEL: sext_2i8_to_2i32:
3447; SSE2:       # %bb.0:
3448; SSE2-NEXT:    movzwl (%rdi), %eax
3449; SSE2-NEXT:    movd %eax, %xmm0
3450; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3451; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3452; SSE2-NEXT:    psrad $24, %xmm0
3453; SSE2-NEXT:    paddd %xmm0, %xmm0
3454; SSE2-NEXT:    retq
3455;
3456; SSSE3-LABEL: sext_2i8_to_2i32:
3457; SSSE3:       # %bb.0:
3458; SSSE3-NEXT:    movzwl (%rdi), %eax
3459; SSSE3-NEXT:    movd %eax, %xmm0
3460; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3461; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3462; SSSE3-NEXT:    psrad $24, %xmm0
3463; SSSE3-NEXT:    paddd %xmm0, %xmm0
3464; SSSE3-NEXT:    retq
3465;
3466; SSE41-LABEL: sext_2i8_to_2i32:
3467; SSE41:       # %bb.0:
3468; SSE41-NEXT:    movzwl (%rdi), %eax
3469; SSE41-NEXT:    movd %eax, %xmm0
3470; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
3471; SSE41-NEXT:    paddd %xmm0, %xmm0
3472; SSE41-NEXT:    retq
3473;
3474; AVX-LABEL: sext_2i8_to_2i32:
3475; AVX:       # %bb.0:
3476; AVX-NEXT:    movzwl (%rdi), %eax
3477; AVX-NEXT:    vmovd %eax, %xmm0
3478; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
3479; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
3480; AVX-NEXT:    retq
3481;
3482; X86-SSE2-LABEL: sext_2i8_to_2i32:
3483; X86-SSE2:       # %bb.0:
3484; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
3485; X86-SSE2-NEXT:    movzwl (%eax), %eax
3486; X86-SSE2-NEXT:    movd %eax, %xmm0
3487; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3488; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
3489; X86-SSE2-NEXT:    psrad $24, %xmm0
3490; X86-SSE2-NEXT:    paddd %xmm0, %xmm0
3491; X86-SSE2-NEXT:    retl
3492;
3493; X86-SSE41-LABEL: sext_2i8_to_2i32:
3494; X86-SSE41:       # %bb.0:
3495; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3496; X86-SSE41-NEXT:    movzwl (%eax), %eax
3497; X86-SSE41-NEXT:    movd %eax, %xmm0
3498; X86-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
3499; X86-SSE41-NEXT:    paddd %xmm0, %xmm0
3500; X86-SSE41-NEXT:    retl
3501  %x = load <2 x i8>, ptr %addr, align 1
3502  %y = sext <2 x i8> %x to <2 x i32>
3503  %z = add <2 x i32>%y, %y
3504  ret <2 x i32>%z
3505}
3506
; Loads a <4 x i17> vector (element width is not a power of two) from %ptr
; and sign-extends it to <4 x i32>. The checked output does not use a vector
; extend. The packed data is read with scalar loads, each element is moved
; into position with shifts (shrq or shldl, plus an orl for the element that
; spans the first 8 bytes and the dword at offset 8), and it is sign-extended
; in a general-purpose register with a shll/sarl pair. Most pairs use a
; count of 15, which is 32 - 17. The four scalars are then assembled into
; xmm0 with (v)pinsrd when SSE4.1 or AVX is available, or with movd +
; punpckldq + punpcklqdq otherwise.
3507define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) {
3508; SSE2-LABEL: sext_4i17_to_4i32:
3509; SSE2:       # %bb.0:
3510; SSE2-NEXT:    movq (%rdi), %rax
3511; SSE2-NEXT:    movl %eax, %ecx
3512; SSE2-NEXT:    shll $15, %ecx
3513; SSE2-NEXT:    sarl $15, %ecx
3514; SSE2-NEXT:    movd %ecx, %xmm0
3515; SSE2-NEXT:    movq %rax, %rcx
3516; SSE2-NEXT:    shrq $17, %rcx
3517; SSE2-NEXT:    shll $15, %ecx
3518; SSE2-NEXT:    sarl $15, %ecx
3519; SSE2-NEXT:    movd %ecx, %xmm1
3520; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3521; SSE2-NEXT:    movl 8(%rdi), %ecx
3522; SSE2-NEXT:    shll $13, %ecx
3523; SSE2-NEXT:    movq %rax, %rdx
3524; SSE2-NEXT:    shrq $51, %rdx
3525; SSE2-NEXT:    orl %ecx, %edx
3526; SSE2-NEXT:    shll $15, %edx
3527; SSE2-NEXT:    sarl $15, %edx
3528; SSE2-NEXT:    movd %edx, %xmm1
3529; SSE2-NEXT:    shrq $34, %rax
3530; SSE2-NEXT:    shll $15, %eax
3531; SSE2-NEXT:    sarl $15, %eax
3532; SSE2-NEXT:    movd %eax, %xmm2
3533; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3534; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3535; SSE2-NEXT:    retq
3536;
3537; SSSE3-LABEL: sext_4i17_to_4i32:
3538; SSSE3:       # %bb.0:
3539; SSSE3-NEXT:    movq (%rdi), %rax
3540; SSSE3-NEXT:    movl %eax, %ecx
3541; SSSE3-NEXT:    shll $15, %ecx
3542; SSSE3-NEXT:    sarl $15, %ecx
3543; SSSE3-NEXT:    movd %ecx, %xmm0
3544; SSSE3-NEXT:    movq %rax, %rcx
3545; SSSE3-NEXT:    shrq $17, %rcx
3546; SSSE3-NEXT:    shll $15, %ecx
3547; SSSE3-NEXT:    sarl $15, %ecx
3548; SSSE3-NEXT:    movd %ecx, %xmm1
3549; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3550; SSSE3-NEXT:    movl 8(%rdi), %ecx
3551; SSSE3-NEXT:    shll $13, %ecx
3552; SSSE3-NEXT:    movq %rax, %rdx
3553; SSSE3-NEXT:    shrq $51, %rdx
3554; SSSE3-NEXT:    orl %ecx, %edx
3555; SSSE3-NEXT:    shll $15, %edx
3556; SSSE3-NEXT:    sarl $15, %edx
3557; SSSE3-NEXT:    movd %edx, %xmm1
3558; SSSE3-NEXT:    shrq $34, %rax
3559; SSSE3-NEXT:    shll $15, %eax
3560; SSSE3-NEXT:    sarl $15, %eax
3561; SSSE3-NEXT:    movd %eax, %xmm2
3562; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3563; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3564; SSSE3-NEXT:    retq
3565;
3566; SSE41-LABEL: sext_4i17_to_4i32:
3567; SSE41:       # %bb.0:
3568; SSE41-NEXT:    movq (%rdi), %rax
3569; SSE41-NEXT:    movq %rax, %rcx
3570; SSE41-NEXT:    shrq $17, %rcx
3571; SSE41-NEXT:    shll $15, %ecx
3572; SSE41-NEXT:    sarl $15, %ecx
3573; SSE41-NEXT:    movl %eax, %edx
3574; SSE41-NEXT:    shll $15, %edx
3575; SSE41-NEXT:    sarl $15, %edx
3576; SSE41-NEXT:    movd %edx, %xmm0
3577; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
3578; SSE41-NEXT:    movq %rax, %rcx
3579; SSE41-NEXT:    shrq $34, %rcx
3580; SSE41-NEXT:    shll $15, %ecx
3581; SSE41-NEXT:    sarl $15, %ecx
3582; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
3583; SSE41-NEXT:    movl 8(%rdi), %ecx
3584; SSE41-NEXT:    shll $13, %ecx
3585; SSE41-NEXT:    shrq $51, %rax
3586; SSE41-NEXT:    orl %ecx, %eax
3587; SSE41-NEXT:    shll $15, %eax
3588; SSE41-NEXT:    sarl $15, %eax
3589; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
3590; SSE41-NEXT:    retq
3591;
3592; AVX-LABEL: sext_4i17_to_4i32:
3593; AVX:       # %bb.0:
3594; AVX-NEXT:    movq (%rdi), %rax
3595; AVX-NEXT:    movq %rax, %rcx
3596; AVX-NEXT:    shrq $17, %rcx
3597; AVX-NEXT:    shll $15, %ecx
3598; AVX-NEXT:    sarl $15, %ecx
3599; AVX-NEXT:    movl %eax, %edx
3600; AVX-NEXT:    shll $15, %edx
3601; AVX-NEXT:    sarl $15, %edx
3602; AVX-NEXT:    vmovd %edx, %xmm0
3603; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
3604; AVX-NEXT:    movq %rax, %rcx
3605; AVX-NEXT:    shrq $34, %rcx
3606; AVX-NEXT:    shll $15, %ecx
3607; AVX-NEXT:    sarl $15, %ecx
3608; AVX-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
3609; AVX-NEXT:    movl 8(%rdi), %ecx
3610; AVX-NEXT:    shll $13, %ecx
3611; AVX-NEXT:    shrq $51, %rax
3612; AVX-NEXT:    orl %ecx, %eax
3613; AVX-NEXT:    shll $15, %eax
3614; AVX-NEXT:    sarl $15, %eax
3615; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
3616; AVX-NEXT:    retq
3617;
3618; X86-SSE2-LABEL: sext_4i17_to_4i32:
3619; X86-SSE2:       # %bb.0:
3620; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
3621; X86-SSE2-NEXT:    movl (%edx), %ecx
3622; X86-SSE2-NEXT:    movl 4(%edx), %eax
3623; X86-SSE2-NEXT:    movl 8(%edx), %edx
3624; X86-SSE2-NEXT:    shldl $13, %eax, %edx
3625; X86-SSE2-NEXT:    shll $15, %edx
3626; X86-SSE2-NEXT:    sarl $15, %edx
3627; X86-SSE2-NEXT:    movd %edx, %xmm0
3628; X86-SSE2-NEXT:    movl %eax, %edx
3629; X86-SSE2-NEXT:    shll $13, %edx
3630; X86-SSE2-NEXT:    sarl $15, %edx
3631; X86-SSE2-NEXT:    movd %edx, %xmm1
3632; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3633; X86-SSE2-NEXT:    shldl $15, %ecx, %eax
3634; X86-SSE2-NEXT:    shll $15, %ecx
3635; X86-SSE2-NEXT:    sarl $15, %ecx
3636; X86-SSE2-NEXT:    movd %ecx, %xmm0
3637; X86-SSE2-NEXT:    shll $15, %eax
3638; X86-SSE2-NEXT:    sarl $15, %eax
3639; X86-SSE2-NEXT:    movd %eax, %xmm2
3640; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3641; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3642; X86-SSE2-NEXT:    retl
3643;
3644; X86-SSE41-LABEL: sext_4i17_to_4i32:
3645; X86-SSE41:       # %bb.0:
3646; X86-SSE41-NEXT:    pushl %esi
3647; X86-SSE41-NEXT:    .cfi_def_cfa_offset 8
3648; X86-SSE41-NEXT:    .cfi_offset %esi, -8
3649; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %edx
3650; X86-SSE41-NEXT:    movl (%edx), %eax
3651; X86-SSE41-NEXT:    movl 4(%edx), %ecx
3652; X86-SSE41-NEXT:    movl %ecx, %esi
3653; X86-SSE41-NEXT:    movl 8(%edx), %edx
3654; X86-SSE41-NEXT:    shldl $13, %ecx, %edx
3655; X86-SSE41-NEXT:    shldl $15, %eax, %ecx
3656; X86-SSE41-NEXT:    shll $15, %ecx
3657; X86-SSE41-NEXT:    sarl $15, %ecx
3658; X86-SSE41-NEXT:    shll $15, %eax
3659; X86-SSE41-NEXT:    sarl $15, %eax
3660; X86-SSE41-NEXT:    movd %eax, %xmm0
3661; X86-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
3662; X86-SSE41-NEXT:    shll $13, %esi
3663; X86-SSE41-NEXT:    sarl $15, %esi
3664; X86-SSE41-NEXT:    pinsrd $2, %esi, %xmm0
3665; X86-SSE41-NEXT:    shll $15, %edx
3666; X86-SSE41-NEXT:    sarl $15, %edx
3667; X86-SSE41-NEXT:    pinsrd $3, %edx, %xmm0
3668; X86-SSE41-NEXT:    popl %esi
3669; X86-SSE41-NEXT:    .cfi_def_cfa_offset 4
3670; X86-SSE41-NEXT:    retl
3671  %a = load <4 x i17>, ptr %ptr
3672  %b = sext <4 x i17> %a to <4 x i32>
3673  ret <4 x i32> %b
3674}
3675
3676define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
3677; SSE2-LABEL: sext_8i6_to_8i64:
3678; SSE2:       # %bb.0: # %entry
3679; SSE2-NEXT:    movd %edi, %xmm0
3680; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3681; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3682; SSE2-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
3683; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3684; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3685; SSE2-NEXT:    psllq $58, %xmm0
3686; SSE2-NEXT:    movdqa %xmm0, %xmm1
3687; SSE2-NEXT:    psrad $31, %xmm1
3688; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3689; SSE2-NEXT:    psrad $26, %xmm0
3690; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3691; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3692; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3693; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3694; SSE2-NEXT:    psllq $58, %xmm1
3695; SSE2-NEXT:    movdqa %xmm1, %xmm2
3696; SSE2-NEXT:    psrad $31, %xmm2
3697; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3698; SSE2-NEXT:    psrad $26, %xmm1
3699; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3700; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3701; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3702; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3703; SSE2-NEXT:    psllq $58, %xmm2
3704; SSE2-NEXT:    movdqa %xmm2, %xmm4
3705; SSE2-NEXT:    psrad $31, %xmm4
3706; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3707; SSE2-NEXT:    psrad $26, %xmm2
3708; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3709; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3710; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3711; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3712; SSE2-NEXT:    psllq $58, %xmm3
3713; SSE2-NEXT:    movdqa %xmm3, %xmm4
3714; SSE2-NEXT:    psrad $31, %xmm4
3715; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3716; SSE2-NEXT:    psrad $26, %xmm3
3717; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3718; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3719; SSE2-NEXT:    retq
3720;
3721; SSSE3-LABEL: sext_8i6_to_8i64:
3722; SSSE3:       # %bb.0: # %entry
3723; SSSE3-NEXT:    movd %edi, %xmm0
3724; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3725; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3726; SSSE3-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
3727; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3728; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3729; SSSE3-NEXT:    psllq $58, %xmm0
3730; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3731; SSSE3-NEXT:    psrad $31, %xmm1
3732; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3733; SSSE3-NEXT:    psrad $26, %xmm0
3734; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3735; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3736; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3737; SSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3738; SSSE3-NEXT:    psllq $58, %xmm1
3739; SSSE3-NEXT:    movdqa %xmm1, %xmm2
3740; SSSE3-NEXT:    psrad $31, %xmm2
3741; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3742; SSSE3-NEXT:    psrad $26, %xmm1
3743; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3744; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3745; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3746; SSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3747; SSSE3-NEXT:    psllq $58, %xmm2
3748; SSSE3-NEXT:    movdqa %xmm2, %xmm4
3749; SSSE3-NEXT:    psrad $31, %xmm4
3750; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3751; SSSE3-NEXT:    psrad $26, %xmm2
3752; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3753; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3754; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3755; SSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3756; SSSE3-NEXT:    psllq $58, %xmm3
3757; SSSE3-NEXT:    movdqa %xmm3, %xmm4
3758; SSSE3-NEXT:    psrad $31, %xmm4
3759; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3760; SSSE3-NEXT:    psrad $26, %xmm3
3761; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3762; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3763; SSSE3-NEXT:    retq
3764;
3765; SSE41-LABEL: sext_8i6_to_8i64:
3766; SSE41:       # %bb.0: # %entry
3767; SSE41-NEXT:    movd %edi, %xmm0
3768; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3769; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3770; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
3771; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3772; SSE41-NEXT:    psllq $58, %xmm0
3773; SSE41-NEXT:    movdqa %xmm0, %xmm1
3774; SSE41-NEXT:    psrad $31, %xmm1
3775; SSE41-NEXT:    psrad $26, %xmm0
3776; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3777; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3778; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3779; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3780; SSE41-NEXT:    psllq $58, %xmm1
3781; SSE41-NEXT:    movdqa %xmm1, %xmm2
3782; SSE41-NEXT:    psrad $31, %xmm2
3783; SSE41-NEXT:    psrad $26, %xmm1
3784; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3785; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3786; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
3787; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3788; SSE41-NEXT:    psllq $58, %xmm2
3789; SSE41-NEXT:    movdqa %xmm2, %xmm4
3790; SSE41-NEXT:    psrad $31, %xmm4
3791; SSE41-NEXT:    psrad $26, %xmm2
3792; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3793; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
3794; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3795; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3796; SSE41-NEXT:    psllq $58, %xmm3
3797; SSE41-NEXT:    movdqa %xmm3, %xmm4
3798; SSE41-NEXT:    psrad $31, %xmm4
3799; SSE41-NEXT:    psrad $26, %xmm3
3800; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3801; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
3802; SSE41-NEXT:    retq
3803;
3804; AVX1-LABEL: sext_8i6_to_8i64:
3805; AVX1:       # %bb.0: # %entry
3806; AVX1-NEXT:    vmovd %edi, %xmm0
3807; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3808; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3809; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3810; AVX1-NEXT:    vpsllw $10, %xmm0, %xmm0
3811; AVX1-NEXT:    vpsraw $10, %xmm0, %xmm1
3812; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm0
3813; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
3814; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
3815; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3816; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
3817; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
3818; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
3819; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
3820; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
3821; AVX1-NEXT:    retq
3822;
3823; AVX2-LABEL: sext_8i6_to_8i64:
3824; AVX2:       # %bb.0: # %entry
3825; AVX2-NEXT:    vmovd %edi, %xmm0
3826; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
3827; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3828; AVX2-NEXT:    vpsllw $10, %xmm0, %xmm0
3829; AVX2-NEXT:    vpsraw $10, %xmm0, %xmm1
3830; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm0
3831; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3832; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
3833; AVX2-NEXT:    retq
3834;
3835; AVX512-LABEL: sext_8i6_to_8i64:
3836; AVX512:       # %bb.0: # %entry
3837; AVX512-NEXT:    vmovd %edi, %xmm0
3838; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
3839; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3840; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3841; AVX512-NEXT:    vpsllq $58, %zmm0, %zmm0
3842; AVX512-NEXT:    vpsraq $58, %zmm0, %zmm0
3843; AVX512-NEXT:    retq
3844;
3845; X86-SSE2-LABEL: sext_8i6_to_8i64:
3846; X86-SSE2:       # %bb.0: # %entry
3847; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3848; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3849; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3850; X86-SSE2-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
3851; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
3852; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3853; X86-SSE2-NEXT:    psllq $58, %xmm0
3854; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
3855; X86-SSE2-NEXT:    psrad $31, %xmm1
3856; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3857; X86-SSE2-NEXT:    psrad $26, %xmm0
3858; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
3859; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3860; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3861; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3862; X86-SSE2-NEXT:    psllq $58, %xmm1
3863; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
3864; X86-SSE2-NEXT:    psrad $31, %xmm2
3865; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3866; X86-SSE2-NEXT:    psrad $26, %xmm1
3867; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
3868; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3869; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
3870; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
3871; X86-SSE2-NEXT:    psllq $58, %xmm2
3872; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
3873; X86-SSE2-NEXT:    psrad $31, %xmm4
3874; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3875; X86-SSE2-NEXT:    psrad $26, %xmm2
3876; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
3877; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
3878; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3879; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
3880; X86-SSE2-NEXT:    psllq $58, %xmm3
3881; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
3882; X86-SSE2-NEXT:    psrad $31, %xmm4
3883; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
3884; X86-SSE2-NEXT:    psrad $26, %xmm3
3885; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
3886; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3887; X86-SSE2-NEXT:    retl
3888;
3889; X86-SSE41-LABEL: sext_8i6_to_8i64:
3890; X86-SSE41:       # %bb.0: # %entry
3891; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3892; X86-SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3893; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3894; X86-SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
3895; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3896; X86-SSE41-NEXT:    psllq $58, %xmm0
3897; X86-SSE41-NEXT:    movdqa %xmm0, %xmm1
3898; X86-SSE41-NEXT:    psrad $31, %xmm1
3899; X86-SSE41-NEXT:    psrad $26, %xmm0
3900; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
3901; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
3902; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
3903; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3904; X86-SSE41-NEXT:    psllq $58, %xmm1
3905; X86-SSE41-NEXT:    movdqa %xmm1, %xmm2
3906; X86-SSE41-NEXT:    psrad $31, %xmm2
3907; X86-SSE41-NEXT:    psrad $26, %xmm1
3908; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
3909; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3910; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
3911; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
3912; X86-SSE41-NEXT:    psllq $58, %xmm2
3913; X86-SSE41-NEXT:    movdqa %xmm2, %xmm4
3914; X86-SSE41-NEXT:    psrad $31, %xmm4
3915; X86-SSE41-NEXT:    psrad $26, %xmm2
3916; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
3917; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
3918; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
3919; X86-SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3920; X86-SSE41-NEXT:    psllq $58, %xmm3
3921; X86-SSE41-NEXT:    movdqa %xmm3, %xmm4
3922; X86-SSE41-NEXT:    psrad $31, %xmm4
3923; X86-SSE41-NEXT:    psrad $26, %xmm3
3924; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
3925; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
3926; X86-SSE41-NEXT:    retl
3927entry:
3928  %a = trunc i32 %x to i6
3929  %b = insertelement <8 x i6> undef, i6 %a, i32 0
3930  %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer
3931  %d = add <8 x i6> %c, <i6 0, i6 1, i6 2, i6 3, i6 4, i6 5, i6 6, i6 7>
3932  %e = sext <8 x i6> %d to <8 x i64>
3933  ret <8 x i64> %e
3934}
3935
; Test input for zext_negate_sext - zero-extend <8 x i8> to <8 x i16>, negate
; the result (0 - z, flagged nsw), then sign-extend to <8 x i32>.
;
; What the autogenerated checks below expect:
;  - SSE-level runs (64-bit and 32-bit) keep the negate at 16 bits (psubw)
;    and then sign-extend to 32 bits (psrad $16 or pmovsxwd).
;  - AVX-level runs zero-extend the bytes straight to 32 bits (vpmovzxbd)
;    and negate there (vpsubd), with no separate sign-extension step.
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
3936define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
3937; SSE2-LABEL: zext_negate_sext:
3938; SSE2:       # %bb.0:
3939; SSE2-NEXT:    pxor %xmm1, %xmm1
3940; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3941; SSE2-NEXT:    psubw %xmm0, %xmm1
3942; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3943; SSE2-NEXT:    psrad $16, %xmm0
3944; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3945; SSE2-NEXT:    psrad $16, %xmm1
3946; SSE2-NEXT:    retq
3947;
3948; SSSE3-LABEL: zext_negate_sext:
3949; SSSE3:       # %bb.0:
3950; SSSE3-NEXT:    pxor %xmm1, %xmm1
3951; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3952; SSSE3-NEXT:    psubw %xmm0, %xmm1
3953; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3954; SSSE3-NEXT:    psrad $16, %xmm0
3955; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
3956; SSSE3-NEXT:    psrad $16, %xmm1
3957; SSSE3-NEXT:    retq
3958;
3959; SSE41-LABEL: zext_negate_sext:
3960; SSE41:       # %bb.0:
3961; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3962; SSE41-NEXT:    pxor %xmm1, %xmm1
3963; SSE41-NEXT:    psubw %xmm0, %xmm1
3964; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
3965; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
3966; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
3967; SSE41-NEXT:    retq
3968;
3969; AVX1-LABEL: zext_negate_sext:
3970; AVX1:       # %bb.0:
3971; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
3972; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
3973; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
3974; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
3975; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3976; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
3977; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3978; AVX1-NEXT:    retq
3979;
3980; AVX2-LABEL: zext_negate_sext:
3981; AVX2:       # %bb.0:
3982; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3983; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3984; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
3985; AVX2-NEXT:    retq
3986;
3987; AVX512-LABEL: zext_negate_sext:
3988; AVX512:       # %bb.0:
3989; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3990; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3991; AVX512-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
3992; AVX512-NEXT:    retq
3993;
3994; X86-SSE2-LABEL: zext_negate_sext:
3995; X86-SSE2:       # %bb.0:
3996; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
3997; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3998; X86-SSE2-NEXT:    psubw %xmm0, %xmm1
3999; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4000; X86-SSE2-NEXT:    psrad $16, %xmm0
4001; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4002; X86-SSE2-NEXT:    psrad $16, %xmm1
4003; X86-SSE2-NEXT:    retl
4004;
4005; X86-SSE41-LABEL: zext_negate_sext:
4006; X86-SSE41:       # %bb.0:
4007; X86-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
4008; X86-SSE41-NEXT:    pxor %xmm1, %xmm1
4009; X86-SSE41-NEXT:    psubw %xmm0, %xmm1
4010; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
4011; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4012; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
4013; X86-SSE41-NEXT:    retl
  ; Unsigned widening of an i8 - every lane of %z is in [0, 255].
4014  %z = zext <8 x i8> %x to <8 x i16>
  ; 0 - %z therefore lies in [-255, 0], which fits in i16; this is consistent
  ; with the nsw flag on the subtraction.
4015  %neg = sub nsw <8 x i16> zeroinitializer, %z
  ; Signed widening of the (non-positive) i16 lanes to the i32 return type.
4016  %r = sext <8 x i16> %neg to <8 x i32>
4017  ret <8 x i32> %r
4018}
4019
; Test input for zext_decremenet_sext - zero-extend <8 x i8> to <8 x i16>,
; add -1 to every lane (no nsw/nuw flags on the add), then sign-extend to
; <8 x i32>.
;
; What the autogenerated checks below expect:
;  - SSE-level runs (64-bit and 32-bit) build an all-ones register (pcmpeqd
;    of a register with itself), add it at 16 bits (paddw), and then
;    sign-extend to 32 bits (psrad $16 or pmovsxwd).
;  - AVX-level runs zero-extend the bytes straight to 32 bits (vpmovzxbd)
;    and add the all-ones vector there (vpaddd).
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
;
; NOTE(review): "decremenet" looks like a misspelling of "decrement". The
; name is left untouched because it is the symbol every -LABEL line matches.
4020define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
4021; SSE2-LABEL: zext_decremenet_sext:
4022; SSE2:       # %bb.0:
4023; SSE2-NEXT:    pxor %xmm1, %xmm1
4024; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4025; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
4026; SSE2-NEXT:    paddw %xmm0, %xmm1
4027; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4028; SSE2-NEXT:    psrad $16, %xmm0
4029; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4030; SSE2-NEXT:    psrad $16, %xmm1
4031; SSE2-NEXT:    retq
4032;
4033; SSSE3-LABEL: zext_decremenet_sext:
4034; SSSE3:       # %bb.0:
4035; SSSE3-NEXT:    pxor %xmm1, %xmm1
4036; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4037; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
4038; SSSE3-NEXT:    paddw %xmm0, %xmm1
4039; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4040; SSSE3-NEXT:    psrad $16, %xmm0
4041; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4042; SSSE3-NEXT:    psrad $16, %xmm1
4043; SSSE3-NEXT:    retq
4044;
4045; SSE41-LABEL: zext_decremenet_sext:
4046; SSE41:       # %bb.0:
4047; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
4048; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
4049; SSE41-NEXT:    paddw %xmm0, %xmm1
4050; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
4051; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4052; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
4053; SSE41-NEXT:    retq
4054;
4055; AVX1-LABEL: zext_decremenet_sext:
4056; AVX1:       # %bb.0:
4057; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
4058; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
4059; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
4060; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
4061; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4062; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
4063; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4064; AVX1-NEXT:    retq
4065;
4066; AVX2-LABEL: zext_decremenet_sext:
4067; AVX2:       # %bb.0:
4068; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
4069; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
4070; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4071; AVX2-NEXT:    retq
4072;
4073; AVX512-LABEL: zext_decremenet_sext:
4074; AVX512:       # %bb.0:
4075; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
4076; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
4077; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
4078; AVX512-NEXT:    retq
4079;
4080; X86-SSE2-LABEL: zext_decremenet_sext:
4081; X86-SSE2:       # %bb.0:
4082; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
4083; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4084; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
4085; X86-SSE2-NEXT:    paddw %xmm0, %xmm1
4086; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4087; X86-SSE2-NEXT:    psrad $16, %xmm0
4088; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
4089; X86-SSE2-NEXT:    psrad $16, %xmm1
4090; X86-SSE2-NEXT:    retl
4091;
4092; X86-SSE41-LABEL: zext_decremenet_sext:
4093; X86-SSE41:       # %bb.0:
4094; X86-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
4095; X86-SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
4096; X86-SSE41-NEXT:    paddw %xmm0, %xmm1
4097; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
4098; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
4099; X86-SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
4100; X86-SSE41-NEXT:    retl
  ; Unsigned widening of an i8 - every lane of %z is in [0, 255].
4101  %z = zext <8 x i8> %x to <8 x i16>
  ; %z + (-1) lies in [-1, 254]; a zero input byte yields -1, which is the
  ; lane value where the following sign extension matters.
4102  %dec = add <8 x i16> %z, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  ; Signed widening of the i16 lanes to the i32 return type.
4103  %r = sext <8 x i16> %dec to <8 x i32>
4104  ret <8 x i32> %r
4105}
4106