1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
9;
10; Just one 32-bit run to make sure we do reasonable things there.
11; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
12
13define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
14; SSE2-LABEL: sext_16i8_to_8i16:
15; SSE2:       # BB#0: # %entry
16; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
17; SSE2-NEXT:    psraw $8, %xmm0
18; SSE2-NEXT:    retq
19;
20; SSSE3-LABEL: sext_16i8_to_8i16:
21; SSSE3:       # BB#0: # %entry
22; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
23; SSSE3-NEXT:    psraw $8, %xmm0
24; SSSE3-NEXT:    retq
25;
26; SSE41-LABEL: sext_16i8_to_8i16:
27; SSE41:       # BB#0: # %entry
28; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
29; SSE41-NEXT:    retq
30;
31; AVX-LABEL: sext_16i8_to_8i16:
32; AVX:       # BB#0: # %entry
33; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
34; AVX-NEXT:    retq
35;
36; X32-SSE41-LABEL: sext_16i8_to_8i16:
37; X32-SSE41:       # BB#0: # %entry
38; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
39; X32-SSE41-NEXT:    retl
40entry:
41  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
42  %C = sext <8 x i8> %B to <8 x i16>
43  ret <8 x i16> %C
44}
45
46define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
47; SSE2-LABEL: sext_16i8_to_16i16:
48; SSE2:       # BB#0: # %entry
49; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
50; SSE2-NEXT:    psraw $8, %xmm2
51; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
52; SSE2-NEXT:    psraw $8, %xmm1
53; SSE2-NEXT:    movdqa %xmm2, %xmm0
54; SSE2-NEXT:    retq
55;
56; SSSE3-LABEL: sext_16i8_to_16i16:
57; SSSE3:       # BB#0: # %entry
58; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
59; SSSE3-NEXT:    psraw $8, %xmm2
60; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
61; SSSE3-NEXT:    psraw $8, %xmm1
62; SSSE3-NEXT:    movdqa %xmm2, %xmm0
63; SSSE3-NEXT:    retq
64;
65; SSE41-LABEL: sext_16i8_to_16i16:
66; SSE41:       # BB#0: # %entry
67; SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
68; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
69; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
70; SSE41-NEXT:    movdqa %xmm2, %xmm0
71; SSE41-NEXT:    retq
72;
73; AVX1-LABEL: sext_16i8_to_16i16:
74; AVX1:       # BB#0: # %entry
75; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
76; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
77; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
78; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
79; AVX1-NEXT:    retq
80;
81; AVX2-LABEL: sext_16i8_to_16i16:
82; AVX2:       # BB#0: # %entry
83; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
84; AVX2-NEXT:    retq
85;
86; AVX512-LABEL: sext_16i8_to_16i16:
87; AVX512:       # BB#0: # %entry
88; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
89; AVX512-NEXT:    retq
90;
91; X32-SSE41-LABEL: sext_16i8_to_16i16:
92; X32-SSE41:       # BB#0: # %entry
93; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm2
94; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
95; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
96; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
97; X32-SSE41-NEXT:    retl
98entry:
99  %B = sext <16 x i8> %A to <16 x i16>
100  ret <16 x i16> %B
101}
102
103define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
104; SSE2-LABEL: sext_32i8_to_32i16:
105; SSE2:       # BB#0: # %entry
106; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
107; SSE2-NEXT:    psraw $8, %xmm4
108; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
109; SSE2-NEXT:    psraw $8, %xmm5
110; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
111; SSE2-NEXT:    psraw $8, %xmm2
112; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
113; SSE2-NEXT:    psraw $8, %xmm3
114; SSE2-NEXT:    movdqa %xmm4, %xmm0
115; SSE2-NEXT:    movdqa %xmm5, %xmm1
116; SSE2-NEXT:    retq
117;
118; SSSE3-LABEL: sext_32i8_to_32i16:
119; SSSE3:       # BB#0: # %entry
120; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
121; SSSE3-NEXT:    psraw $8, %xmm4
122; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
123; SSSE3-NEXT:    psraw $8, %xmm5
124; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
125; SSSE3-NEXT:    psraw $8, %xmm2
126; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
127; SSSE3-NEXT:    psraw $8, %xmm3
128; SSSE3-NEXT:    movdqa %xmm4, %xmm0
129; SSSE3-NEXT:    movdqa %xmm5, %xmm1
130; SSSE3-NEXT:    retq
131;
132; SSE41-LABEL: sext_32i8_to_32i16:
133; SSE41:       # BB#0: # %entry
134; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
135; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
136; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
137; SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
138; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
139; SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
140; SSE41-NEXT:    movdqa %xmm5, %xmm0
141; SSE41-NEXT:    movdqa %xmm4, %xmm1
142; SSE41-NEXT:    retq
143;
144; AVX1-LABEL: sext_32i8_to_32i16:
145; AVX1:       # BB#0: # %entry
146; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
147; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
148; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
149; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
150; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
151; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
152; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
153; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
154; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
155; AVX1-NEXT:    vmovaps %ymm2, %ymm0
156; AVX1-NEXT:    retq
157;
158; AVX2-LABEL: sext_32i8_to_32i16:
159; AVX2:       # BB#0: # %entry
160; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
161; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
162; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
163; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
164; AVX2-NEXT:    retq
165;
166; AVX512F-LABEL: sext_32i8_to_32i16:
167; AVX512F:       # BB#0: # %entry
168; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm2
169; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
170; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
171; AVX512F-NEXT:    vmovdqa %ymm2, %ymm0
172; AVX512F-NEXT:    retq
173;
174; AVX512BW-LABEL: sext_32i8_to_32i16:
175; AVX512BW:       # BB#0: # %entry
176; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
177; AVX512BW-NEXT:    retq
178;
179; X32-SSE41-LABEL: sext_32i8_to_32i16:
180; X32-SSE41:       # BB#0: # %entry
181; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
182; X32-SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
183; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
184; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm4
185; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
186; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm3
187; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
188; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
189; X32-SSE41-NEXT:    retl
190entry:
191  %B = sext <32 x i8> %A to <32 x i16>
192  ret <32 x i16> %B
193}
194
195define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
196; SSE2-LABEL: sext_16i8_to_4i32:
197; SSE2:       # BB#0: # %entry
198; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
199; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
200; SSE2-NEXT:    psrad $24, %xmm0
201; SSE2-NEXT:    retq
202;
203; SSSE3-LABEL: sext_16i8_to_4i32:
204; SSSE3:       # BB#0: # %entry
205; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
206; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
207; SSSE3-NEXT:    psrad $24, %xmm0
208; SSSE3-NEXT:    retq
209;
210; SSE41-LABEL: sext_16i8_to_4i32:
211; SSE41:       # BB#0: # %entry
212; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
213; SSE41-NEXT:    retq
214;
215; AVX-LABEL: sext_16i8_to_4i32:
216; AVX:       # BB#0: # %entry
217; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
218; AVX-NEXT:    retq
219;
220; X32-SSE41-LABEL: sext_16i8_to_4i32:
221; X32-SSE41:       # BB#0: # %entry
222; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
223; X32-SSE41-NEXT:    retl
224entry:
225  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
226  %C = sext <4 x i8> %B to <4 x i32>
227  ret <4 x i32> %C
228}
229
230define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
231; SSE2-LABEL: sext_16i8_to_8i32:
232; SSE2:       # BB#0: # %entry
233; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
234; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
235; SSE2-NEXT:    psrad $24, %xmm2
236; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
237; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
238; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
239; SSE2-NEXT:    psrad $24, %xmm1
240; SSE2-NEXT:    movdqa %xmm2, %xmm0
241; SSE2-NEXT:    retq
242;
243; SSSE3-LABEL: sext_16i8_to_8i32:
244; SSSE3:       # BB#0: # %entry
245; SSSE3-NEXT:    movdqa %xmm0, %xmm1
246; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
247; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
248; SSSE3-NEXT:    psrad $24, %xmm0
249; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
250; SSSE3-NEXT:    psrad $24, %xmm1
251; SSSE3-NEXT:    retq
252;
253; SSE41-LABEL: sext_16i8_to_8i32:
254; SSE41:       # BB#0: # %entry
255; SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
256; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
257; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
258; SSE41-NEXT:    movdqa %xmm2, %xmm0
259; SSE41-NEXT:    retq
260;
261; AVX1-LABEL: sext_16i8_to_8i32:
262; AVX1:       # BB#0: # %entry
263; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
264; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
265; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
266; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
267; AVX1-NEXT:    retq
268;
269; AVX2-LABEL: sext_16i8_to_8i32:
270; AVX2:       # BB#0: # %entry
271; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
272; AVX2-NEXT:    retq
273;
274; AVX512-LABEL: sext_16i8_to_8i32:
275; AVX512:       # BB#0: # %entry
276; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
277; AVX512-NEXT:    retq
278;
279; X32-SSE41-LABEL: sext_16i8_to_8i32:
280; X32-SSE41:       # BB#0: # %entry
281; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm2
282; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
283; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
284; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
285; X32-SSE41-NEXT:    retl
286entry:
287  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
288  %C = sext <8 x i8> %B to <8 x i32>
289  ret <8 x i32> %C
290}
291
292define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
293; SSE2-LABEL: sext_16i8_to_16i32:
294; SSE2:       # BB#0: # %entry
295; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
296; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
297; SSE2-NEXT:    psrad $24, %xmm4
298; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
299; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
300; SSE2-NEXT:    psrad $24, %xmm2
301; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
302; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
303; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
304; SSE2-NEXT:    psrad $24, %xmm1
305; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
306; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
307; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
308; SSE2-NEXT:    psrad $24, %xmm3
309; SSE2-NEXT:    movdqa %xmm4, %xmm0
310; SSE2-NEXT:    retq
311;
312; SSSE3-LABEL: sext_16i8_to_16i32:
313; SSSE3:       # BB#0: # %entry
314; SSSE3-NEXT:    movdqa %xmm0, %xmm3
315; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
316; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
317; SSSE3-NEXT:    psrad $24, %xmm0
318; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
319; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
320; SSSE3-NEXT:    psrad $24, %xmm2
321; SSSE3-NEXT:    movdqa %xmm3, %xmm1
322; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
323; SSSE3-NEXT:    psrad $24, %xmm1
324; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,12,u,u,u,13,u,u,u,14,u,u,u,15]
325; SSSE3-NEXT:    psrad $24, %xmm3
326; SSSE3-NEXT:    retq
327;
328; SSE41-LABEL: sext_16i8_to_16i32:
329; SSE41:       # BB#0: # %entry
330; SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
331; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
332; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
333; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
334; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
335; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
336; SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
337; SSE41-NEXT:    movdqa %xmm4, %xmm0
338; SSE41-NEXT:    retq
339;
340; AVX1-LABEL: sext_16i8_to_16i32:
341; AVX1:       # BB#0: # %entry
342; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
343; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
344; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
345; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
346; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
347; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
348; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
349; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
350; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
351; AVX1-NEXT:    vmovaps %ymm2, %ymm0
352; AVX1-NEXT:    retq
353;
354; AVX2-LABEL: sext_16i8_to_16i32:
355; AVX2:       # BB#0: # %entry
356; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm2
357; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
358; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm1
359; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
360; AVX2-NEXT:    retq
361;
362; AVX512-LABEL: sext_16i8_to_16i32:
363; AVX512:       # BB#0: # %entry
364; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
365; AVX512-NEXT:    retq
366;
367; X32-SSE41-LABEL: sext_16i8_to_16i32:
368; X32-SSE41:       # BB#0: # %entry
369; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm4
370; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
371; X32-SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
372; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
373; X32-SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
374; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
375; X32-SSE41-NEXT:    pmovsxbd %xmm0, %xmm3
376; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
377; X32-SSE41-NEXT:    retl
378entry:
379  %B = sext <16 x i8> %A to <16 x i32>
380  ret <16 x i32> %B
381}
382
383define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
384; SSE2-LABEL: sext_16i8_to_2i64:
385; SSE2:       # BB#0: # %entry
386; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
387; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
388; SSE2-NEXT:    movdqa %xmm0, %xmm1
389; SSE2-NEXT:    psrad $31, %xmm1
390; SSE2-NEXT:    psrad $24, %xmm0
391; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
392; SSE2-NEXT:    retq
393;
394; SSSE3-LABEL: sext_16i8_to_2i64:
395; SSSE3:       # BB#0: # %entry
396; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
397; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
398; SSSE3-NEXT:    movdqa %xmm0, %xmm1
399; SSSE3-NEXT:    psrad $31, %xmm1
400; SSSE3-NEXT:    psrad $24, %xmm0
401; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
402; SSSE3-NEXT:    retq
403;
404; SSE41-LABEL: sext_16i8_to_2i64:
405; SSE41:       # BB#0: # %entry
406; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
407; SSE41-NEXT:    retq
408;
409; AVX-LABEL: sext_16i8_to_2i64:
410; AVX:       # BB#0: # %entry
411; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
412; AVX-NEXT:    retq
413;
414; X32-SSE41-LABEL: sext_16i8_to_2i64:
415; X32-SSE41:       # BB#0: # %entry
416; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
417; X32-SSE41-NEXT:    retl
418entry:
419  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
420  %C = sext <2 x i8> %B to <2 x i64>
421  ret <2 x i64> %C
422}
423
424define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
425; SSE2-LABEL: sext_16i8_to_4i64:
426; SSE2:       # BB#0: # %entry
427; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
428; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
429; SSE2-NEXT:    movdqa %xmm2, %xmm1
430; SSE2-NEXT:    psrad $31, %xmm1
431; SSE2-NEXT:    psrad $24, %xmm2
432; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
433; SSE2-NEXT:    psrld $16, %xmm0
434; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
435; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
436; SSE2-NEXT:    movdqa %xmm1, %xmm0
437; SSE2-NEXT:    psrad $31, %xmm0
438; SSE2-NEXT:    psrad $24, %xmm1
439; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
440; SSE2-NEXT:    movdqa %xmm2, %xmm0
441; SSE2-NEXT:    retq
442;
443; SSSE3-LABEL: sext_16i8_to_4i64:
444; SSSE3:       # BB#0: # %entry
445; SSSE3-NEXT:    movdqa %xmm0, %xmm1
446; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
447; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
448; SSSE3-NEXT:    movdqa %xmm0, %xmm2
449; SSSE3-NEXT:    psrad $31, %xmm2
450; SSSE3-NEXT:    psrad $24, %xmm0
451; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
452; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,u,u,u,3,u,u,u],zero,xmm1[u,u,u],zero
453; SSSE3-NEXT:    movdqa %xmm1, %xmm2
454; SSSE3-NEXT:    psrad $31, %xmm2
455; SSSE3-NEXT:    psrad $24, %xmm1
456; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
457; SSSE3-NEXT:    retq
458;
459; SSE41-LABEL: sext_16i8_to_4i64:
460; SSE41:       # BB#0: # %entry
461; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
462; SSE41-NEXT:    psrld $16, %xmm0
463; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
464; SSE41-NEXT:    movdqa %xmm2, %xmm0
465; SSE41-NEXT:    retq
466;
467; AVX1-LABEL: sext_16i8_to_4i64:
468; AVX1:       # BB#0: # %entry
469; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
470; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
471; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
472; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
473; AVX1-NEXT:    retq
474;
475; AVX2-LABEL: sext_16i8_to_4i64:
476; AVX2:       # BB#0: # %entry
477; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
478; AVX2-NEXT:    retq
479;
480; AVX512-LABEL: sext_16i8_to_4i64:
481; AVX512:       # BB#0: # %entry
482; AVX512-NEXT:    vpmovsxbq %xmm0, %ymm0
483; AVX512-NEXT:    retq
484;
485; X32-SSE41-LABEL: sext_16i8_to_4i64:
486; X32-SSE41:       # BB#0: # %entry
487; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
488; X32-SSE41-NEXT:    psrld $16, %xmm0
489; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
490; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
491; X32-SSE41-NEXT:    retl
492entry:
493  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
494  %C = sext <4 x i8> %B to <4 x i64>
495  ret <4 x i64> %C
496}
497
498define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
499; SSE2-LABEL: sext_16i8_to_8i64:
500; SSE2:       # BB#0: # %entry
501; SSE2-NEXT:    movdqa %xmm0, %xmm1
502; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
503; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
504; SSE2-NEXT:    movdqa %xmm0, %xmm2
505; SSE2-NEXT:    psrad $31, %xmm2
506; SSE2-NEXT:    psrad $24, %xmm0
507; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
508; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
509; SSE2-NEXT:    psrld $16, %xmm1
510; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
511; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
512; SSE2-NEXT:    movdqa %xmm1, %xmm2
513; SSE2-NEXT:    psrad $31, %xmm2
514; SSE2-NEXT:    psrad $24, %xmm1
515; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
516; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
517; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
518; SSE2-NEXT:    movdqa %xmm2, %xmm4
519; SSE2-NEXT:    psrad $31, %xmm4
520; SSE2-NEXT:    psrad $24, %xmm2
521; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
522; SSE2-NEXT:    psrld $16, %xmm3
523; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
524; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
525; SSE2-NEXT:    movdqa %xmm3, %xmm4
526; SSE2-NEXT:    psrad $31, %xmm4
527; SSE2-NEXT:    psrad $24, %xmm3
528; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
529; SSE2-NEXT:    retq
530;
531; SSSE3-LABEL: sext_16i8_to_8i64:
532; SSSE3:       # BB#0: # %entry
533; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <u,u,u,2,u,u,u,3,u,u,u,255,u,u,u,255>
534; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
535; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
536; SSSE3-NEXT:    movdqa %xmm0, %xmm1
537; SSSE3-NEXT:    pshufb %xmm2, %xmm1
538; SSSE3-NEXT:    movdqa %xmm1, %xmm0
539; SSSE3-NEXT:    psrad $31, %xmm0
540; SSSE3-NEXT:    psrad $24, %xmm1
541; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
542; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
543; SSSE3-NEXT:    movdqa %xmm0, %xmm4
544; SSSE3-NEXT:    psrad $31, %xmm4
545; SSSE3-NEXT:    psrad $24, %xmm0
546; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
547; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
548; SSSE3-NEXT:    pshufb %xmm2, %xmm3
549; SSSE3-NEXT:    movdqa %xmm3, %xmm2
550; SSSE3-NEXT:    psrad $31, %xmm2
551; SSSE3-NEXT:    psrad $24, %xmm3
552; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
553; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
554; SSSE3-NEXT:    movdqa %xmm2, %xmm4
555; SSSE3-NEXT:    psrad $31, %xmm4
556; SSSE3-NEXT:    psrad $24, %xmm2
557; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
558; SSSE3-NEXT:    retq
559;
560; SSE41-LABEL: sext_16i8_to_8i64:
561; SSE41:       # BB#0: # %entry
562; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
563; SSE41-NEXT:    movdqa %xmm0, %xmm1
564; SSE41-NEXT:    psrld $16, %xmm1
565; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
566; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
567; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
568; SSE41-NEXT:    psrlq $48, %xmm0
569; SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
570; SSE41-NEXT:    movdqa %xmm4, %xmm0
571; SSE41-NEXT:    retq
572;
573; AVX1-LABEL: sext_16i8_to_8i64:
574; AVX1:       # BB#0: # %entry
575; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
576; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
577; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
578; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
579; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
580; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
581; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
582; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
583; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
584; AVX1-NEXT:    vmovaps %ymm2, %ymm0
585; AVX1-NEXT:    retq
586;
587; AVX2-LABEL: sext_16i8_to_8i64:
588; AVX2:       # BB#0: # %entry
589; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm2
590; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
591; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm1
592; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
593; AVX2-NEXT:    retq
594;
595; AVX512-LABEL: sext_16i8_to_8i64:
596; AVX512:       # BB#0: # %entry
597; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
598; AVX512-NEXT:    retq
599;
600; X32-SSE41-LABEL: sext_16i8_to_8i64:
601; X32-SSE41:       # BB#0: # %entry
602; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
603; X32-SSE41-NEXT:    movdqa %xmm0, %xmm1
604; X32-SSE41-NEXT:    psrld $16, %xmm1
605; X32-SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
606; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
607; X32-SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
608; X32-SSE41-NEXT:    psrlq $48, %xmm0
609; X32-SSE41-NEXT:    pmovsxbq %xmm0, %xmm3
610; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
611; X32-SSE41-NEXT:    retl
612entry:
613  %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
614  %C = sext <8 x i8> %B to <8 x i64>
615  ret <8 x i64> %C
616}
617
618define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
619; SSE2-LABEL: sext_8i16_to_4i32:
620; SSE2:       # BB#0: # %entry
621; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
622; SSE2-NEXT:    psrad $16, %xmm0
623; SSE2-NEXT:    retq
624;
625; SSSE3-LABEL: sext_8i16_to_4i32:
626; SSSE3:       # BB#0: # %entry
627; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
628; SSSE3-NEXT:    psrad $16, %xmm0
629; SSSE3-NEXT:    retq
630;
631; SSE41-LABEL: sext_8i16_to_4i32:
632; SSE41:       # BB#0: # %entry
633; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
634; SSE41-NEXT:    retq
635;
636; AVX-LABEL: sext_8i16_to_4i32:
637; AVX:       # BB#0: # %entry
638; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
639; AVX-NEXT:    retq
640;
641; X32-SSE41-LABEL: sext_8i16_to_4i32:
642; X32-SSE41:       # BB#0: # %entry
643; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
644; X32-SSE41-NEXT:    retl
645entry:
646  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
647  %C = sext <4 x i16> %B to <4 x i32>
648  ret <4 x i32> %C
649}
650
651define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
652; SSE2-LABEL: sext_8i16_to_8i32:
653; SSE2:       # BB#0: # %entry
654; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
655; SSE2-NEXT:    psrad $16, %xmm2
656; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
657; SSE2-NEXT:    psrad $16, %xmm1
658; SSE2-NEXT:    movdqa %xmm2, %xmm0
659; SSE2-NEXT:    retq
660;
661; SSSE3-LABEL: sext_8i16_to_8i32:
662; SSSE3:       # BB#0: # %entry
663; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
664; SSSE3-NEXT:    psrad $16, %xmm2
665; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
666; SSSE3-NEXT:    psrad $16, %xmm1
667; SSSE3-NEXT:    movdqa %xmm2, %xmm0
668; SSSE3-NEXT:    retq
669;
670; SSE41-LABEL: sext_8i16_to_8i32:
671; SSE41:       # BB#0: # %entry
672; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
673; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
674; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
675; SSE41-NEXT:    movdqa %xmm2, %xmm0
676; SSE41-NEXT:    retq
677;
678; AVX1-LABEL: sext_8i16_to_8i32:
679; AVX1:       # BB#0: # %entry
680; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
681; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
682; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
683; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
684; AVX1-NEXT:    retq
685;
686; AVX2-LABEL: sext_8i16_to_8i32:
687; AVX2:       # BB#0: # %entry
688; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
689; AVX2-NEXT:    retq
690;
691; AVX512-LABEL: sext_8i16_to_8i32:
692; AVX512:       # BB#0: # %entry
693; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
694; AVX512-NEXT:    retq
695;
696; X32-SSE41-LABEL: sext_8i16_to_8i32:
697; X32-SSE41:       # BB#0: # %entry
698; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
699; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
700; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
701; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
702; X32-SSE41-NEXT:    retl
703entry:
704  %B = sext <8 x i16> %A to <8 x i32>
705  ret <8 x i32> %B
706}
707
708define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
709; SSE2-LABEL: sext_16i16_to_16i32:
710; SSE2:       # BB#0: # %entry
711; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
712; SSE2-NEXT:    psrad $16, %xmm4
713; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
714; SSE2-NEXT:    psrad $16, %xmm5
715; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
716; SSE2-NEXT:    psrad $16, %xmm2
717; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
718; SSE2-NEXT:    psrad $16, %xmm3
719; SSE2-NEXT:    movdqa %xmm4, %xmm0
720; SSE2-NEXT:    movdqa %xmm5, %xmm1
721; SSE2-NEXT:    retq
722;
723; SSSE3-LABEL: sext_16i16_to_16i32:
724; SSSE3:       # BB#0: # %entry
725; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
726; SSSE3-NEXT:    psrad $16, %xmm4
727; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
728; SSSE3-NEXT:    psrad $16, %xmm5
729; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
730; SSSE3-NEXT:    psrad $16, %xmm2
731; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
732; SSSE3-NEXT:    psrad $16, %xmm3
733; SSSE3-NEXT:    movdqa %xmm4, %xmm0
734; SSSE3-NEXT:    movdqa %xmm5, %xmm1
735; SSSE3-NEXT:    retq
736;
737; SSE41-LABEL: sext_16i16_to_16i32:
738; SSE41:       # BB#0: # %entry
739; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
740; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
741; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
742; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
743; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
744; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
745; SSE41-NEXT:    movdqa %xmm5, %xmm0
746; SSE41-NEXT:    movdqa %xmm4, %xmm1
747; SSE41-NEXT:    retq
748;
749; AVX1-LABEL: sext_16i16_to_16i32:
750; AVX1:       # BB#0: # %entry
751; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
752; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
753; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
754; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
755; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
756; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
757; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
758; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
759; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
760; AVX1-NEXT:    vmovaps %ymm2, %ymm0
761; AVX1-NEXT:    retq
762;
763; AVX2-LABEL: sext_16i16_to_16i32:
764; AVX2:       # BB#0: # %entry
765; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
766; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
767; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm1
768; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
769; AVX2-NEXT:    retq
770;
771; AVX512-LABEL: sext_16i16_to_16i32:
772; AVX512:       # BB#0: # %entry
773; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
774; AVX512-NEXT:    retq
775;
776; X32-SSE41-LABEL: sext_16i16_to_16i32:
777; X32-SSE41:       # BB#0: # %entry
778; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
779; X32-SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
780; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
781; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
782; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
783; X32-SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
784; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
785; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
786; X32-SSE41-NEXT:    retl
787entry:
788  %B = sext <16 x i16> %A to <16 x i32>
789  ret <16 x i32> %B
790}
791
792define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
793; SSE2-LABEL: sext_8i16_to_2i64:
794; SSE2:       # BB#0: # %entry
795; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
796; SSE2-NEXT:    movdqa %xmm0, %xmm1
797; SSE2-NEXT:    psrad $31, %xmm1
798; SSE2-NEXT:    psrad $16, %xmm0
799; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
800; SSE2-NEXT:    retq
801;
802; SSSE3-LABEL: sext_8i16_to_2i64:
803; SSSE3:       # BB#0: # %entry
804; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
805; SSSE3-NEXT:    movdqa %xmm0, %xmm1
806; SSSE3-NEXT:    psrad $31, %xmm1
807; SSSE3-NEXT:    psrad $16, %xmm0
808; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
809; SSSE3-NEXT:    retq
810;
811; SSE41-LABEL: sext_8i16_to_2i64:
812; SSE41:       # BB#0: # %entry
813; SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
814; SSE41-NEXT:    retq
815;
816; AVX-LABEL: sext_8i16_to_2i64:
817; AVX:       # BB#0: # %entry
818; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
819; AVX-NEXT:    retq
820;
821; X32-SSE41-LABEL: sext_8i16_to_2i64:
822; X32-SSE41:       # BB#0: # %entry
823; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm0
824; X32-SSE41-NEXT:    retl
825entry:
826  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
827  %C = sext <2 x i16> %B to <2 x i64>
828  ret <2 x i64> %C
829}
830
831define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
832; SSE2-LABEL: sext_8i16_to_4i64:
833; SSE2:       # BB#0: # %entry
834; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
835; SSE2-NEXT:    movdqa %xmm2, %xmm1
836; SSE2-NEXT:    psrad $31, %xmm1
837; SSE2-NEXT:    psrad $16, %xmm2
838; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
839; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
840; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
841; SSE2-NEXT:    movdqa %xmm1, %xmm0
842; SSE2-NEXT:    psrad $31, %xmm0
843; SSE2-NEXT:    psrad $16, %xmm1
844; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
845; SSE2-NEXT:    movdqa %xmm2, %xmm0
846; SSE2-NEXT:    retq
847;
848; SSSE3-LABEL: sext_8i16_to_4i64:
849; SSSE3:       # BB#0: # %entry
850; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
851; SSSE3-NEXT:    movdqa %xmm2, %xmm1
852; SSSE3-NEXT:    psrad $31, %xmm1
853; SSSE3-NEXT:    psrad $16, %xmm2
854; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
855; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
856; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
857; SSSE3-NEXT:    movdqa %xmm1, %xmm0
858; SSSE3-NEXT:    psrad $31, %xmm0
859; SSSE3-NEXT:    psrad $16, %xmm1
860; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
861; SSSE3-NEXT:    movdqa %xmm2, %xmm0
862; SSSE3-NEXT:    retq
863;
864; SSE41-LABEL: sext_8i16_to_4i64:
865; SSE41:       # BB#0: # %entry
866; SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
867; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
868; SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
869; SSE41-NEXT:    movdqa %xmm2, %xmm0
870; SSE41-NEXT:    retq
871;
872; AVX1-LABEL: sext_8i16_to_4i64:
873; AVX1:       # BB#0: # %entry
874; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
875; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
876; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
877; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
878; AVX1-NEXT:    retq
879;
880; AVX2-LABEL: sext_8i16_to_4i64:
881; AVX2:       # BB#0: # %entry
882; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
883; AVX2-NEXT:    retq
884;
885; AVX512-LABEL: sext_8i16_to_4i64:
886; AVX512:       # BB#0: # %entry
887; AVX512-NEXT:    vpmovsxwq %xmm0, %ymm0
888; AVX512-NEXT:    retq
889;
890; X32-SSE41-LABEL: sext_8i16_to_4i64:
891; X32-SSE41:       # BB#0: # %entry
892; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm2
893; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
894; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm1
895; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
896; X32-SSE41-NEXT:    retl
897entry:
898  %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
899  %C = sext <4 x i16> %B to <4 x i64>
900  ret <4 x i64> %C
901}
902
903define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
904; SSE2-LABEL: sext_8i16_to_8i64:
905; SSE2:       # BB#0: # %entry
906; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
907; SSE2-NEXT:    movdqa %xmm4, %xmm1
908; SSE2-NEXT:    psrad $31, %xmm1
909; SSE2-NEXT:    psrad $16, %xmm4
910; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
911; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
912; SSE2-NEXT:    movdqa %xmm2, %xmm1
913; SSE2-NEXT:    psrad $31, %xmm1
914; SSE2-NEXT:    psrad $16, %xmm2
915; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
916; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
917; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
918; SSE2-NEXT:    movdqa %xmm1, %xmm3
919; SSE2-NEXT:    psrad $31, %xmm3
920; SSE2-NEXT:    psrad $16, %xmm1
921; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
922; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
923; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
924; SSE2-NEXT:    movdqa %xmm3, %xmm0
925; SSE2-NEXT:    psrad $31, %xmm0
926; SSE2-NEXT:    psrad $16, %xmm3
927; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
928; SSE2-NEXT:    movdqa %xmm4, %xmm0
929; SSE2-NEXT:    retq
930;
931; SSSE3-LABEL: sext_8i16_to_8i64:
932; SSSE3:       # BB#0: # %entry
933; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
934; SSSE3-NEXT:    movdqa %xmm4, %xmm1
935; SSSE3-NEXT:    psrad $31, %xmm1
936; SSSE3-NEXT:    psrad $16, %xmm4
937; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
938; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
939; SSSE3-NEXT:    movdqa %xmm2, %xmm1
940; SSSE3-NEXT:    psrad $31, %xmm1
941; SSSE3-NEXT:    psrad $16, %xmm2
942; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
943; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
944; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
945; SSSE3-NEXT:    movdqa %xmm1, %xmm3
946; SSSE3-NEXT:    psrad $31, %xmm3
947; SSSE3-NEXT:    psrad $16, %xmm1
948; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
949; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
950; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
951; SSSE3-NEXT:    movdqa %xmm3, %xmm0
952; SSSE3-NEXT:    psrad $31, %xmm0
953; SSSE3-NEXT:    psrad $16, %xmm3
954; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
955; SSSE3-NEXT:    movdqa %xmm4, %xmm0
956; SSSE3-NEXT:    retq
957;
958; SSE41-LABEL: sext_8i16_to_8i64:
959; SSE41:       # BB#0: # %entry
960; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
961; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
962; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
963; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
964; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
965; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
966; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
967; SSE41-NEXT:    movdqa %xmm4, %xmm0
968; SSE41-NEXT:    retq
969;
970; AVX1-LABEL: sext_8i16_to_8i64:
971; AVX1:       # BB#0: # %entry
972; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
973; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
974; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
975; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
976; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
977; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
978; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
979; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
980; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
981; AVX1-NEXT:    vmovaps %ymm2, %ymm0
982; AVX1-NEXT:    retq
983;
984; AVX2-LABEL: sext_8i16_to_8i64:
985; AVX2:       # BB#0: # %entry
986; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm2
987; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
988; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm1
989; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
990; AVX2-NEXT:    retq
991;
992; AVX512-LABEL: sext_8i16_to_8i64:
993; AVX512:       # BB#0: # %entry
994; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
995; AVX512-NEXT:    retq
996;
997; X32-SSE41-LABEL: sext_8i16_to_8i64:
998; X32-SSE41:       # BB#0: # %entry
999; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1000; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1001; X32-SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1002; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
1003; X32-SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1004; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1005; X32-SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1006; X32-SSE41-NEXT:    movdqa %xmm4, %xmm0
1007; X32-SSE41-NEXT:    retl
1008entry:
1009  %B = sext <8 x i16> %A to <8 x i64>
1010  ret <8 x i64> %B
1011}
1012
1013define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1014; SSE2-LABEL: sext_4i32_to_2i64:
1015; SSE2:       # BB#0: # %entry
1016; SSE2-NEXT:    movdqa %xmm0, %xmm1
1017; SSE2-NEXT:    psrad $31, %xmm1
1018; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1019; SSE2-NEXT:    retq
1020;
1021; SSSE3-LABEL: sext_4i32_to_2i64:
1022; SSSE3:       # BB#0: # %entry
1023; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1024; SSSE3-NEXT:    psrad $31, %xmm1
1025; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1026; SSSE3-NEXT:    retq
1027;
1028; SSE41-LABEL: sext_4i32_to_2i64:
1029; SSE41:       # BB#0: # %entry
1030; SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1031; SSE41-NEXT:    retq
1032;
1033; AVX-LABEL: sext_4i32_to_2i64:
1034; AVX:       # BB#0: # %entry
1035; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
1036; AVX-NEXT:    retq
1037;
1038; X32-SSE41-LABEL: sext_4i32_to_2i64:
1039; X32-SSE41:       # BB#0: # %entry
1040; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm0
1041; X32-SSE41-NEXT:    retl
1042entry:
1043  %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1044  %C = sext <2 x i32> %B to <2 x i64>
1045  ret <2 x i64> %C
1046}
1047
1048define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
1049; SSE2-LABEL: sext_4i32_to_4i64:
1050; SSE2:       # BB#0: # %entry
1051; SSE2-NEXT:    movdqa %xmm0, %xmm2
1052; SSE2-NEXT:    psrad $31, %xmm2
1053; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1054; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1055; SSE2-NEXT:    movdqa %xmm1, %xmm2
1056; SSE2-NEXT:    psrad $31, %xmm2
1057; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1058; SSE2-NEXT:    retq
1059;
1060; SSSE3-LABEL: sext_4i32_to_4i64:
1061; SSSE3:       # BB#0: # %entry
1062; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1063; SSSE3-NEXT:    psrad $31, %xmm2
1064; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1065; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1066; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1067; SSSE3-NEXT:    psrad $31, %xmm2
1068; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1069; SSSE3-NEXT:    retq
1070;
1071; SSE41-LABEL: sext_4i32_to_4i64:
1072; SSE41:       # BB#0: # %entry
1073; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1074; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1075; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1076; SSE41-NEXT:    movdqa %xmm2, %xmm0
1077; SSE41-NEXT:    retq
1078;
1079; AVX1-LABEL: sext_4i32_to_4i64:
1080; AVX1:       # BB#0: # %entry
1081; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1082; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1083; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1084; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1085; AVX1-NEXT:    retq
1086;
1087; AVX2-LABEL: sext_4i32_to_4i64:
1088; AVX2:       # BB#0: # %entry
1089; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
1090; AVX2-NEXT:    retq
1091;
1092; AVX512-LABEL: sext_4i32_to_4i64:
1093; AVX512:       # BB#0: # %entry
1094; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
1095; AVX512-NEXT:    retq
1096;
1097; X32-SSE41-LABEL: sext_4i32_to_4i64:
1098; X32-SSE41:       # BB#0: # %entry
1099; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
1100; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1101; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
1102; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
1103; X32-SSE41-NEXT:    retl
1104entry:
1105  %B = sext <4 x i32> %A to <4 x i64>
1106  ret <4 x i64> %B
1107}
1108
1109define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
1110; SSE2-LABEL: sext_8i32_to_8i64:
1111; SSE2:       # BB#0: # %entry
1112; SSE2-NEXT:    movdqa %xmm1, %xmm2
1113; SSE2-NEXT:    movdqa %xmm0, %xmm3
1114; SSE2-NEXT:    psrad $31, %xmm3
1115; SSE2-NEXT:    movdqa %xmm2, %xmm4
1116; SSE2-NEXT:    psrad $31, %xmm4
1117; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1118; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1119; SSE2-NEXT:    movdqa %xmm1, %xmm3
1120; SSE2-NEXT:    psrad $31, %xmm3
1121; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1122; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
1123; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1124; SSE2-NEXT:    movdqa %xmm3, %xmm4
1125; SSE2-NEXT:    psrad $31, %xmm4
1126; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1127; SSE2-NEXT:    retq
1128;
1129; SSSE3-LABEL: sext_8i32_to_8i64:
1130; SSSE3:       # BB#0: # %entry
1131; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1132; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1133; SSSE3-NEXT:    psrad $31, %xmm3
1134; SSSE3-NEXT:    movdqa %xmm2, %xmm4
1135; SSSE3-NEXT:    psrad $31, %xmm4
1136; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1137; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1138; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1139; SSSE3-NEXT:    psrad $31, %xmm3
1140; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1141; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
1142; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1143; SSSE3-NEXT:    movdqa %xmm3, %xmm4
1144; SSSE3-NEXT:    psrad $31, %xmm4
1145; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1146; SSSE3-NEXT:    retq
1147;
1148; SSE41-LABEL: sext_8i32_to_8i64:
1149; SSE41:       # BB#0: # %entry
1150; SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1151; SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1152; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1153; SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1154; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1155; SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1156; SSE41-NEXT:    movdqa %xmm5, %xmm0
1157; SSE41-NEXT:    movdqa %xmm4, %xmm1
1158; SSE41-NEXT:    retq
1159;
1160; AVX1-LABEL: sext_8i32_to_8i64:
1161; AVX1:       # BB#0: # %entry
1162; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1163; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
1164; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
1165; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
1166; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1167; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1168; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1169; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1170; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1171; AVX1-NEXT:    vmovaps %ymm2, %ymm0
1172; AVX1-NEXT:    retq
1173;
1174; AVX2-LABEL: sext_8i32_to_8i64:
1175; AVX2:       # BB#0: # %entry
1176; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
1177; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1178; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
1179; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
1180; AVX2-NEXT:    retq
1181;
1182; AVX512-LABEL: sext_8i32_to_8i64:
1183; AVX512:       # BB#0: # %entry
1184; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
1185; AVX512-NEXT:    retq
1186;
1187; X32-SSE41-LABEL: sext_8i32_to_8i64:
1188; X32-SSE41:       # BB#0: # %entry
1189; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm5
1190; X32-SSE41-NEXT:    pmovsxdq %xmm1, %xmm2
1191; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1192; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm4
1193; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1194; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm3
1195; X32-SSE41-NEXT:    movdqa %xmm5, %xmm0
1196; X32-SSE41-NEXT:    movdqa %xmm4, %xmm1
1197; X32-SSE41-NEXT:    retl
1198entry:
1199  %B = sext <8 x i32> %A to <8 x i64>
1200  ret <8 x i64> %B
1201}
1202
1203define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
1204; SSE-LABEL: load_sext_2i1_to_2i64:
1205; SSE:       # BB#0: # %entry
1206; SSE-NEXT:    movzbl (%rdi), %eax
1207; SSE-NEXT:    movq %rax, %rcx
1208; SSE-NEXT:    shlq $62, %rcx
1209; SSE-NEXT:    sarq $63, %rcx
1210; SSE-NEXT:    movd %rcx, %xmm1
1211; SSE-NEXT:    shlq $63, %rax
1212; SSE-NEXT:    sarq $63, %rax
1213; SSE-NEXT:    movd %rax, %xmm0
1214; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1215; SSE-NEXT:    retq
1216;
1217; AVX1-LABEL: load_sext_2i1_to_2i64:
1218; AVX1:       # BB#0: # %entry
1219; AVX1-NEXT:    movzbl (%rdi), %eax
1220; AVX1-NEXT:    movq %rax, %rcx
1221; AVX1-NEXT:    shlq $62, %rcx
1222; AVX1-NEXT:    sarq $63, %rcx
1223; AVX1-NEXT:    vmovq %rcx, %xmm0
1224; AVX1-NEXT:    shlq $63, %rax
1225; AVX1-NEXT:    sarq $63, %rax
1226; AVX1-NEXT:    vmovq %rax, %xmm1
1227; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1228; AVX1-NEXT:    retq
1229;
1230; AVX2-LABEL: load_sext_2i1_to_2i64:
1231; AVX2:       # BB#0: # %entry
1232; AVX2-NEXT:    movzbl (%rdi), %eax
1233; AVX2-NEXT:    movq %rax, %rcx
1234; AVX2-NEXT:    shlq $62, %rcx
1235; AVX2-NEXT:    sarq $63, %rcx
1236; AVX2-NEXT:    vmovq %rcx, %xmm0
1237; AVX2-NEXT:    shlq $63, %rax
1238; AVX2-NEXT:    sarq $63, %rax
1239; AVX2-NEXT:    vmovq %rax, %xmm1
1240; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1241; AVX2-NEXT:    retq
1242;
1243; AVX512F-LABEL: load_sext_2i1_to_2i64:
1244; AVX512F:       # BB#0: # %entry
1245; AVX512F-NEXT:    movzbl (%rdi), %eax
1246; AVX512F-NEXT:    kmovw %eax, %k1
1247; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1248; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
1249; AVX512F-NEXT:    retq
1250;
1251; AVX512BW-LABEL: load_sext_2i1_to_2i64:
1252; AVX512BW:       # BB#0: # %entry
1253; AVX512BW-NEXT:    movzbl (%rdi), %eax
1254; AVX512BW-NEXT:    kmovd %eax, %k1
1255; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1256; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
1257; AVX512BW-NEXT:    retq
1258;
1259; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
1260; X32-SSE41:       # BB#0: # %entry
1261; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1262; X32-SSE41-NEXT:    movzbl (%eax), %eax
1263; X32-SSE41-NEXT:    movl %eax, %ecx
1264; X32-SSE41-NEXT:    shll $31, %ecx
1265; X32-SSE41-NEXT:    sarl $31, %ecx
1266; X32-SSE41-NEXT:    movd %ecx, %xmm0
1267; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1268; X32-SSE41-NEXT:    shll $30, %eax
1269; X32-SSE41-NEXT:    sarl $31, %eax
1270; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
1271; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1272; X32-SSE41-NEXT:    retl
1273entry:
1274 %X = load <2 x i1>, <2 x i1>* %ptr
1275 %Y = sext <2 x i1> %X to <2 x i64>
1276 ret <2 x i64> %Y
1277}
1278
1279define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
1280; SSE2-LABEL: load_sext_2i8_to_2i64:
1281; SSE2:       # BB#0: # %entry
1282; SSE2-NEXT:    movzwl (%rdi), %eax
1283; SSE2-NEXT:    movd %eax, %xmm0
1284; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1285; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1286; SSE2-NEXT:    movdqa %xmm0, %xmm1
1287; SSE2-NEXT:    psrad $31, %xmm1
1288; SSE2-NEXT:    psrad $24, %xmm0
1289; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1290; SSE2-NEXT:    retq
1291;
1292; SSSE3-LABEL: load_sext_2i8_to_2i64:
1293; SSSE3:       # BB#0: # %entry
1294; SSSE3-NEXT:    movzwl (%rdi), %eax
1295; SSSE3-NEXT:    movd %eax, %xmm0
1296; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1297; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1298; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1299; SSSE3-NEXT:    psrad $31, %xmm1
1300; SSSE3-NEXT:    psrad $24, %xmm0
1301; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1302; SSSE3-NEXT:    retq
1303;
1304; SSE41-LABEL: load_sext_2i8_to_2i64:
1305; SSE41:       # BB#0: # %entry
1306; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1307; SSE41-NEXT:    retq
1308;
1309; AVX-LABEL: load_sext_2i8_to_2i64:
1310; AVX:       # BB#0: # %entry
1311; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
1312; AVX-NEXT:    retq
1313;
1314; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
1315; X32-SSE41:       # BB#0: # %entry
1316; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1317; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1318; X32-SSE41-NEXT:    retl
1319entry:
1320 %X = load <2 x i8>, <2 x i8>* %ptr
1321 %Y = sext <2 x i8> %X to <2 x i64>
1322 ret <2 x i64> %Y
1323}
1324
1325define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
1326; SSE2-LABEL: load_sext_4i1_to_4i32:
1327; SSE2:       # BB#0: # %entry
1328; SSE2-NEXT:    movzbl (%rdi), %eax
1329; SSE2-NEXT:    movq %rax, %rcx
1330; SSE2-NEXT:    shlq $60, %rcx
1331; SSE2-NEXT:    sarq $63, %rcx
1332; SSE2-NEXT:    movd %ecx, %xmm0
1333; SSE2-NEXT:    movq %rax, %rcx
1334; SSE2-NEXT:    shlq $62, %rcx
1335; SSE2-NEXT:    sarq $63, %rcx
1336; SSE2-NEXT:    movd %ecx, %xmm1
1337; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1338; SSE2-NEXT:    movq %rax, %rcx
1339; SSE2-NEXT:    shlq $61, %rcx
1340; SSE2-NEXT:    sarq $63, %rcx
1341; SSE2-NEXT:    movd %ecx, %xmm2
1342; SSE2-NEXT:    shlq $63, %rax
1343; SSE2-NEXT:    sarq $63, %rax
1344; SSE2-NEXT:    movd %eax, %xmm0
1345; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1346; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1347; SSE2-NEXT:    retq
1348;
1349; SSSE3-LABEL: load_sext_4i1_to_4i32:
1350; SSSE3:       # BB#0: # %entry
1351; SSSE3-NEXT:    movzbl (%rdi), %eax
1352; SSSE3-NEXT:    movq %rax, %rcx
1353; SSSE3-NEXT:    shlq $60, %rcx
1354; SSSE3-NEXT:    sarq $63, %rcx
1355; SSSE3-NEXT:    movd %ecx, %xmm0
1356; SSSE3-NEXT:    movq %rax, %rcx
1357; SSSE3-NEXT:    shlq $62, %rcx
1358; SSSE3-NEXT:    sarq $63, %rcx
1359; SSSE3-NEXT:    movd %ecx, %xmm1
1360; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1361; SSSE3-NEXT:    movq %rax, %rcx
1362; SSSE3-NEXT:    shlq $61, %rcx
1363; SSSE3-NEXT:    sarq $63, %rcx
1364; SSSE3-NEXT:    movd %ecx, %xmm2
1365; SSSE3-NEXT:    shlq $63, %rax
1366; SSSE3-NEXT:    sarq $63, %rax
1367; SSSE3-NEXT:    movd %eax, %xmm0
1368; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1369; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1370; SSSE3-NEXT:    retq
1371;
1372; SSE41-LABEL: load_sext_4i1_to_4i32:
1373; SSE41:       # BB#0: # %entry
1374; SSE41-NEXT:    movzbl (%rdi), %eax
1375; SSE41-NEXT:    movq %rax, %rcx
1376; SSE41-NEXT:    shlq $62, %rcx
1377; SSE41-NEXT:    sarq $63, %rcx
1378; SSE41-NEXT:    movq %rax, %rdx
1379; SSE41-NEXT:    shlq $63, %rdx
1380; SSE41-NEXT:    sarq $63, %rdx
1381; SSE41-NEXT:    movd %edx, %xmm0
1382; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1383; SSE41-NEXT:    movq %rax, %rcx
1384; SSE41-NEXT:    shlq $61, %rcx
1385; SSE41-NEXT:    sarq $63, %rcx
1386; SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
1387; SSE41-NEXT:    shlq $60, %rax
1388; SSE41-NEXT:    sarq $63, %rax
1389; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1390; SSE41-NEXT:    retq
1391;
1392; AVX1-LABEL: load_sext_4i1_to_4i32:
1393; AVX1:       # BB#0: # %entry
1394; AVX1-NEXT:    movzbl (%rdi), %eax
1395; AVX1-NEXT:    movq %rax, %rcx
1396; AVX1-NEXT:    shlq $62, %rcx
1397; AVX1-NEXT:    sarq $63, %rcx
1398; AVX1-NEXT:    movq %rax, %rdx
1399; AVX1-NEXT:    shlq $63, %rdx
1400; AVX1-NEXT:    sarq $63, %rdx
1401; AVX1-NEXT:    vmovd %edx, %xmm0
1402; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1403; AVX1-NEXT:    movq %rax, %rcx
1404; AVX1-NEXT:    shlq $61, %rcx
1405; AVX1-NEXT:    sarq $63, %rcx
1406; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1407; AVX1-NEXT:    shlq $60, %rax
1408; AVX1-NEXT:    sarq $63, %rax
1409; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1410; AVX1-NEXT:    retq
1411;
1412; AVX2-LABEL: load_sext_4i1_to_4i32:
1413; AVX2:       # BB#0: # %entry
1414; AVX2-NEXT:    movzbl (%rdi), %eax
1415; AVX2-NEXT:    movq %rax, %rcx
1416; AVX2-NEXT:    shlq $62, %rcx
1417; AVX2-NEXT:    sarq $63, %rcx
1418; AVX2-NEXT:    movq %rax, %rdx
1419; AVX2-NEXT:    shlq $63, %rdx
1420; AVX2-NEXT:    sarq $63, %rdx
1421; AVX2-NEXT:    vmovd %edx, %xmm0
1422; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1423; AVX2-NEXT:    movq %rax, %rcx
1424; AVX2-NEXT:    shlq $61, %rcx
1425; AVX2-NEXT:    sarq $63, %rcx
1426; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1427; AVX2-NEXT:    shlq $60, %rax
1428; AVX2-NEXT:    sarq $63, %rax
1429; AVX2-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1430; AVX2-NEXT:    retq
1431;
1432; AVX512F-LABEL: load_sext_4i1_to_4i32:
1433; AVX512F:       # BB#0: # %entry
1434; AVX512F-NEXT:    movzbl (%rdi), %eax
1435; AVX512F-NEXT:    kmovw %eax, %k1
1436; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1437; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1438; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1439; AVX512F-NEXT:    retq
1440;
1441; AVX512BW-LABEL: load_sext_4i1_to_4i32:
1442; AVX512BW:       # BB#0: # %entry
1443; AVX512BW-NEXT:    movzbl (%rdi), %eax
1444; AVX512BW-NEXT:    kmovd %eax, %k1
1445; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1446; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1447; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1448; AVX512BW-NEXT:    retq
1449;
1450; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
1451; X32-SSE41:       # BB#0: # %entry
1452; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1453; X32-SSE41-NEXT:    movl (%eax), %eax
1454; X32-SSE41-NEXT:    movl %eax, %ecx
1455; X32-SSE41-NEXT:    shll $30, %ecx
1456; X32-SSE41-NEXT:    sarl $31, %ecx
1457; X32-SSE41-NEXT:    movl %eax, %edx
1458; X32-SSE41-NEXT:    shll $31, %edx
1459; X32-SSE41-NEXT:    sarl $31, %edx
1460; X32-SSE41-NEXT:    movd %edx, %xmm0
1461; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
1462; X32-SSE41-NEXT:    movl %eax, %ecx
1463; X32-SSE41-NEXT:    shll $29, %ecx
1464; X32-SSE41-NEXT:    sarl $31, %ecx
1465; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm0
1466; X32-SSE41-NEXT:    shll $28, %eax
1467; X32-SSE41-NEXT:    sarl $31, %eax
1468; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
1469; X32-SSE41-NEXT:    retl
1470entry:
1471 %X = load <4 x i1>, <4 x i1>* %ptr
1472 %Y = sext <4 x i1> %X to <4 x i32>
1473 ret <4 x i32> %Y
1474}
1475
1476define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
1477; SSE2-LABEL: load_sext_4i8_to_4i32:
1478; SSE2:       # BB#0: # %entry
1479; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1480; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1481; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1482; SSE2-NEXT:    psrad $24, %xmm0
1483; SSE2-NEXT:    retq
1484;
1485; SSSE3-LABEL: load_sext_4i8_to_4i32:
1486; SSSE3:       # BB#0: # %entry
1487; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1488; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1489; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1490; SSSE3-NEXT:    psrad $24, %xmm0
1491; SSSE3-NEXT:    retq
1492;
1493; SSE41-LABEL: load_sext_4i8_to_4i32:
1494; SSE41:       # BB#0: # %entry
1495; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
1496; SSE41-NEXT:    retq
1497;
1498; AVX-LABEL: load_sext_4i8_to_4i32:
1499; AVX:       # BB#0: # %entry
1500; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
1501; AVX-NEXT:    retq
1502;
1503; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
1504; X32-SSE41:       # BB#0: # %entry
1505; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1506; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
1507; X32-SSE41-NEXT:    retl
1508entry:
1509 %X = load <4 x i8>, <4 x i8>* %ptr
1510 %Y = sext <4 x i8> %X to <4 x i32>
1511 ret <4 x i32> %Y
1512}
1513
1514define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
1515; SSE2-LABEL: load_sext_4i1_to_4i64:
1516; SSE2:       # BB#0: # %entry
1517; SSE2-NEXT:    movl (%rdi), %eax
1518; SSE2-NEXT:    movl %eax, %ecx
1519; SSE2-NEXT:    shrl $3, %ecx
1520; SSE2-NEXT:    movd %ecx, %xmm0
1521; SSE2-NEXT:    movl %eax, %ecx
1522; SSE2-NEXT:    shrl %ecx
1523; SSE2-NEXT:    movd %ecx, %xmm1
1524; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1525; SSE2-NEXT:    movd %eax, %xmm2
1526; SSE2-NEXT:    shrl $2, %eax
1527; SSE2-NEXT:    movd %eax, %xmm0
1528; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1529; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1530; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1531; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
1532; SSE2-NEXT:    psllq $63, %xmm0
1533; SSE2-NEXT:    psrad $31, %xmm0
1534; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1535; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
1536; SSE2-NEXT:    psllq $63, %xmm1
1537; SSE2-NEXT:    psrad $31, %xmm1
1538; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1539; SSE2-NEXT:    retq
1540;
1541; SSSE3-LABEL: load_sext_4i1_to_4i64:
1542; SSSE3:       # BB#0: # %entry
1543; SSSE3-NEXT:    movl (%rdi), %eax
1544; SSSE3-NEXT:    movl %eax, %ecx
1545; SSSE3-NEXT:    shrl $3, %ecx
1546; SSSE3-NEXT:    movd %ecx, %xmm0
1547; SSSE3-NEXT:    movl %eax, %ecx
1548; SSSE3-NEXT:    shrl %ecx
1549; SSSE3-NEXT:    movd %ecx, %xmm1
1550; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1551; SSSE3-NEXT:    movd %eax, %xmm2
1552; SSSE3-NEXT:    shrl $2, %eax
1553; SSSE3-NEXT:    movd %eax, %xmm0
1554; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1555; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1556; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm2
1557; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
1558; SSSE3-NEXT:    psllq $63, %xmm0
1559; SSSE3-NEXT:    psrad $31, %xmm0
1560; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1561; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
1562; SSSE3-NEXT:    psllq $63, %xmm1
1563; SSSE3-NEXT:    psrad $31, %xmm1
1564; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1565; SSSE3-NEXT:    retq
1566;
1567; SSE41-LABEL: load_sext_4i1_to_4i64:
1568; SSE41:       # BB#0: # %entry
1569; SSE41-NEXT:    movl (%rdi), %eax
1570; SSE41-NEXT:    movl %eax, %ecx
1571; SSE41-NEXT:    shrl %ecx
1572; SSE41-NEXT:    movd %eax, %xmm1
1573; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
1574; SSE41-NEXT:    movl %eax, %ecx
1575; SSE41-NEXT:    shrl $2, %ecx
1576; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
1577; SSE41-NEXT:    shrl $3, %eax
1578; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
1579; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
1580; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1581; SSE41-NEXT:    psllq $63, %xmm0
1582; SSE41-NEXT:    psrad $31, %xmm0
1583; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1584; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1585; SSE41-NEXT:    psllq $63, %xmm1
1586; SSE41-NEXT:    psrad $31, %xmm1
1587; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1588; SSE41-NEXT:    retq
1589;
1590; AVX1-LABEL: load_sext_4i1_to_4i64:
1591; AVX1:       # BB#0: # %entry
1592; AVX1-NEXT:    movzbl (%rdi), %eax
1593; AVX1-NEXT:    movq %rax, %rcx
1594; AVX1-NEXT:    shlq $62, %rcx
1595; AVX1-NEXT:    sarq $63, %rcx
1596; AVX1-NEXT:    movq %rax, %rdx
1597; AVX1-NEXT:    shlq $63, %rdx
1598; AVX1-NEXT:    sarq $63, %rdx
1599; AVX1-NEXT:    vmovd %edx, %xmm0
1600; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
1601; AVX1-NEXT:    movq %rax, %rcx
1602; AVX1-NEXT:    shlq $61, %rcx
1603; AVX1-NEXT:    sarq $63, %rcx
1604; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
1605; AVX1-NEXT:    shlq $60, %rax
1606; AVX1-NEXT:    sarq $63, %rax
1607; AVX1-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
1608; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1609; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1610; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1611; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1612; AVX1-NEXT:    retq
1613;
1614; AVX2-LABEL: load_sext_4i1_to_4i64:
1615; AVX2:       # BB#0: # %entry
1616; AVX2-NEXT:    movzbl (%rdi), %eax
1617; AVX2-NEXT:    movq %rax, %rcx
1618; AVX2-NEXT:    shlq $60, %rcx
1619; AVX2-NEXT:    sarq $63, %rcx
1620; AVX2-NEXT:    vmovq %rcx, %xmm0
1621; AVX2-NEXT:    movq %rax, %rcx
1622; AVX2-NEXT:    shlq $61, %rcx
1623; AVX2-NEXT:    sarq $63, %rcx
1624; AVX2-NEXT:    vmovq %rcx, %xmm1
1625; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1626; AVX2-NEXT:    movq %rax, %rcx
1627; AVX2-NEXT:    shlq $62, %rcx
1628; AVX2-NEXT:    sarq $63, %rcx
1629; AVX2-NEXT:    vmovq %rcx, %xmm1
1630; AVX2-NEXT:    shlq $63, %rax
1631; AVX2-NEXT:    sarq $63, %rax
1632; AVX2-NEXT:    vmovq %rax, %xmm2
1633; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1634; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1635; AVX2-NEXT:    retq
1636;
1637; AVX512F-LABEL: load_sext_4i1_to_4i64:
1638; AVX512F:       # BB#0: # %entry
1639; AVX512F-NEXT:    movzbl (%rdi), %eax
1640; AVX512F-NEXT:    kmovw %eax, %k1
1641; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1642; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
1643; AVX512F-NEXT:    retq
1644;
1645; AVX512BW-LABEL: load_sext_4i1_to_4i64:
1646; AVX512BW:       # BB#0: # %entry
1647; AVX512BW-NEXT:    movzbl (%rdi), %eax
1648; AVX512BW-NEXT:    kmovd %eax, %k1
1649; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1650; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
1651; AVX512BW-NEXT:    retq
1652;
1653; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
1654; X32-SSE41:       # BB#0: # %entry
1655; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1656; X32-SSE41-NEXT:    movzbl (%eax), %eax
1657; X32-SSE41-NEXT:    movl %eax, %ecx
1658; X32-SSE41-NEXT:    shrl %ecx
1659; X32-SSE41-NEXT:    movd %eax, %xmm1
1660; X32-SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
1661; X32-SSE41-NEXT:    movl %eax, %ecx
1662; X32-SSE41-NEXT:    shrl $2, %ecx
1663; X32-SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
1664; X32-SSE41-NEXT:    shrl $3, %eax
1665; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
1666; X32-SSE41-NEXT:    pand {{\.LCPI.*}}, %xmm1
1667; X32-SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1668; X32-SSE41-NEXT:    psllq $63, %xmm0
1669; X32-SSE41-NEXT:    psrad $31, %xmm0
1670; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1671; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1672; X32-SSE41-NEXT:    psllq $63, %xmm1
1673; X32-SSE41-NEXT:    psrad $31, %xmm1
1674; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1675; X32-SSE41-NEXT:    retl
1676entry:
1677 %X = load <4 x i1>, <4 x i1>* %ptr
1678 %Y = sext <4 x i1> %X to <4 x i64>
1679 ret <4 x i64> %Y
1680}
1681
1682define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
1683; SSE2-LABEL: load_sext_4i8_to_4i64:
1684; SSE2:       # BB#0: # %entry
1685; SSE2-NEXT:    movsbq 1(%rdi), %rax
1686; SSE2-NEXT:    movd %rax, %xmm1
1687; SSE2-NEXT:    movsbq (%rdi), %rax
1688; SSE2-NEXT:    movd %rax, %xmm0
1689; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1690; SSE2-NEXT:    movsbq 3(%rdi), %rax
1691; SSE2-NEXT:    movd %rax, %xmm2
1692; SSE2-NEXT:    movsbq 2(%rdi), %rax
1693; SSE2-NEXT:    movd %rax, %xmm1
1694; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1695; SSE2-NEXT:    retq
1696;
1697; SSSE3-LABEL: load_sext_4i8_to_4i64:
1698; SSSE3:       # BB#0: # %entry
1699; SSSE3-NEXT:    movsbq 1(%rdi), %rax
1700; SSSE3-NEXT:    movd %rax, %xmm1
1701; SSSE3-NEXT:    movsbq (%rdi), %rax
1702; SSSE3-NEXT:    movd %rax, %xmm0
1703; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1704; SSSE3-NEXT:    movsbq 3(%rdi), %rax
1705; SSSE3-NEXT:    movd %rax, %xmm2
1706; SSSE3-NEXT:    movsbq 2(%rdi), %rax
1707; SSSE3-NEXT:    movd %rax, %xmm1
1708; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1709; SSSE3-NEXT:    retq
1710;
1711; SSE41-LABEL: load_sext_4i8_to_4i64:
1712; SSE41:       # BB#0: # %entry
1713; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
1714; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
1715; SSE41-NEXT:    retq
1716;
1717; AVX1-LABEL: load_sext_4i8_to_4i64:
1718; AVX1:       # BB#0: # %entry
1719; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
1720; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
1721; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1722; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
1723; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1724; AVX1-NEXT:    retq
1725;
1726; AVX2-LABEL: load_sext_4i8_to_4i64:
1727; AVX2:       # BB#0: # %entry
1728; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
1729; AVX2-NEXT:    retq
1730;
1731; AVX512-LABEL: load_sext_4i8_to_4i64:
1732; AVX512:       # BB#0: # %entry
1733; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
1734; AVX512-NEXT:    retq
1735;
1736; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
1737; X32-SSE41:       # BB#0: # %entry
1738; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1739; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
1740; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
1741; X32-SSE41-NEXT:    retl
1742entry:
1743 %X = load <4 x i8>, <4 x i8>* %ptr
1744 %Y = sext <4 x i8> %X to <4 x i64>
1745 ret <4 x i64> %Y
1746}
1747
1748define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
1749; SSE2-LABEL: load_sext_8i1_to_8i16:
1750; SSE2:       # BB#0: # %entry
1751; SSE2-NEXT:    movsbq (%rdi), %rax
1752; SSE2-NEXT:    movq %rax, %rcx
1753; SSE2-NEXT:    shrq $7, %rcx
1754; SSE2-NEXT:    movd %ecx, %xmm0
1755; SSE2-NEXT:    movq %rax, %rcx
1756; SSE2-NEXT:    shlq $60, %rcx
1757; SSE2-NEXT:    sarq $63, %rcx
1758; SSE2-NEXT:    movd %ecx, %xmm2
1759; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1760; SSE2-NEXT:    movq %rax, %rcx
1761; SSE2-NEXT:    shlq $58, %rcx
1762; SSE2-NEXT:    sarq $63, %rcx
1763; SSE2-NEXT:    movd %ecx, %xmm0
1764; SSE2-NEXT:    movq %rax, %rcx
1765; SSE2-NEXT:    shlq $62, %rcx
1766; SSE2-NEXT:    sarq $63, %rcx
1767; SSE2-NEXT:    movd %ecx, %xmm1
1768; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1769; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1770; SSE2-NEXT:    movq %rax, %rcx
1771; SSE2-NEXT:    shlq $57, %rcx
1772; SSE2-NEXT:    sarq $63, %rcx
1773; SSE2-NEXT:    movd %ecx, %xmm0
1774; SSE2-NEXT:    movq %rax, %rcx
1775; SSE2-NEXT:    shlq $61, %rcx
1776; SSE2-NEXT:    sarq $63, %rcx
1777; SSE2-NEXT:    movd %ecx, %xmm2
1778; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1779; SSE2-NEXT:    movq %rax, %rcx
1780; SSE2-NEXT:    shlq $59, %rcx
1781; SSE2-NEXT:    sarq $63, %rcx
1782; SSE2-NEXT:    movd %ecx, %xmm3
1783; SSE2-NEXT:    shlq $63, %rax
1784; SSE2-NEXT:    sarq $63, %rax
1785; SSE2-NEXT:    movd %eax, %xmm0
1786; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1787; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1788; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1789; SSE2-NEXT:    retq
1790;
1791; SSSE3-LABEL: load_sext_8i1_to_8i16:
1792; SSSE3:       # BB#0: # %entry
1793; SSSE3-NEXT:    movsbq (%rdi), %rax
1794; SSSE3-NEXT:    movq %rax, %rcx
1795; SSSE3-NEXT:    shrq $7, %rcx
1796; SSSE3-NEXT:    movd %ecx, %xmm0
1797; SSSE3-NEXT:    movq %rax, %rcx
1798; SSSE3-NEXT:    shlq $60, %rcx
1799; SSSE3-NEXT:    sarq $63, %rcx
1800; SSSE3-NEXT:    movd %ecx, %xmm2
1801; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1802; SSSE3-NEXT:    movq %rax, %rcx
1803; SSSE3-NEXT:    shlq $58, %rcx
1804; SSSE3-NEXT:    sarq $63, %rcx
1805; SSSE3-NEXT:    movd %ecx, %xmm0
1806; SSSE3-NEXT:    movq %rax, %rcx
1807; SSSE3-NEXT:    shlq $62, %rcx
1808; SSSE3-NEXT:    sarq $63, %rcx
1809; SSSE3-NEXT:    movd %ecx, %xmm1
1810; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1811; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1812; SSSE3-NEXT:    movq %rax, %rcx
1813; SSSE3-NEXT:    shlq $57, %rcx
1814; SSSE3-NEXT:    sarq $63, %rcx
1815; SSSE3-NEXT:    movd %ecx, %xmm0
1816; SSSE3-NEXT:    movq %rax, %rcx
1817; SSSE3-NEXT:    shlq $61, %rcx
1818; SSSE3-NEXT:    sarq $63, %rcx
1819; SSSE3-NEXT:    movd %ecx, %xmm2
1820; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1821; SSSE3-NEXT:    movq %rax, %rcx
1822; SSSE3-NEXT:    shlq $59, %rcx
1823; SSSE3-NEXT:    sarq $63, %rcx
1824; SSSE3-NEXT:    movd %ecx, %xmm3
1825; SSSE3-NEXT:    shlq $63, %rax
1826; SSSE3-NEXT:    sarq $63, %rax
1827; SSSE3-NEXT:    movd %eax, %xmm0
1828; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1829; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1830; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1831; SSSE3-NEXT:    retq
1832;
1833; SSE41-LABEL: load_sext_8i1_to_8i16:
1834; SSE41:       # BB#0: # %entry
1835; SSE41-NEXT:    movsbq (%rdi), %rax
1836; SSE41-NEXT:    movq %rax, %rcx
1837; SSE41-NEXT:    shlq $62, %rcx
1838; SSE41-NEXT:    sarq $63, %rcx
1839; SSE41-NEXT:    movq %rax, %rdx
1840; SSE41-NEXT:    shlq $63, %rdx
1841; SSE41-NEXT:    sarq $63, %rdx
1842; SSE41-NEXT:    movd %edx, %xmm0
1843; SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
1844; SSE41-NEXT:    movq %rax, %rcx
1845; SSE41-NEXT:    shlq $61, %rcx
1846; SSE41-NEXT:    sarq $63, %rcx
1847; SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
1848; SSE41-NEXT:    movq %rax, %rcx
1849; SSE41-NEXT:    shlq $60, %rcx
1850; SSE41-NEXT:    sarq $63, %rcx
1851; SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
1852; SSE41-NEXT:    movq %rax, %rcx
1853; SSE41-NEXT:    shlq $59, %rcx
1854; SSE41-NEXT:    sarq $63, %rcx
1855; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
1856; SSE41-NEXT:    movq %rax, %rcx
1857; SSE41-NEXT:    shlq $58, %rcx
1858; SSE41-NEXT:    sarq $63, %rcx
1859; SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
1860; SSE41-NEXT:    movq %rax, %rcx
1861; SSE41-NEXT:    shlq $57, %rcx
1862; SSE41-NEXT:    sarq $63, %rcx
1863; SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
1864; SSE41-NEXT:    shrq $7, %rax
1865; SSE41-NEXT:    pinsrw $7, %eax, %xmm0
1866; SSE41-NEXT:    retq
1867;
1868; AVX1-LABEL: load_sext_8i1_to_8i16:
1869; AVX1:       # BB#0: # %entry
1870; AVX1-NEXT:    movsbq (%rdi), %rax
1871; AVX1-NEXT:    movq %rax, %rcx
1872; AVX1-NEXT:    shlq $62, %rcx
1873; AVX1-NEXT:    sarq $63, %rcx
1874; AVX1-NEXT:    movq %rax, %rdx
1875; AVX1-NEXT:    shlq $63, %rdx
1876; AVX1-NEXT:    sarq $63, %rdx
1877; AVX1-NEXT:    vmovd %edx, %xmm0
1878; AVX1-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
1879; AVX1-NEXT:    movq %rax, %rcx
1880; AVX1-NEXT:    shlq $61, %rcx
1881; AVX1-NEXT:    sarq $63, %rcx
1882; AVX1-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
1883; AVX1-NEXT:    movq %rax, %rcx
1884; AVX1-NEXT:    shlq $60, %rcx
1885; AVX1-NEXT:    sarq $63, %rcx
1886; AVX1-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
1887; AVX1-NEXT:    movq %rax, %rcx
1888; AVX1-NEXT:    shlq $59, %rcx
1889; AVX1-NEXT:    sarq $63, %rcx
1890; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1891; AVX1-NEXT:    movq %rax, %rcx
1892; AVX1-NEXT:    shlq $58, %rcx
1893; AVX1-NEXT:    sarq $63, %rcx
1894; AVX1-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
1895; AVX1-NEXT:    movq %rax, %rcx
1896; AVX1-NEXT:    shlq $57, %rcx
1897; AVX1-NEXT:    sarq $63, %rcx
1898; AVX1-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
1899; AVX1-NEXT:    shrq $7, %rax
1900; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1901; AVX1-NEXT:    retq
1902;
1903; AVX2-LABEL: load_sext_8i1_to_8i16:
1904; AVX2:       # BB#0: # %entry
1905; AVX2-NEXT:    movsbq (%rdi), %rax
1906; AVX2-NEXT:    movq %rax, %rcx
1907; AVX2-NEXT:    shlq $62, %rcx
1908; AVX2-NEXT:    sarq $63, %rcx
1909; AVX2-NEXT:    movq %rax, %rdx
1910; AVX2-NEXT:    shlq $63, %rdx
1911; AVX2-NEXT:    sarq $63, %rdx
1912; AVX2-NEXT:    vmovd %edx, %xmm0
1913; AVX2-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
1914; AVX2-NEXT:    movq %rax, %rcx
1915; AVX2-NEXT:    shlq $61, %rcx
1916; AVX2-NEXT:    sarq $63, %rcx
1917; AVX2-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
1918; AVX2-NEXT:    movq %rax, %rcx
1919; AVX2-NEXT:    shlq $60, %rcx
1920; AVX2-NEXT:    sarq $63, %rcx
1921; AVX2-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
1922; AVX2-NEXT:    movq %rax, %rcx
1923; AVX2-NEXT:    shlq $59, %rcx
1924; AVX2-NEXT:    sarq $63, %rcx
1925; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1926; AVX2-NEXT:    movq %rax, %rcx
1927; AVX2-NEXT:    shlq $58, %rcx
1928; AVX2-NEXT:    sarq $63, %rcx
1929; AVX2-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
1930; AVX2-NEXT:    movq %rax, %rcx
1931; AVX2-NEXT:    shlq $57, %rcx
1932; AVX2-NEXT:    sarq $63, %rcx
1933; AVX2-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
1934; AVX2-NEXT:    shrq $7, %rax
1935; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1936; AVX2-NEXT:    retq
1937;
1938; AVX512F-LABEL: load_sext_8i1_to_8i16:
1939; AVX512F:       # BB#0: # %entry
1940; AVX512F-NEXT:    movzbl (%rdi), %eax
1941; AVX512F-NEXT:    kmovw %eax, %k1
1942; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1943; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
1944; AVX512F-NEXT:    retq
1945;
1946; AVX512BW-LABEL: load_sext_8i1_to_8i16:
1947; AVX512BW:       # BB#0: # %entry
1948; AVX512BW-NEXT:    movzbl (%rdi), %eax
1949; AVX512BW-NEXT:    kmovd %eax, %k1
1950; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1951; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
1952; AVX512BW-NEXT:    retq
1953;
1954; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
1955; X32-SSE41:       # BB#0: # %entry
1956; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
1957; X32-SSE41-NEXT:    movsbl (%eax), %eax
1958; X32-SSE41-NEXT:    movl %eax, %ecx
1959; X32-SSE41-NEXT:    shll $30, %ecx
1960; X32-SSE41-NEXT:    sarl $31, %ecx
1961; X32-SSE41-NEXT:    movl %eax, %edx
1962; X32-SSE41-NEXT:    shll $31, %edx
1963; X32-SSE41-NEXT:    sarl $31, %edx
1964; X32-SSE41-NEXT:    movd %edx, %xmm0
1965; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm0
1966; X32-SSE41-NEXT:    movl %eax, %ecx
1967; X32-SSE41-NEXT:    shll $29, %ecx
1968; X32-SSE41-NEXT:    sarl $31, %ecx
1969; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm0
1970; X32-SSE41-NEXT:    movl %eax, %ecx
1971; X32-SSE41-NEXT:    shll $28, %ecx
1972; X32-SSE41-NEXT:    sarl $31, %ecx
1973; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
1974; X32-SSE41-NEXT:    movl %eax, %ecx
1975; X32-SSE41-NEXT:    shll $27, %ecx
1976; X32-SSE41-NEXT:    sarl $31, %ecx
1977; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
1978; X32-SSE41-NEXT:    movl %eax, %ecx
1979; X32-SSE41-NEXT:    shll $26, %ecx
1980; X32-SSE41-NEXT:    sarl $31, %ecx
1981; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm0
1982; X32-SSE41-NEXT:    movl %eax, %ecx
1983; X32-SSE41-NEXT:    shll $25, %ecx
1984; X32-SSE41-NEXT:    sarl $31, %ecx
1985; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm0
1986; X32-SSE41-NEXT:    shrl $7, %eax
1987; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm0
1988; X32-SSE41-NEXT:    retl
1989entry:
1990 %X = load <8 x i1>, <8 x i1>* %ptr
1991 %Y = sext <8 x i1> %X to <8 x i16>
1992 ret <8 x i16> %Y
1993}
1994
1995define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
1996; SSE2-LABEL: load_sext_8i8_to_8i16:
1997; SSE2:       # BB#0: # %entry
1998; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1999; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2000; SSE2-NEXT:    psraw $8, %xmm0
2001; SSE2-NEXT:    retq
2002;
2003; SSSE3-LABEL: load_sext_8i8_to_8i16:
2004; SSSE3:       # BB#0: # %entry
2005; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2006; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2007; SSSE3-NEXT:    psraw $8, %xmm0
2008; SSSE3-NEXT:    retq
2009;
2010; SSE41-LABEL: load_sext_8i8_to_8i16:
2011; SSE41:       # BB#0: # %entry
2012; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
2013; SSE41-NEXT:    retq
2014;
2015; AVX-LABEL: load_sext_8i8_to_8i16:
2016; AVX:       # BB#0: # %entry
2017; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
2018; AVX-NEXT:    retq
2019;
2020; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
2021; X32-SSE41:       # BB#0: # %entry
2022; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2023; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
2024; X32-SSE41-NEXT:    retl
2025entry:
2026 %X = load <8 x i8>, <8 x i8>* %ptr
2027 %Y = sext <8 x i8> %X to <8 x i16>
2028 ret <8 x i16> %Y
2029}
2030
2031define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
2032; SSE2-LABEL: load_sext_8i8_to_8i64:
2033; SSE2:       # BB#0: # %entry
2034; SSE2-NEXT:    movsbq 1(%rdi), %rax
2035; SSE2-NEXT:    movd %rax, %xmm1
2036; SSE2-NEXT:    movsbq (%rdi), %rax
2037; SSE2-NEXT:    movd %rax, %xmm0
2038; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2039; SSE2-NEXT:    movsbq 3(%rdi), %rax
2040; SSE2-NEXT:    movd %rax, %xmm2
2041; SSE2-NEXT:    movsbq 2(%rdi), %rax
2042; SSE2-NEXT:    movd %rax, %xmm1
2043; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2044; SSE2-NEXT:    movsbq 5(%rdi), %rax
2045; SSE2-NEXT:    movd %rax, %xmm3
2046; SSE2-NEXT:    movsbq 4(%rdi), %rax
2047; SSE2-NEXT:    movd %rax, %xmm2
2048; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2049; SSE2-NEXT:    movsbq 7(%rdi), %rax
2050; SSE2-NEXT:    movd %rax, %xmm4
2051; SSE2-NEXT:    movsbq 6(%rdi), %rax
2052; SSE2-NEXT:    movd %rax, %xmm3
2053; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2054; SSE2-NEXT:    retq
2055;
2056; SSSE3-LABEL: load_sext_8i8_to_8i64:
2057; SSSE3:       # BB#0: # %entry
2058; SSSE3-NEXT:    movsbq 1(%rdi), %rax
2059; SSSE3-NEXT:    movd %rax, %xmm1
2060; SSSE3-NEXT:    movsbq (%rdi), %rax
2061; SSSE3-NEXT:    movd %rax, %xmm0
2062; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2063; SSSE3-NEXT:    movsbq 3(%rdi), %rax
2064; SSSE3-NEXT:    movd %rax, %xmm2
2065; SSSE3-NEXT:    movsbq 2(%rdi), %rax
2066; SSSE3-NEXT:    movd %rax, %xmm1
2067; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2068; SSSE3-NEXT:    movsbq 5(%rdi), %rax
2069; SSSE3-NEXT:    movd %rax, %xmm3
2070; SSSE3-NEXT:    movsbq 4(%rdi), %rax
2071; SSSE3-NEXT:    movd %rax, %xmm2
2072; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2073; SSSE3-NEXT:    movsbq 7(%rdi), %rax
2074; SSSE3-NEXT:    movd %rax, %xmm4
2075; SSSE3-NEXT:    movsbq 6(%rdi), %rax
2076; SSSE3-NEXT:    movd %rax, %xmm3
2077; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
2078; SSSE3-NEXT:    retq
2079;
2080; SSE41-LABEL: load_sext_8i8_to_8i64:
2081; SSE41:       # BB#0: # %entry
2082; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
2083; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm1
2084; SSE41-NEXT:    pmovsxbq 4(%rdi), %xmm2
2085; SSE41-NEXT:    pmovsxbq 6(%rdi), %xmm3
2086; SSE41-NEXT:    retq
2087;
2088; AVX1-LABEL: load_sext_8i8_to_8i64:
2089; AVX1:       # BB#0: # %entry
2090; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
2091; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
2092; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2093; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
2094; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2095; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm1
2096; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
2097; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2098; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
2099; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
2100; AVX1-NEXT:    retq
2101;
2102; AVX2-LABEL: load_sext_8i8_to_8i64:
2103; AVX2:       # BB#0: # %entry
2104; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
2105; AVX2-NEXT:    vpmovsxbq 4(%rdi), %ymm1
2106; AVX2-NEXT:    retq
2107;
2108; AVX512-LABEL: load_sext_8i8_to_8i64:
2109; AVX512:       # BB#0: # %entry
2110; AVX512-NEXT:    vpmovsxbq (%rdi), %zmm0
2111; AVX512-NEXT:    retq
2112;
2113; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
2114; X32-SSE41:       # BB#0: # %entry
2115; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2116; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
2117; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm1
2118; X32-SSE41-NEXT:    pmovsxbq 4(%eax), %xmm2
2119; X32-SSE41-NEXT:    pmovsxbq 6(%eax), %xmm3
2120; X32-SSE41-NEXT:    retl
2121entry:
2122 %X = load <8 x i8>, <8 x i8>* %ptr
2123 %Y = sext <8 x i8> %X to <8 x i64>
2124 ret <8 x i64> %Y
2125}
2126
2127define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
2128; SSE2-LABEL: load_sext_8i1_to_8i32:
2129; SSE2:       # BB#0: # %entry
2130; SSE2-NEXT:    movzbl (%rdi), %eax
2131; SSE2-NEXT:    movl %eax, %ecx
2132; SSE2-NEXT:    shrl $6, %ecx
2133; SSE2-NEXT:    andl $1, %ecx
2134; SSE2-NEXT:    movd %ecx, %xmm0
2135; SSE2-NEXT:    movl %eax, %ecx
2136; SSE2-NEXT:    shrl $2, %ecx
2137; SSE2-NEXT:    andl $1, %ecx
2138; SSE2-NEXT:    movd %ecx, %xmm2
2139; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2140; SSE2-NEXT:    movl %eax, %ecx
2141; SSE2-NEXT:    andl $1, %ecx
2142; SSE2-NEXT:    movd %ecx, %xmm1
2143; SSE2-NEXT:    movl %eax, %ecx
2144; SSE2-NEXT:    shrl $4, %ecx
2145; SSE2-NEXT:    andl $1, %ecx
2146; SSE2-NEXT:    movd %ecx, %xmm0
2147; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2148; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2149; SSE2-NEXT:    movl %eax, %ecx
2150; SSE2-NEXT:    shrl $5, %ecx
2151; SSE2-NEXT:    andl $1, %ecx
2152; SSE2-NEXT:    movd %ecx, %xmm0
2153; SSE2-NEXT:    movl %eax, %ecx
2154; SSE2-NEXT:    shrl %ecx
2155; SSE2-NEXT:    andl $1, %ecx
2156; SSE2-NEXT:    movd %ecx, %xmm2
2157; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2158; SSE2-NEXT:    movl %eax, %ecx
2159; SSE2-NEXT:    shrl $3, %ecx
2160; SSE2-NEXT:    andl $1, %ecx
2161; SSE2-NEXT:    movd %ecx, %xmm0
2162; SSE2-NEXT:    shrl $7, %eax
2163; SSE2-NEXT:    movzwl %ax, %eax
2164; SSE2-NEXT:    movd %eax, %xmm3
2165; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2166; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2167; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2168; SSE2-NEXT:    movdqa %xmm1, %xmm0
2169; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2170; SSE2-NEXT:    pslld $31, %xmm0
2171; SSE2-NEXT:    psrad $31, %xmm0
2172; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2173; SSE2-NEXT:    pslld $31, %xmm1
2174; SSE2-NEXT:    psrad $31, %xmm1
2175; SSE2-NEXT:    retq
2176;
2177; SSSE3-LABEL: load_sext_8i1_to_8i32:
2178; SSSE3:       # BB#0: # %entry
2179; SSSE3-NEXT:    movzbl (%rdi), %eax
2180; SSSE3-NEXT:    movl %eax, %ecx
2181; SSSE3-NEXT:    shrl $6, %ecx
2182; SSSE3-NEXT:    andl $1, %ecx
2183; SSSE3-NEXT:    movd %ecx, %xmm0
2184; SSSE3-NEXT:    movl %eax, %ecx
2185; SSSE3-NEXT:    shrl $2, %ecx
2186; SSSE3-NEXT:    andl $1, %ecx
2187; SSSE3-NEXT:    movd %ecx, %xmm2
2188; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2189; SSSE3-NEXT:    movl %eax, %ecx
2190; SSSE3-NEXT:    andl $1, %ecx
2191; SSSE3-NEXT:    movd %ecx, %xmm1
2192; SSSE3-NEXT:    movl %eax, %ecx
2193; SSSE3-NEXT:    shrl $4, %ecx
2194; SSSE3-NEXT:    andl $1, %ecx
2195; SSSE3-NEXT:    movd %ecx, %xmm0
2196; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2197; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2198; SSSE3-NEXT:    movl %eax, %ecx
2199; SSSE3-NEXT:    shrl $5, %ecx
2200; SSSE3-NEXT:    andl $1, %ecx
2201; SSSE3-NEXT:    movd %ecx, %xmm0
2202; SSSE3-NEXT:    movl %eax, %ecx
2203; SSSE3-NEXT:    shrl %ecx
2204; SSSE3-NEXT:    andl $1, %ecx
2205; SSSE3-NEXT:    movd %ecx, %xmm2
2206; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2207; SSSE3-NEXT:    movl %eax, %ecx
2208; SSSE3-NEXT:    shrl $3, %ecx
2209; SSSE3-NEXT:    andl $1, %ecx
2210; SSSE3-NEXT:    movd %ecx, %xmm0
2211; SSSE3-NEXT:    shrl $7, %eax
2212; SSSE3-NEXT:    movzwl %ax, %eax
2213; SSSE3-NEXT:    movd %eax, %xmm3
2214; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2215; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2216; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2217; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2218; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2219; SSSE3-NEXT:    pslld $31, %xmm0
2220; SSSE3-NEXT:    psrad $31, %xmm0
2221; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2222; SSSE3-NEXT:    pslld $31, %xmm1
2223; SSSE3-NEXT:    psrad $31, %xmm1
2224; SSSE3-NEXT:    retq
2225;
2226; SSE41-LABEL: load_sext_8i1_to_8i32:
2227; SSE41:       # BB#0: # %entry
2228; SSE41-NEXT:    movzbl (%rdi), %eax
2229; SSE41-NEXT:    movl %eax, %ecx
2230; SSE41-NEXT:    shrl %ecx
2231; SSE41-NEXT:    andl $1, %ecx
2232; SSE41-NEXT:    movl %eax, %edx
2233; SSE41-NEXT:    andl $1, %edx
2234; SSE41-NEXT:    movd %edx, %xmm1
2235; SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
2236; SSE41-NEXT:    movl %eax, %ecx
2237; SSE41-NEXT:    shrl $2, %ecx
2238; SSE41-NEXT:    andl $1, %ecx
2239; SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
2240; SSE41-NEXT:    movl %eax, %ecx
2241; SSE41-NEXT:    shrl $3, %ecx
2242; SSE41-NEXT:    andl $1, %ecx
2243; SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
2244; SSE41-NEXT:    movl %eax, %ecx
2245; SSE41-NEXT:    shrl $4, %ecx
2246; SSE41-NEXT:    andl $1, %ecx
2247; SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
2248; SSE41-NEXT:    movl %eax, %ecx
2249; SSE41-NEXT:    shrl $5, %ecx
2250; SSE41-NEXT:    andl $1, %ecx
2251; SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
2252; SSE41-NEXT:    movl %eax, %ecx
2253; SSE41-NEXT:    shrl $6, %ecx
2254; SSE41-NEXT:    andl $1, %ecx
2255; SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
2256; SSE41-NEXT:    shrl $7, %eax
2257; SSE41-NEXT:    movzwl %ax, %eax
2258; SSE41-NEXT:    pinsrw $7, %eax, %xmm1
2259; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2260; SSE41-NEXT:    pslld $31, %xmm0
2261; SSE41-NEXT:    psrad $31, %xmm0
2262; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2263; SSE41-NEXT:    pslld $31, %xmm1
2264; SSE41-NEXT:    psrad $31, %xmm1
2265; SSE41-NEXT:    retq
2266;
2267; AVX1-LABEL: load_sext_8i1_to_8i32:
2268; AVX1:       # BB#0: # %entry
2269; AVX1-NEXT:    movsbq (%rdi), %rax
2270; AVX1-NEXT:    movq %rax, %rcx
2271; AVX1-NEXT:    shlq $58, %rcx
2272; AVX1-NEXT:    sarq $63, %rcx
2273; AVX1-NEXT:    movq %rax, %rdx
2274; AVX1-NEXT:    shlq $59, %rdx
2275; AVX1-NEXT:    sarq $63, %rdx
2276; AVX1-NEXT:    vmovd %edx, %xmm0
2277; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
2278; AVX1-NEXT:    movq %rax, %rcx
2279; AVX1-NEXT:    shlq $57, %rcx
2280; AVX1-NEXT:    sarq $63, %rcx
2281; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
2282; AVX1-NEXT:    movq %rax, %rcx
2283; AVX1-NEXT:    shrq $7, %rcx
2284; AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
2285; AVX1-NEXT:    movq %rax, %rcx
2286; AVX1-NEXT:    shlq $62, %rcx
2287; AVX1-NEXT:    sarq $63, %rcx
2288; AVX1-NEXT:    movq %rax, %rdx
2289; AVX1-NEXT:    shlq $63, %rdx
2290; AVX1-NEXT:    sarq $63, %rdx
2291; AVX1-NEXT:    vmovd %edx, %xmm1
2292; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
2293; AVX1-NEXT:    movq %rax, %rcx
2294; AVX1-NEXT:    shlq $61, %rcx
2295; AVX1-NEXT:    sarq $63, %rcx
2296; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
2297; AVX1-NEXT:    shlq $60, %rax
2298; AVX1-NEXT:    sarq $63, %rax
2299; AVX1-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
2300; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2301; AVX1-NEXT:    retq
2302;
2303; AVX2-LABEL: load_sext_8i1_to_8i32:
2304; AVX2:       # BB#0: # %entry
2305; AVX2-NEXT:    movsbq (%rdi), %rax
2306; AVX2-NEXT:    movq %rax, %rcx
2307; AVX2-NEXT:    shlq $58, %rcx
2308; AVX2-NEXT:    sarq $63, %rcx
2309; AVX2-NEXT:    movq %rax, %rdx
2310; AVX2-NEXT:    shlq $59, %rdx
2311; AVX2-NEXT:    sarq $63, %rdx
2312; AVX2-NEXT:    vmovd %edx, %xmm0
2313; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
2314; AVX2-NEXT:    movq %rax, %rcx
2315; AVX2-NEXT:    shlq $57, %rcx
2316; AVX2-NEXT:    sarq $63, %rcx
2317; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
2318; AVX2-NEXT:    movq %rax, %rcx
2319; AVX2-NEXT:    shrq $7, %rcx
2320; AVX2-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
2321; AVX2-NEXT:    movq %rax, %rcx
2322; AVX2-NEXT:    shlq $62, %rcx
2323; AVX2-NEXT:    sarq $63, %rcx
2324; AVX2-NEXT:    movq %rax, %rdx
2325; AVX2-NEXT:    shlq $63, %rdx
2326; AVX2-NEXT:    sarq $63, %rdx
2327; AVX2-NEXT:    vmovd %edx, %xmm1
2328; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
2329; AVX2-NEXT:    movq %rax, %rcx
2330; AVX2-NEXT:    shlq $61, %rcx
2331; AVX2-NEXT:    sarq $63, %rcx
2332; AVX2-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
2333; AVX2-NEXT:    shlq $60, %rax
2334; AVX2-NEXT:    sarq $63, %rax
2335; AVX2-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
2336; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
2337; AVX2-NEXT:    retq
2338;
2339; AVX512F-LABEL: load_sext_8i1_to_8i32:
2340; AVX512F:       # BB#0: # %entry
2341; AVX512F-NEXT:    movzbl (%rdi), %eax
2342; AVX512F-NEXT:    kmovw %eax, %k1
2343; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2344; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
2345; AVX512F-NEXT:    retq
2346;
2347; AVX512BW-LABEL: load_sext_8i1_to_8i32:
2348; AVX512BW:       # BB#0: # %entry
2349; AVX512BW-NEXT:    movzbl (%rdi), %eax
2350; AVX512BW-NEXT:    kmovd %eax, %k1
2351; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2352; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
2353; AVX512BW-NEXT:    retq
2354;
2355; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
2356; X32-SSE41:       # BB#0: # %entry
2357; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2358; X32-SSE41-NEXT:    movzbl (%eax), %eax
2359; X32-SSE41-NEXT:    movl %eax, %ecx
2360; X32-SSE41-NEXT:    shrl %ecx
2361; X32-SSE41-NEXT:    andl $1, %ecx
2362; X32-SSE41-NEXT:    movl %eax, %edx
2363; X32-SSE41-NEXT:    andl $1, %edx
2364; X32-SSE41-NEXT:    movd %edx, %xmm1
2365; X32-SSE41-NEXT:    pinsrw $1, %ecx, %xmm1
2366; X32-SSE41-NEXT:    movl %eax, %ecx
2367; X32-SSE41-NEXT:    shrl $2, %ecx
2368; X32-SSE41-NEXT:    andl $1, %ecx
2369; X32-SSE41-NEXT:    pinsrw $2, %ecx, %xmm1
2370; X32-SSE41-NEXT:    movl %eax, %ecx
2371; X32-SSE41-NEXT:    shrl $3, %ecx
2372; X32-SSE41-NEXT:    andl $1, %ecx
2373; X32-SSE41-NEXT:    pinsrw $3, %ecx, %xmm1
2374; X32-SSE41-NEXT:    movl %eax, %ecx
2375; X32-SSE41-NEXT:    shrl $4, %ecx
2376; X32-SSE41-NEXT:    andl $1, %ecx
2377; X32-SSE41-NEXT:    pinsrw $4, %ecx, %xmm1
2378; X32-SSE41-NEXT:    movl %eax, %ecx
2379; X32-SSE41-NEXT:    shrl $5, %ecx
2380; X32-SSE41-NEXT:    andl $1, %ecx
2381; X32-SSE41-NEXT:    pinsrw $5, %ecx, %xmm1
2382; X32-SSE41-NEXT:    movl %eax, %ecx
2383; X32-SSE41-NEXT:    shrl $6, %ecx
2384; X32-SSE41-NEXT:    andl $1, %ecx
2385; X32-SSE41-NEXT:    pinsrw $6, %ecx, %xmm1
2386; X32-SSE41-NEXT:    shrl $7, %eax
2387; X32-SSE41-NEXT:    pinsrw $7, %eax, %xmm1
2388; X32-SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2389; X32-SSE41-NEXT:    pslld $31, %xmm0
2390; X32-SSE41-NEXT:    psrad $31, %xmm0
2391; X32-SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2392; X32-SSE41-NEXT:    pslld $31, %xmm1
2393; X32-SSE41-NEXT:    psrad $31, %xmm1
2394; X32-SSE41-NEXT:    retl
2395entry:
2396 %X = load <8 x i1>, <8 x i1>* %ptr
2397 %Y = sext <8 x i1> %X to <8 x i32>
2398 ret <8 x i32> %Y
2399}
2400
2401define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
2402; SSE2-LABEL: load_sext_8i8_to_8i32:
2403; SSE2:       # BB#0: # %entry
2404; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2405; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2406; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2407; SSE2-NEXT:    psrad $24, %xmm0
2408; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2409; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2410; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2411; SSE2-NEXT:    psrad $24, %xmm1
2412; SSE2-NEXT:    retq
2413;
2414; SSSE3-LABEL: load_sext_8i8_to_8i32:
2415; SSSE3:       # BB#0: # %entry
2416; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2417; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2418; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2419; SSSE3-NEXT:    psrad $24, %xmm0
2420; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2421; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2422; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2423; SSSE3-NEXT:    psrad $24, %xmm1
2424; SSSE3-NEXT:    retq
2425;
2426; SSE41-LABEL: load_sext_8i8_to_8i32:
2427; SSE41:       # BB#0: # %entry
2428; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
2429; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
2430; SSE41-NEXT:    retq
2431;
2432; AVX1-LABEL: load_sext_8i8_to_8i32:
2433; AVX1:       # BB#0: # %entry
2434; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
2435; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
2436; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2437; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
2438; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2439; AVX1-NEXT:    retq
2440;
2441; AVX2-LABEL: load_sext_8i8_to_8i32:
2442; AVX2:       # BB#0: # %entry
2443; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
2444; AVX2-NEXT:    retq
2445;
2446; AVX512-LABEL: load_sext_8i8_to_8i32:
2447; AVX512:       # BB#0: # %entry
2448; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
2449; AVX512-NEXT:    retq
2450;
2451; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
2452; X32-SSE41:       # BB#0: # %entry
2453; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2454; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
2455; X32-SSE41-NEXT:    pmovsxbd 4(%eax), %xmm1
2456; X32-SSE41-NEXT:    retl
2457entry:
2458 %X = load <8 x i8>, <8 x i8>* %ptr
2459 %Y = sext <8 x i8> %X to <8 x i32>
2460 ret <8 x i32> %Y
2461}
2462
2463define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
2464; SSE2-LABEL: load_sext_16i1_to_16i8:
2465; SSE2:       # BB#0: # %entry
2466; SSE2-NEXT:    pushq %rbp
2467; SSE2-NEXT:    pushq %r15
2468; SSE2-NEXT:    pushq %r14
2469; SSE2-NEXT:    pushq %r13
2470; SSE2-NEXT:    pushq %r12
2471; SSE2-NEXT:    pushq %rbx
2472; SSE2-NEXT:    movswq (%rdi), %rax
2473; SSE2-NEXT:    movq %rax, %r8
2474; SSE2-NEXT:    movq %rax, %r9
2475; SSE2-NEXT:    movq %rax, %r10
2476; SSE2-NEXT:    movq %rax, %r11
2477; SSE2-NEXT:    movq %rax, %r14
2478; SSE2-NEXT:    movq %rax, %r15
2479; SSE2-NEXT:    movq %rax, %r12
2480; SSE2-NEXT:    movq %rax, %r13
2481; SSE2-NEXT:    movq %rax, %rbx
2482; SSE2-NEXT:    movq %rax, %rcx
2483; SSE2-NEXT:    movq %rax, %rdx
2484; SSE2-NEXT:    movq %rax, %rsi
2485; SSE2-NEXT:    movq %rax, %rdi
2486; SSE2-NEXT:    movq %rax, %rbp
2487; SSE2-NEXT:    shlq $49, %rbp
2488; SSE2-NEXT:    sarq $63, %rbp
2489; SSE2-NEXT:    movd %ebp, %xmm0
2490; SSE2-NEXT:    movq %rax, %rbp
2491; SSE2-NEXT:    movsbq %al, %rax
2492; SSE2-NEXT:    shlq $57, %r8
2493; SSE2-NEXT:    sarq $63, %r8
2494; SSE2-NEXT:    movd %r8d, %xmm1
2495; SSE2-NEXT:    shlq $53, %r9
2496; SSE2-NEXT:    sarq $63, %r9
2497; SSE2-NEXT:    movd %r9d, %xmm2
2498; SSE2-NEXT:    shlq $61, %r10
2499; SSE2-NEXT:    sarq $63, %r10
2500; SSE2-NEXT:    movd %r10d, %xmm3
2501; SSE2-NEXT:    shlq $51, %r11
2502; SSE2-NEXT:    sarq $63, %r11
2503; SSE2-NEXT:    movd %r11d, %xmm4
2504; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2505; SSE2-NEXT:    shlq $59, %r14
2506; SSE2-NEXT:    sarq $63, %r14
2507; SSE2-NEXT:    movd %r14d, %xmm5
2508; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2509; SSE2-NEXT:    shlq $55, %r15
2510; SSE2-NEXT:    sarq $63, %r15
2511; SSE2-NEXT:    movd %r15d, %xmm2
2512; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2513; SSE2-NEXT:    shlq $63, %r12
2514; SSE2-NEXT:    sarq $63, %r12
2515; SSE2-NEXT:    movd %r12d, %xmm0
2516; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2517; SSE2-NEXT:    shlq $50, %r13
2518; SSE2-NEXT:    sarq $63, %r13
2519; SSE2-NEXT:    movd %r13d, %xmm1
2520; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2521; SSE2-NEXT:    shlq $58, %rbx
2522; SSE2-NEXT:    sarq $63, %rbx
2523; SSE2-NEXT:    movd %ebx, %xmm2
2524; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2525; SSE2-NEXT:    shlq $54, %rcx
2526; SSE2-NEXT:    sarq $63, %rcx
2527; SSE2-NEXT:    movd %ecx, %xmm4
2528; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2529; SSE2-NEXT:    shlq $62, %rdx
2530; SSE2-NEXT:    sarq $63, %rdx
2531; SSE2-NEXT:    movd %edx, %xmm3
2532; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2533; SSE2-NEXT:    shlq $52, %rsi
2534; SSE2-NEXT:    sarq $63, %rsi
2535; SSE2-NEXT:    movd %esi, %xmm1
2536; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2537; SSE2-NEXT:    shlq $60, %rdi
2538; SSE2-NEXT:    sarq $63, %rdi
2539; SSE2-NEXT:    movd %edi, %xmm4
2540; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2541; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
2542; SSE2-NEXT:    shrq $15, %rbp
2543; SSE2-NEXT:    movd %ebp, %xmm1
2544; SSE2-NEXT:    shrq $7, %rax
2545; SSE2-NEXT:    movd %eax, %xmm2
2546; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2547; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2548; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2549; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2550; SSE2-NEXT:    popq %rbx
2551; SSE2-NEXT:    popq %r12
2552; SSE2-NEXT:    popq %r13
2553; SSE2-NEXT:    popq %r14
2554; SSE2-NEXT:    popq %r15
2555; SSE2-NEXT:    popq %rbp
2556; SSE2-NEXT:    retq
2557;
2558; SSSE3-LABEL: load_sext_16i1_to_16i8:
2559; SSSE3:       # BB#0: # %entry
2560; SSSE3-NEXT:    pushq %rbp
2561; SSSE3-NEXT:    pushq %r15
2562; SSSE3-NEXT:    pushq %r14
2563; SSSE3-NEXT:    pushq %r13
2564; SSSE3-NEXT:    pushq %r12
2565; SSSE3-NEXT:    pushq %rbx
2566; SSSE3-NEXT:    movswq (%rdi), %rax
2567; SSSE3-NEXT:    movq %rax, %r8
2568; SSSE3-NEXT:    movq %rax, %r9
2569; SSSE3-NEXT:    movq %rax, %r10
2570; SSSE3-NEXT:    movq %rax, %r11
2571; SSSE3-NEXT:    movq %rax, %r14
2572; SSSE3-NEXT:    movq %rax, %r15
2573; SSSE3-NEXT:    movq %rax, %r12
2574; SSSE3-NEXT:    movq %rax, %r13
2575; SSSE3-NEXT:    movq %rax, %rbx
2576; SSSE3-NEXT:    movq %rax, %rcx
2577; SSSE3-NEXT:    movq %rax, %rdx
2578; SSSE3-NEXT:    movq %rax, %rsi
2579; SSSE3-NEXT:    movq %rax, %rdi
2580; SSSE3-NEXT:    movq %rax, %rbp
2581; SSSE3-NEXT:    shlq $49, %rbp
2582; SSSE3-NEXT:    sarq $63, %rbp
2583; SSSE3-NEXT:    movd %ebp, %xmm0
2584; SSSE3-NEXT:    movq %rax, %rbp
2585; SSSE3-NEXT:    movsbq %al, %rax
2586; SSSE3-NEXT:    shlq $57, %r8
2587; SSSE3-NEXT:    sarq $63, %r8
2588; SSSE3-NEXT:    movd %r8d, %xmm1
2589; SSSE3-NEXT:    shlq $53, %r9
2590; SSSE3-NEXT:    sarq $63, %r9
2591; SSSE3-NEXT:    movd %r9d, %xmm2
2592; SSSE3-NEXT:    shlq $61, %r10
2593; SSSE3-NEXT:    sarq $63, %r10
2594; SSSE3-NEXT:    movd %r10d, %xmm3
2595; SSSE3-NEXT:    shlq $51, %r11
2596; SSSE3-NEXT:    sarq $63, %r11
2597; SSSE3-NEXT:    movd %r11d, %xmm4
2598; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2599; SSSE3-NEXT:    shlq $59, %r14
2600; SSSE3-NEXT:    sarq $63, %r14
2601; SSSE3-NEXT:    movd %r14d, %xmm5
2602; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2603; SSSE3-NEXT:    shlq $55, %r15
2604; SSSE3-NEXT:    sarq $63, %r15
2605; SSSE3-NEXT:    movd %r15d, %xmm2
2606; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2607; SSSE3-NEXT:    shlq $63, %r12
2608; SSSE3-NEXT:    sarq $63, %r12
2609; SSSE3-NEXT:    movd %r12d, %xmm0
2610; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2611; SSSE3-NEXT:    shlq $50, %r13
2612; SSSE3-NEXT:    sarq $63, %r13
2613; SSSE3-NEXT:    movd %r13d, %xmm1
2614; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2615; SSSE3-NEXT:    shlq $58, %rbx
2616; SSSE3-NEXT:    sarq $63, %rbx
2617; SSSE3-NEXT:    movd %ebx, %xmm2
2618; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2619; SSSE3-NEXT:    shlq $54, %rcx
2620; SSSE3-NEXT:    sarq $63, %rcx
2621; SSSE3-NEXT:    movd %ecx, %xmm4
2622; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2623; SSSE3-NEXT:    shlq $62, %rdx
2624; SSSE3-NEXT:    sarq $63, %rdx
2625; SSSE3-NEXT:    movd %edx, %xmm3
2626; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2627; SSSE3-NEXT:    shlq $52, %rsi
2628; SSSE3-NEXT:    sarq $63, %rsi
2629; SSSE3-NEXT:    movd %esi, %xmm1
2630; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2631; SSSE3-NEXT:    shlq $60, %rdi
2632; SSSE3-NEXT:    sarq $63, %rdi
2633; SSSE3-NEXT:    movd %edi, %xmm4
2634; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2635; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
2636; SSSE3-NEXT:    shrq $15, %rbp
2637; SSSE3-NEXT:    movd %ebp, %xmm1
2638; SSSE3-NEXT:    shrq $7, %rax
2639; SSSE3-NEXT:    movd %eax, %xmm2
2640; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2641; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2642; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2643; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2644; SSSE3-NEXT:    popq %rbx
2645; SSSE3-NEXT:    popq %r12
2646; SSSE3-NEXT:    popq %r13
2647; SSSE3-NEXT:    popq %r14
2648; SSSE3-NEXT:    popq %r15
2649; SSSE3-NEXT:    popq %rbp
2650; SSSE3-NEXT:    retq
2651;
2652; SSE41-LABEL: load_sext_16i1_to_16i8:
2653; SSE41:       # BB#0: # %entry
2654; SSE41-NEXT:    movswq (%rdi), %rax
2655; SSE41-NEXT:    movq %rax, %rcx
2656; SSE41-NEXT:    shlq $62, %rcx
2657; SSE41-NEXT:    sarq $63, %rcx
2658; SSE41-NEXT:    movq %rax, %rdx
2659; SSE41-NEXT:    shlq $63, %rdx
2660; SSE41-NEXT:    sarq $63, %rdx
2661; SSE41-NEXT:    movd %edx, %xmm0
2662; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
2663; SSE41-NEXT:    movq %rax, %rcx
2664; SSE41-NEXT:    shlq $61, %rcx
2665; SSE41-NEXT:    sarq $63, %rcx
2666; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
2667; SSE41-NEXT:    movq %rax, %rcx
2668; SSE41-NEXT:    shlq $60, %rcx
2669; SSE41-NEXT:    sarq $63, %rcx
2670; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
2671; SSE41-NEXT:    movq %rax, %rcx
2672; SSE41-NEXT:    shlq $59, %rcx
2673; SSE41-NEXT:    sarq $63, %rcx
2674; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
2675; SSE41-NEXT:    movq %rax, %rcx
2676; SSE41-NEXT:    shlq $58, %rcx
2677; SSE41-NEXT:    sarq $63, %rcx
2678; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
2679; SSE41-NEXT:    movq %rax, %rcx
2680; SSE41-NEXT:    shlq $57, %rcx
2681; SSE41-NEXT:    sarq $63, %rcx
2682; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
2683; SSE41-NEXT:    movsbq %al, %rcx
2684; SSE41-NEXT:    shrq $7, %rcx
2685; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
2686; SSE41-NEXT:    movq %rax, %rcx
2687; SSE41-NEXT:    shlq $55, %rcx
2688; SSE41-NEXT:    sarq $63, %rcx
2689; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
2690; SSE41-NEXT:    movq %rax, %rcx
2691; SSE41-NEXT:    shlq $54, %rcx
2692; SSE41-NEXT:    sarq $63, %rcx
2693; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
2694; SSE41-NEXT:    movq %rax, %rcx
2695; SSE41-NEXT:    shlq $53, %rcx
2696; SSE41-NEXT:    sarq $63, %rcx
2697; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
2698; SSE41-NEXT:    movq %rax, %rcx
2699; SSE41-NEXT:    shlq $52, %rcx
2700; SSE41-NEXT:    sarq $63, %rcx
2701; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
2702; SSE41-NEXT:    movq %rax, %rcx
2703; SSE41-NEXT:    shlq $51, %rcx
2704; SSE41-NEXT:    sarq $63, %rcx
2705; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
2706; SSE41-NEXT:    movq %rax, %rcx
2707; SSE41-NEXT:    shlq $50, %rcx
2708; SSE41-NEXT:    sarq $63, %rcx
2709; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
2710; SSE41-NEXT:    movq %rax, %rcx
2711; SSE41-NEXT:    shlq $49, %rcx
2712; SSE41-NEXT:    sarq $63, %rcx
2713; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
2714; SSE41-NEXT:    shrq $15, %rax
2715; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
2716; SSE41-NEXT:    retq
2717;
2718; AVX1-LABEL: load_sext_16i1_to_16i8:
2719; AVX1:       # BB#0: # %entry
2720; AVX1-NEXT:    movswq (%rdi), %rax
2721; AVX1-NEXT:    movq %rax, %rcx
2722; AVX1-NEXT:    shlq $62, %rcx
2723; AVX1-NEXT:    sarq $63, %rcx
2724; AVX1-NEXT:    movq %rax, %rdx
2725; AVX1-NEXT:    shlq $63, %rdx
2726; AVX1-NEXT:    sarq $63, %rdx
2727; AVX1-NEXT:    vmovd %edx, %xmm0
2728; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
2729; AVX1-NEXT:    movq %rax, %rcx
2730; AVX1-NEXT:    shlq $61, %rcx
2731; AVX1-NEXT:    sarq $63, %rcx
2732; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
2733; AVX1-NEXT:    movq %rax, %rcx
2734; AVX1-NEXT:    shlq $60, %rcx
2735; AVX1-NEXT:    sarq $63, %rcx
2736; AVX1-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
2737; AVX1-NEXT:    movq %rax, %rcx
2738; AVX1-NEXT:    shlq $59, %rcx
2739; AVX1-NEXT:    sarq $63, %rcx
2740; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
2741; AVX1-NEXT:    movq %rax, %rcx
2742; AVX1-NEXT:    shlq $58, %rcx
2743; AVX1-NEXT:    sarq $63, %rcx
2744; AVX1-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
2745; AVX1-NEXT:    movq %rax, %rcx
2746; AVX1-NEXT:    shlq $57, %rcx
2747; AVX1-NEXT:    sarq $63, %rcx
2748; AVX1-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
2749; AVX1-NEXT:    movsbq %al, %rcx
2750; AVX1-NEXT:    shrq $7, %rcx
2751; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
2752; AVX1-NEXT:    movq %rax, %rcx
2753; AVX1-NEXT:    shlq $55, %rcx
2754; AVX1-NEXT:    sarq $63, %rcx
2755; AVX1-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
2756; AVX1-NEXT:    movq %rax, %rcx
2757; AVX1-NEXT:    shlq $54, %rcx
2758; AVX1-NEXT:    sarq $63, %rcx
2759; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
2760; AVX1-NEXT:    movq %rax, %rcx
2761; AVX1-NEXT:    shlq $53, %rcx
2762; AVX1-NEXT:    sarq $63, %rcx
2763; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
2764; AVX1-NEXT:    movq %rax, %rcx
2765; AVX1-NEXT:    shlq $52, %rcx
2766; AVX1-NEXT:    sarq $63, %rcx
2767; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
2768; AVX1-NEXT:    movq %rax, %rcx
2769; AVX1-NEXT:    shlq $51, %rcx
2770; AVX1-NEXT:    sarq $63, %rcx
2771; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
2772; AVX1-NEXT:    movq %rax, %rcx
2773; AVX1-NEXT:    shlq $50, %rcx
2774; AVX1-NEXT:    sarq $63, %rcx
2775; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
2776; AVX1-NEXT:    movq %rax, %rcx
2777; AVX1-NEXT:    shlq $49, %rcx
2778; AVX1-NEXT:    sarq $63, %rcx
2779; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
2780; AVX1-NEXT:    shrq $15, %rax
2781; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2782; AVX1-NEXT:    retq
2783;
2784; AVX2-LABEL: load_sext_16i1_to_16i8:
2785; AVX2:       # BB#0: # %entry
2786; AVX2-NEXT:    movswq (%rdi), %rax
2787; AVX2-NEXT:    movq %rax, %rcx
2788; AVX2-NEXT:    shlq $62, %rcx
2789; AVX2-NEXT:    sarq $63, %rcx
2790; AVX2-NEXT:    movq %rax, %rdx
2791; AVX2-NEXT:    shlq $63, %rdx
2792; AVX2-NEXT:    sarq $63, %rdx
2793; AVX2-NEXT:    vmovd %edx, %xmm0
2794; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
2795; AVX2-NEXT:    movq %rax, %rcx
2796; AVX2-NEXT:    shlq $61, %rcx
2797; AVX2-NEXT:    sarq $63, %rcx
2798; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
2799; AVX2-NEXT:    movq %rax, %rcx
2800; AVX2-NEXT:    shlq $60, %rcx
2801; AVX2-NEXT:    sarq $63, %rcx
2802; AVX2-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
2803; AVX2-NEXT:    movq %rax, %rcx
2804; AVX2-NEXT:    shlq $59, %rcx
2805; AVX2-NEXT:    sarq $63, %rcx
2806; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
2807; AVX2-NEXT:    movq %rax, %rcx
2808; AVX2-NEXT:    shlq $58, %rcx
2809; AVX2-NEXT:    sarq $63, %rcx
2810; AVX2-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
2811; AVX2-NEXT:    movq %rax, %rcx
2812; AVX2-NEXT:    shlq $57, %rcx
2813; AVX2-NEXT:    sarq $63, %rcx
2814; AVX2-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
2815; AVX2-NEXT:    movsbq %al, %rcx
2816; AVX2-NEXT:    shrq $7, %rcx
2817; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
2818; AVX2-NEXT:    movq %rax, %rcx
2819; AVX2-NEXT:    shlq $55, %rcx
2820; AVX2-NEXT:    sarq $63, %rcx
2821; AVX2-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
2822; AVX2-NEXT:    movq %rax, %rcx
2823; AVX2-NEXT:    shlq $54, %rcx
2824; AVX2-NEXT:    sarq $63, %rcx
2825; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
2826; AVX2-NEXT:    movq %rax, %rcx
2827; AVX2-NEXT:    shlq $53, %rcx
2828; AVX2-NEXT:    sarq $63, %rcx
2829; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
2830; AVX2-NEXT:    movq %rax, %rcx
2831; AVX2-NEXT:    shlq $52, %rcx
2832; AVX2-NEXT:    sarq $63, %rcx
2833; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
2834; AVX2-NEXT:    movq %rax, %rcx
2835; AVX2-NEXT:    shlq $51, %rcx
2836; AVX2-NEXT:    sarq $63, %rcx
2837; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
2838; AVX2-NEXT:    movq %rax, %rcx
2839; AVX2-NEXT:    shlq $50, %rcx
2840; AVX2-NEXT:    sarq $63, %rcx
2841; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
2842; AVX2-NEXT:    movq %rax, %rcx
2843; AVX2-NEXT:    shlq $49, %rcx
2844; AVX2-NEXT:    sarq $63, %rcx
2845; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
2846; AVX2-NEXT:    shrq $15, %rax
2847; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2848; AVX2-NEXT:    retq
2849;
2850; AVX512-LABEL: load_sext_16i1_to_16i8:
2851; AVX512:       # BB#0: # %entry
2852; AVX512-NEXT:    kmovw (%rdi), %k1
2853; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2854; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2855; AVX512-NEXT:    retq
2856;
2857; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
2858; X32-SSE41:       # BB#0: # %entry
2859; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
2860; X32-SSE41-NEXT:    movswl (%eax), %eax
2861; X32-SSE41-NEXT:    movl %eax, %ecx
2862; X32-SSE41-NEXT:    shll $30, %ecx
2863; X32-SSE41-NEXT:    sarl $31, %ecx
2864; X32-SSE41-NEXT:    movl %eax, %edx
2865; X32-SSE41-NEXT:    shll $31, %edx
2866; X32-SSE41-NEXT:    sarl $31, %edx
2867; X32-SSE41-NEXT:    movd %edx, %xmm0
2868; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
2869; X32-SSE41-NEXT:    movl %eax, %ecx
2870; X32-SSE41-NEXT:    shll $29, %ecx
2871; X32-SSE41-NEXT:    sarl $31, %ecx
2872; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
2873; X32-SSE41-NEXT:    movl %eax, %ecx
2874; X32-SSE41-NEXT:    shll $28, %ecx
2875; X32-SSE41-NEXT:    sarl $31, %ecx
2876; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
2877; X32-SSE41-NEXT:    movl %eax, %ecx
2878; X32-SSE41-NEXT:    shll $27, %ecx
2879; X32-SSE41-NEXT:    sarl $31, %ecx
2880; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
2881; X32-SSE41-NEXT:    movl %eax, %ecx
2882; X32-SSE41-NEXT:    shll $26, %ecx
2883; X32-SSE41-NEXT:    sarl $31, %ecx
2884; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
2885; X32-SSE41-NEXT:    movl %eax, %ecx
2886; X32-SSE41-NEXT:    shll $25, %ecx
2887; X32-SSE41-NEXT:    sarl $31, %ecx
2888; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
2889; X32-SSE41-NEXT:    movsbl %al, %ecx
2890; X32-SSE41-NEXT:    shrl $7, %ecx
2891; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
2892; X32-SSE41-NEXT:    movl %eax, %ecx
2893; X32-SSE41-NEXT:    shll $23, %ecx
2894; X32-SSE41-NEXT:    sarl $31, %ecx
2895; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
2896; X32-SSE41-NEXT:    movl %eax, %ecx
2897; X32-SSE41-NEXT:    shll $22, %ecx
2898; X32-SSE41-NEXT:    sarl $31, %ecx
2899; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
2900; X32-SSE41-NEXT:    movl %eax, %ecx
2901; X32-SSE41-NEXT:    shll $21, %ecx
2902; X32-SSE41-NEXT:    sarl $31, %ecx
2903; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
2904; X32-SSE41-NEXT:    movl %eax, %ecx
2905; X32-SSE41-NEXT:    shll $20, %ecx
2906; X32-SSE41-NEXT:    sarl $31, %ecx
2907; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
2908; X32-SSE41-NEXT:    movl %eax, %ecx
2909; X32-SSE41-NEXT:    shll $19, %ecx
2910; X32-SSE41-NEXT:    sarl $31, %ecx
2911; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
2912; X32-SSE41-NEXT:    movl %eax, %ecx
2913; X32-SSE41-NEXT:    shll $18, %ecx
2914; X32-SSE41-NEXT:    sarl $31, %ecx
2915; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
2916; X32-SSE41-NEXT:    movl %eax, %ecx
2917; X32-SSE41-NEXT:    shll $17, %ecx
2918; X32-SSE41-NEXT:    sarl $31, %ecx
2919; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
2920; X32-SSE41-NEXT:    shrl $15, %eax
2921; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm0
2922; X32-SSE41-NEXT:    retl
2923entry:
2924 %X = load <16 x i1>, <16 x i1>* %ptr
2925 %Y = sext <16 x i1> %X to <16 x i8>
2926 ret <16 x i8> %Y
2927}
2928
2929define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
2930; SSE2-LABEL: load_sext_16i1_to_16i16:
2931; SSE2:       # BB#0: # %entry
2932; SSE2-NEXT:    movzwl (%rdi), %eax
2933; SSE2-NEXT:    movl %eax, %ecx
2934; SSE2-NEXT:    shrl $14, %ecx
2935; SSE2-NEXT:    andl $1, %ecx
2936; SSE2-NEXT:    movd %ecx, %xmm0
2937; SSE2-NEXT:    movl %eax, %ecx
2938; SSE2-NEXT:    shrl $6, %ecx
2939; SSE2-NEXT:    andl $1, %ecx
2940; SSE2-NEXT:    movd %ecx, %xmm1
2941; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2942; SSE2-NEXT:    movl %eax, %ecx
2943; SSE2-NEXT:    shrl $10, %ecx
2944; SSE2-NEXT:    andl $1, %ecx
2945; SSE2-NEXT:    movd %ecx, %xmm0
2946; SSE2-NEXT:    movl %eax, %ecx
2947; SSE2-NEXT:    shrl $2, %ecx
2948; SSE2-NEXT:    andl $1, %ecx
2949; SSE2-NEXT:    movd %ecx, %xmm2
2950; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2951; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2952; SSE2-NEXT:    movl %eax, %ecx
2953; SSE2-NEXT:    shrl $12, %ecx
2954; SSE2-NEXT:    andl $1, %ecx
2955; SSE2-NEXT:    movd %ecx, %xmm0
2956; SSE2-NEXT:    movl %eax, %ecx
2957; SSE2-NEXT:    shrl $4, %ecx
2958; SSE2-NEXT:    andl $1, %ecx
2959; SSE2-NEXT:    movd %ecx, %xmm3
2960; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2961; SSE2-NEXT:    movl %eax, %ecx
2962; SSE2-NEXT:    andl $1, %ecx
2963; SSE2-NEXT:    movd %ecx, %xmm1
2964; SSE2-NEXT:    movl %eax, %ecx
2965; SSE2-NEXT:    shrl $8, %ecx
2966; SSE2-NEXT:    andl $1, %ecx
2967; SSE2-NEXT:    movd %ecx, %xmm0
2968; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2969; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2970; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2971; SSE2-NEXT:    movl %eax, %ecx
2972; SSE2-NEXT:    shrl $13, %ecx
2973; SSE2-NEXT:    andl $1, %ecx
2974; SSE2-NEXT:    movd %ecx, %xmm0
2975; SSE2-NEXT:    movl %eax, %ecx
2976; SSE2-NEXT:    shrl $5, %ecx
2977; SSE2-NEXT:    andl $1, %ecx
2978; SSE2-NEXT:    movd %ecx, %xmm2
2979; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2980; SSE2-NEXT:    movl %eax, %ecx
2981; SSE2-NEXT:    shrl $9, %ecx
2982; SSE2-NEXT:    andl $1, %ecx
2983; SSE2-NEXT:    movd %ecx, %xmm3
2984; SSE2-NEXT:    movl %eax, %ecx
2985; SSE2-NEXT:    shrl %ecx
2986; SSE2-NEXT:    andl $1, %ecx
2987; SSE2-NEXT:    movd %ecx, %xmm0
2988; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2989; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2990; SSE2-NEXT:    movl %eax, %ecx
2991; SSE2-NEXT:    shrl $11, %ecx
2992; SSE2-NEXT:    andl $1, %ecx
2993; SSE2-NEXT:    movd %ecx, %xmm2
2994; SSE2-NEXT:    movl %eax, %ecx
2995; SSE2-NEXT:    shrl $3, %ecx
2996; SSE2-NEXT:    andl $1, %ecx
2997; SSE2-NEXT:    movd %ecx, %xmm3
2998; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2999; SSE2-NEXT:    movl %eax, %ecx
3000; SSE2-NEXT:    shrl $7, %ecx
3001; SSE2-NEXT:    andl $1, %ecx
3002; SSE2-NEXT:    movd %ecx, %xmm2
3003; SSE2-NEXT:    shrl $15, %eax
3004; SSE2-NEXT:    movzwl %ax, %eax
3005; SSE2-NEXT:    movd %eax, %xmm4
3006; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
3007; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3008; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3009; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3010; SSE2-NEXT:    movdqa %xmm1, %xmm0
3011; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3012; SSE2-NEXT:    psllw $15, %xmm0
3013; SSE2-NEXT:    psraw $15, %xmm0
3014; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
3015; SSE2-NEXT:    psllw $15, %xmm1
3016; SSE2-NEXT:    psraw $15, %xmm1
3017; SSE2-NEXT:    retq
3018;
3019; SSSE3-LABEL: load_sext_16i1_to_16i16:
3020; SSSE3:       # BB#0: # %entry
3021; SSSE3-NEXT:    movzwl (%rdi), %eax
3022; SSSE3-NEXT:    movl %eax, %ecx
3023; SSSE3-NEXT:    shrl $14, %ecx
3024; SSSE3-NEXT:    andl $1, %ecx
3025; SSSE3-NEXT:    movd %ecx, %xmm0
3026; SSSE3-NEXT:    movl %eax, %ecx
3027; SSSE3-NEXT:    shrl $6, %ecx
3028; SSSE3-NEXT:    andl $1, %ecx
3029; SSSE3-NEXT:    movd %ecx, %xmm1
3030; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3031; SSSE3-NEXT:    movl %eax, %ecx
3032; SSSE3-NEXT:    shrl $10, %ecx
3033; SSSE3-NEXT:    andl $1, %ecx
3034; SSSE3-NEXT:    movd %ecx, %xmm0
3035; SSSE3-NEXT:    movl %eax, %ecx
3036; SSSE3-NEXT:    shrl $2, %ecx
3037; SSSE3-NEXT:    andl $1, %ecx
3038; SSSE3-NEXT:    movd %ecx, %xmm2
3039; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
3040; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3041; SSSE3-NEXT:    movl %eax, %ecx
3042; SSSE3-NEXT:    shrl $12, %ecx
3043; SSSE3-NEXT:    andl $1, %ecx
3044; SSSE3-NEXT:    movd %ecx, %xmm0
3045; SSSE3-NEXT:    movl %eax, %ecx
3046; SSSE3-NEXT:    shrl $4, %ecx
3047; SSSE3-NEXT:    andl $1, %ecx
3048; SSSE3-NEXT:    movd %ecx, %xmm3
3049; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
3050; SSSE3-NEXT:    movl %eax, %ecx
3051; SSSE3-NEXT:    andl $1, %ecx
3052; SSSE3-NEXT:    movd %ecx, %xmm1
3053; SSSE3-NEXT:    movl %eax, %ecx
3054; SSSE3-NEXT:    shrl $8, %ecx
3055; SSSE3-NEXT:    andl $1, %ecx
3056; SSSE3-NEXT:    movd %ecx, %xmm0
3057; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3058; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3059; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3060; SSSE3-NEXT:    movl %eax, %ecx
3061; SSSE3-NEXT:    shrl $13, %ecx
3062; SSSE3-NEXT:    andl $1, %ecx
3063; SSSE3-NEXT:    movd %ecx, %xmm0
3064; SSSE3-NEXT:    movl %eax, %ecx
3065; SSSE3-NEXT:    shrl $5, %ecx
3066; SSSE3-NEXT:    andl $1, %ecx
3067; SSSE3-NEXT:    movd %ecx, %xmm2
3068; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
3069; SSSE3-NEXT:    movl %eax, %ecx
3070; SSSE3-NEXT:    shrl $9, %ecx
3071; SSSE3-NEXT:    andl $1, %ecx
3072; SSSE3-NEXT:    movd %ecx, %xmm3
3073; SSSE3-NEXT:    movl %eax, %ecx
3074; SSSE3-NEXT:    shrl %ecx
3075; SSSE3-NEXT:    andl $1, %ecx
3076; SSSE3-NEXT:    movd %ecx, %xmm0
3077; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3078; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3079; SSSE3-NEXT:    movl %eax, %ecx
3080; SSSE3-NEXT:    shrl $11, %ecx
3081; SSSE3-NEXT:    andl $1, %ecx
3082; SSSE3-NEXT:    movd %ecx, %xmm2
3083; SSSE3-NEXT:    movl %eax, %ecx
3084; SSSE3-NEXT:    shrl $3, %ecx
3085; SSSE3-NEXT:    andl $1, %ecx
3086; SSSE3-NEXT:    movd %ecx, %xmm3
3087; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3088; SSSE3-NEXT:    movl %eax, %ecx
3089; SSSE3-NEXT:    shrl $7, %ecx
3090; SSSE3-NEXT:    andl $1, %ecx
3091; SSSE3-NEXT:    movd %ecx, %xmm2
3092; SSSE3-NEXT:    shrl $15, %eax
3093; SSSE3-NEXT:    movzwl %ax, %eax
3094; SSSE3-NEXT:    movd %eax, %xmm4
3095; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
3096; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3097; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3098; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3099; SSSE3-NEXT:    movdqa %xmm1, %xmm0
3100; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3101; SSSE3-NEXT:    psllw $15, %xmm0
3102; SSSE3-NEXT:    psraw $15, %xmm0
3103; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
3104; SSSE3-NEXT:    psllw $15, %xmm1
3105; SSSE3-NEXT:    psraw $15, %xmm1
3106; SSSE3-NEXT:    retq
3107;
3108; SSE41-LABEL: load_sext_16i1_to_16i16:
3109; SSE41:       # BB#0: # %entry
3110; SSE41-NEXT:    movzwl (%rdi), %eax
3111; SSE41-NEXT:    movl %eax, %ecx
3112; SSE41-NEXT:    shrl %ecx
3113; SSE41-NEXT:    andl $1, %ecx
3114; SSE41-NEXT:    movl %eax, %edx
3115; SSE41-NEXT:    andl $1, %edx
3116; SSE41-NEXT:    movd %edx, %xmm1
3117; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3118; SSE41-NEXT:    movl %eax, %ecx
3119; SSE41-NEXT:    shrl $2, %ecx
3120; SSE41-NEXT:    andl $1, %ecx
3121; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3122; SSE41-NEXT:    movl %eax, %ecx
3123; SSE41-NEXT:    shrl $3, %ecx
3124; SSE41-NEXT:    andl $1, %ecx
3125; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3126; SSE41-NEXT:    movl %eax, %ecx
3127; SSE41-NEXT:    shrl $4, %ecx
3128; SSE41-NEXT:    andl $1, %ecx
3129; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3130; SSE41-NEXT:    movl %eax, %ecx
3131; SSE41-NEXT:    shrl $5, %ecx
3132; SSE41-NEXT:    andl $1, %ecx
3133; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3134; SSE41-NEXT:    movl %eax, %ecx
3135; SSE41-NEXT:    shrl $6, %ecx
3136; SSE41-NEXT:    andl $1, %ecx
3137; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3138; SSE41-NEXT:    movl %eax, %ecx
3139; SSE41-NEXT:    shrl $7, %ecx
3140; SSE41-NEXT:    andl $1, %ecx
3141; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3142; SSE41-NEXT:    movl %eax, %ecx
3143; SSE41-NEXT:    shrl $8, %ecx
3144; SSE41-NEXT:    andl $1, %ecx
3145; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3146; SSE41-NEXT:    movl %eax, %ecx
3147; SSE41-NEXT:    shrl $9, %ecx
3148; SSE41-NEXT:    andl $1, %ecx
3149; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3150; SSE41-NEXT:    movl %eax, %ecx
3151; SSE41-NEXT:    shrl $10, %ecx
3152; SSE41-NEXT:    andl $1, %ecx
3153; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3154; SSE41-NEXT:    movl %eax, %ecx
3155; SSE41-NEXT:    shrl $11, %ecx
3156; SSE41-NEXT:    andl $1, %ecx
3157; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3158; SSE41-NEXT:    movl %eax, %ecx
3159; SSE41-NEXT:    shrl $12, %ecx
3160; SSE41-NEXT:    andl $1, %ecx
3161; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3162; SSE41-NEXT:    movl %eax, %ecx
3163; SSE41-NEXT:    shrl $13, %ecx
3164; SSE41-NEXT:    andl $1, %ecx
3165; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3166; SSE41-NEXT:    movl %eax, %ecx
3167; SSE41-NEXT:    shrl $14, %ecx
3168; SSE41-NEXT:    andl $1, %ecx
3169; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3170; SSE41-NEXT:    shrl $15, %eax
3171; SSE41-NEXT:    movzwl %ax, %eax
3172; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3173; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
3174; SSE41-NEXT:    psllw $15, %xmm0
3175; SSE41-NEXT:    psraw $15, %xmm0
3176; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3177; SSE41-NEXT:    psllw $15, %xmm1
3178; SSE41-NEXT:    psraw $15, %xmm1
3179; SSE41-NEXT:    retq
3180;
3181; AVX1-LABEL: load_sext_16i1_to_16i16:
3182; AVX1:       # BB#0: # %entry
3183; AVX1-NEXT:    pushq %rbp
3184; AVX1-NEXT:  .Lcfi0:
3185; AVX1-NEXT:    .cfi_def_cfa_offset 16
3186; AVX1-NEXT:    pushq %r15
3187; AVX1-NEXT:  .Lcfi1:
3188; AVX1-NEXT:    .cfi_def_cfa_offset 24
3189; AVX1-NEXT:    pushq %r14
3190; AVX1-NEXT:  .Lcfi2:
3191; AVX1-NEXT:    .cfi_def_cfa_offset 32
3192; AVX1-NEXT:    pushq %r13
3193; AVX1-NEXT:  .Lcfi3:
3194; AVX1-NEXT:    .cfi_def_cfa_offset 40
3195; AVX1-NEXT:    pushq %r12
3196; AVX1-NEXT:  .Lcfi4:
3197; AVX1-NEXT:    .cfi_def_cfa_offset 48
3198; AVX1-NEXT:    pushq %rbx
3199; AVX1-NEXT:  .Lcfi5:
3200; AVX1-NEXT:    .cfi_def_cfa_offset 56
3201; AVX1-NEXT:  .Lcfi6:
3202; AVX1-NEXT:    .cfi_offset %rbx, -56
3203; AVX1-NEXT:  .Lcfi7:
3204; AVX1-NEXT:    .cfi_offset %r12, -48
3205; AVX1-NEXT:  .Lcfi8:
3206; AVX1-NEXT:    .cfi_offset %r13, -40
3207; AVX1-NEXT:  .Lcfi9:
3208; AVX1-NEXT:    .cfi_offset %r14, -32
3209; AVX1-NEXT:  .Lcfi10:
3210; AVX1-NEXT:    .cfi_offset %r15, -24
3211; AVX1-NEXT:  .Lcfi11:
3212; AVX1-NEXT:    .cfi_offset %rbp, -16
3213; AVX1-NEXT:    movswq (%rdi), %rax
3214; AVX1-NEXT:    movq %rax, %rcx
3215; AVX1-NEXT:    shlq $55, %rcx
3216; AVX1-NEXT:    sarq $63, %rcx
3217; AVX1-NEXT:    vmovd %ecx, %xmm0
3218; AVX1-NEXT:    movq %rax, %r8
3219; AVX1-NEXT:    movq %rax, %r10
3220; AVX1-NEXT:    movq %rax, %r11
3221; AVX1-NEXT:    movq %rax, %r14
3222; AVX1-NEXT:    movq %rax, %r15
3223; AVX1-NEXT:    movq %rax, %r9
3224; AVX1-NEXT:    movq %rax, %r12
3225; AVX1-NEXT:    movq %rax, %r13
3226; AVX1-NEXT:    movq %rax, %rbx
3227; AVX1-NEXT:    movq %rax, %rdi
3228; AVX1-NEXT:    movq %rax, %rcx
3229; AVX1-NEXT:    movq %rax, %rdx
3230; AVX1-NEXT:    movq %rax, %rsi
3231; AVX1-NEXT:    movsbq %al, %rbp
3232; AVX1-NEXT:    shlq $54, %rax
3233; AVX1-NEXT:    sarq $63, %rax
3234; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
3235; AVX1-NEXT:    shlq $53, %r8
3236; AVX1-NEXT:    sarq $63, %r8
3237; AVX1-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
3238; AVX1-NEXT:    shlq $52, %r10
3239; AVX1-NEXT:    sarq $63, %r10
3240; AVX1-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
3241; AVX1-NEXT:    shlq $51, %r11
3242; AVX1-NEXT:    sarq $63, %r11
3243; AVX1-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
3244; AVX1-NEXT:    shlq $50, %r14
3245; AVX1-NEXT:    sarq $63, %r14
3246; AVX1-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
3247; AVX1-NEXT:    shlq $49, %r15
3248; AVX1-NEXT:    sarq $63, %r15
3249; AVX1-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
3250; AVX1-NEXT:    shrq $15, %r9
3251; AVX1-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
3252; AVX1-NEXT:    shlq $63, %r13
3253; AVX1-NEXT:    sarq $63, %r13
3254; AVX1-NEXT:    vmovd %r13d, %xmm1
3255; AVX1-NEXT:    shlq $62, %r12
3256; AVX1-NEXT:    sarq $63, %r12
3257; AVX1-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
3258; AVX1-NEXT:    shlq $61, %rbx
3259; AVX1-NEXT:    sarq $63, %rbx
3260; AVX1-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
3261; AVX1-NEXT:    shlq $60, %rdi
3262; AVX1-NEXT:    sarq $63, %rdi
3263; AVX1-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
3264; AVX1-NEXT:    shlq $59, %rcx
3265; AVX1-NEXT:    sarq $63, %rcx
3266; AVX1-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
3267; AVX1-NEXT:    shlq $58, %rdx
3268; AVX1-NEXT:    sarq $63, %rdx
3269; AVX1-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
3270; AVX1-NEXT:    shlq $57, %rsi
3271; AVX1-NEXT:    sarq $63, %rsi
3272; AVX1-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
3273; AVX1-NEXT:    shrq $7, %rbp
3274; AVX1-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
3275; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
3276; AVX1-NEXT:    popq %rbx
3277; AVX1-NEXT:    popq %r12
3278; AVX1-NEXT:    popq %r13
3279; AVX1-NEXT:    popq %r14
3280; AVX1-NEXT:    popq %r15
3281; AVX1-NEXT:    popq %rbp
3282; AVX1-NEXT:    retq
3283;
3284; AVX2-LABEL: load_sext_16i1_to_16i16:
3285; AVX2:       # BB#0: # %entry
3286; AVX2-NEXT:    pushq %rbp
3287; AVX2-NEXT:  .Lcfi0:
3288; AVX2-NEXT:    .cfi_def_cfa_offset 16
3289; AVX2-NEXT:    pushq %r15
3290; AVX2-NEXT:  .Lcfi1:
3291; AVX2-NEXT:    .cfi_def_cfa_offset 24
3292; AVX2-NEXT:    pushq %r14
3293; AVX2-NEXT:  .Lcfi2:
3294; AVX2-NEXT:    .cfi_def_cfa_offset 32
3295; AVX2-NEXT:    pushq %r13
3296; AVX2-NEXT:  .Lcfi3:
3297; AVX2-NEXT:    .cfi_def_cfa_offset 40
3298; AVX2-NEXT:    pushq %r12
3299; AVX2-NEXT:  .Lcfi4:
3300; AVX2-NEXT:    .cfi_def_cfa_offset 48
3301; AVX2-NEXT:    pushq %rbx
3302; AVX2-NEXT:  .Lcfi5:
3303; AVX2-NEXT:    .cfi_def_cfa_offset 56
3304; AVX2-NEXT:  .Lcfi6:
3305; AVX2-NEXT:    .cfi_offset %rbx, -56
3306; AVX2-NEXT:  .Lcfi7:
3307; AVX2-NEXT:    .cfi_offset %r12, -48
3308; AVX2-NEXT:  .Lcfi8:
3309; AVX2-NEXT:    .cfi_offset %r13, -40
3310; AVX2-NEXT:  .Lcfi9:
3311; AVX2-NEXT:    .cfi_offset %r14, -32
3312; AVX2-NEXT:  .Lcfi10:
3313; AVX2-NEXT:    .cfi_offset %r15, -24
3314; AVX2-NEXT:  .Lcfi11:
3315; AVX2-NEXT:    .cfi_offset %rbp, -16
3316; AVX2-NEXT:    movswq (%rdi), %rax
3317; AVX2-NEXT:    movq %rax, %rcx
3318; AVX2-NEXT:    shlq $55, %rcx
3319; AVX2-NEXT:    sarq $63, %rcx
3320; AVX2-NEXT:    vmovd %ecx, %xmm0
3321; AVX2-NEXT:    movq %rax, %r8
3322; AVX2-NEXT:    movq %rax, %r10
3323; AVX2-NEXT:    movq %rax, %r11
3324; AVX2-NEXT:    movq %rax, %r14
3325; AVX2-NEXT:    movq %rax, %r15
3326; AVX2-NEXT:    movq %rax, %r9
3327; AVX2-NEXT:    movq %rax, %r12
3328; AVX2-NEXT:    movq %rax, %r13
3329; AVX2-NEXT:    movq %rax, %rbx
3330; AVX2-NEXT:    movq %rax, %rdi
3331; AVX2-NEXT:    movq %rax, %rcx
3332; AVX2-NEXT:    movq %rax, %rdx
3333; AVX2-NEXT:    movq %rax, %rsi
3334; AVX2-NEXT:    movsbq %al, %rbp
3335; AVX2-NEXT:    shlq $54, %rax
3336; AVX2-NEXT:    sarq $63, %rax
3337; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
3338; AVX2-NEXT:    shlq $53, %r8
3339; AVX2-NEXT:    sarq $63, %r8
3340; AVX2-NEXT:    vpinsrw $2, %r8d, %xmm0, %xmm0
3341; AVX2-NEXT:    shlq $52, %r10
3342; AVX2-NEXT:    sarq $63, %r10
3343; AVX2-NEXT:    vpinsrw $3, %r10d, %xmm0, %xmm0
3344; AVX2-NEXT:    shlq $51, %r11
3345; AVX2-NEXT:    sarq $63, %r11
3346; AVX2-NEXT:    vpinsrw $4, %r11d, %xmm0, %xmm0
3347; AVX2-NEXT:    shlq $50, %r14
3348; AVX2-NEXT:    sarq $63, %r14
3349; AVX2-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
3350; AVX2-NEXT:    shlq $49, %r15
3351; AVX2-NEXT:    sarq $63, %r15
3352; AVX2-NEXT:    vpinsrw $6, %r15d, %xmm0, %xmm0
3353; AVX2-NEXT:    shrq $15, %r9
3354; AVX2-NEXT:    vpinsrw $7, %r9d, %xmm0, %xmm0
3355; AVX2-NEXT:    shlq $63, %r13
3356; AVX2-NEXT:    sarq $63, %r13
3357; AVX2-NEXT:    vmovd %r13d, %xmm1
3358; AVX2-NEXT:    shlq $62, %r12
3359; AVX2-NEXT:    sarq $63, %r12
3360; AVX2-NEXT:    vpinsrw $1, %r12d, %xmm1, %xmm1
3361; AVX2-NEXT:    shlq $61, %rbx
3362; AVX2-NEXT:    sarq $63, %rbx
3363; AVX2-NEXT:    vpinsrw $2, %ebx, %xmm1, %xmm1
3364; AVX2-NEXT:    shlq $60, %rdi
3365; AVX2-NEXT:    sarq $63, %rdi
3366; AVX2-NEXT:    vpinsrw $3, %edi, %xmm1, %xmm1
3367; AVX2-NEXT:    shlq $59, %rcx
3368; AVX2-NEXT:    sarq $63, %rcx
3369; AVX2-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
3370; AVX2-NEXT:    shlq $58, %rdx
3371; AVX2-NEXT:    sarq $63, %rdx
3372; AVX2-NEXT:    vpinsrw $5, %edx, %xmm1, %xmm1
3373; AVX2-NEXT:    shlq $57, %rsi
3374; AVX2-NEXT:    sarq $63, %rsi
3375; AVX2-NEXT:    vpinsrw $6, %esi, %xmm1, %xmm1
3376; AVX2-NEXT:    shrq $7, %rbp
3377; AVX2-NEXT:    vpinsrw $7, %ebp, %xmm1, %xmm1
3378; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
3379; AVX2-NEXT:    popq %rbx
3380; AVX2-NEXT:    popq %r12
3381; AVX2-NEXT:    popq %r13
3382; AVX2-NEXT:    popq %r14
3383; AVX2-NEXT:    popq %r15
3384; AVX2-NEXT:    popq %rbp
3385; AVX2-NEXT:    retq
3386;
3387; AVX512-LABEL: load_sext_16i1_to_16i16:
3388; AVX512:       # BB#0: # %entry
3389; AVX512-NEXT:    kmovw (%rdi), %k1
3390; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
3391; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3392; AVX512-NEXT:    retq
3393;
3394; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
3395; X32-SSE41:       # BB#0: # %entry
3396; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
3397; X32-SSE41-NEXT:    movzwl (%eax), %eax
3398; X32-SSE41-NEXT:    movl %eax, %ecx
3399; X32-SSE41-NEXT:    shrl %ecx
3400; X32-SSE41-NEXT:    andl $1, %ecx
3401; X32-SSE41-NEXT:    movl %eax, %edx
3402; X32-SSE41-NEXT:    andl $1, %edx
3403; X32-SSE41-NEXT:    movd %edx, %xmm1
3404; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3405; X32-SSE41-NEXT:    movl %eax, %ecx
3406; X32-SSE41-NEXT:    shrl $2, %ecx
3407; X32-SSE41-NEXT:    andl $1, %ecx
3408; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3409; X32-SSE41-NEXT:    movl %eax, %ecx
3410; X32-SSE41-NEXT:    shrl $3, %ecx
3411; X32-SSE41-NEXT:    andl $1, %ecx
3412; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3413; X32-SSE41-NEXT:    movl %eax, %ecx
3414; X32-SSE41-NEXT:    shrl $4, %ecx
3415; X32-SSE41-NEXT:    andl $1, %ecx
3416; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3417; X32-SSE41-NEXT:    movl %eax, %ecx
3418; X32-SSE41-NEXT:    shrl $5, %ecx
3419; X32-SSE41-NEXT:    andl $1, %ecx
3420; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3421; X32-SSE41-NEXT:    movl %eax, %ecx
3422; X32-SSE41-NEXT:    shrl $6, %ecx
3423; X32-SSE41-NEXT:    andl $1, %ecx
3424; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3425; X32-SSE41-NEXT:    movl %eax, %ecx
3426; X32-SSE41-NEXT:    shrl $7, %ecx
3427; X32-SSE41-NEXT:    andl $1, %ecx
3428; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3429; X32-SSE41-NEXT:    movl %eax, %ecx
3430; X32-SSE41-NEXT:    shrl $8, %ecx
3431; X32-SSE41-NEXT:    andl $1, %ecx
3432; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3433; X32-SSE41-NEXT:    movl %eax, %ecx
3434; X32-SSE41-NEXT:    shrl $9, %ecx
3435; X32-SSE41-NEXT:    andl $1, %ecx
3436; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3437; X32-SSE41-NEXT:    movl %eax, %ecx
3438; X32-SSE41-NEXT:    shrl $10, %ecx
3439; X32-SSE41-NEXT:    andl $1, %ecx
3440; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3441; X32-SSE41-NEXT:    movl %eax, %ecx
3442; X32-SSE41-NEXT:    shrl $11, %ecx
3443; X32-SSE41-NEXT:    andl $1, %ecx
3444; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3445; X32-SSE41-NEXT:    movl %eax, %ecx
3446; X32-SSE41-NEXT:    shrl $12, %ecx
3447; X32-SSE41-NEXT:    andl $1, %ecx
3448; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3449; X32-SSE41-NEXT:    movl %eax, %ecx
3450; X32-SSE41-NEXT:    shrl $13, %ecx
3451; X32-SSE41-NEXT:    andl $1, %ecx
3452; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3453; X32-SSE41-NEXT:    movl %eax, %ecx
3454; X32-SSE41-NEXT:    shrl $14, %ecx
3455; X32-SSE41-NEXT:    andl $1, %ecx
3456; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3457; X32-SSE41-NEXT:    shrl $15, %eax
3458; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3459; X32-SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
3460; X32-SSE41-NEXT:    psllw $15, %xmm0
3461; X32-SSE41-NEXT:    psraw $15, %xmm0
3462; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3463; X32-SSE41-NEXT:    psllw $15, %xmm1
3464; X32-SSE41-NEXT:    psraw $15, %xmm1
3465; X32-SSE41-NEXT:    retl
3466entry:
3467 %X = load <16 x i1>, <16 x i1>* %ptr
3468 %Y = sext <16 x i1> %X to <16 x i16>
3469 ret <16 x i16> %Y
3470}
3471
3472define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
3473; SSE2-LABEL: load_sext_32i1_to_32i8:
3474; SSE2:       # BB#0: # %entry
3475; SSE2-NEXT:    pushq %rbp
3476; SSE2-NEXT:    pushq %r15
3477; SSE2-NEXT:    pushq %r14
3478; SSE2-NEXT:    pushq %r13
3479; SSE2-NEXT:    pushq %r12
3480; SSE2-NEXT:    pushq %rbx
3481; SSE2-NEXT:    movswq (%rdi), %rbx
3482; SSE2-NEXT:    movq %rbx, %r10
3483; SSE2-NEXT:    movq %rbx, %r8
3484; SSE2-NEXT:    movq %rbx, %r9
3485; SSE2-NEXT:    movq %rbx, %r11
3486; SSE2-NEXT:    movq %rbx, %r14
3487; SSE2-NEXT:    movq %rbx, %r15
3488; SSE2-NEXT:    movq %rbx, %r12
3489; SSE2-NEXT:    movq %rbx, %r13
3490; SSE2-NEXT:    movq %rbx, %rdx
3491; SSE2-NEXT:    movq %rbx, %rsi
3492; SSE2-NEXT:    movq %rbx, %rcx
3493; SSE2-NEXT:    movq %rbx, %rbp
3494; SSE2-NEXT:    movq %rbx, %rax
3495; SSE2-NEXT:    shlq $49, %rax
3496; SSE2-NEXT:    sarq $63, %rax
3497; SSE2-NEXT:    movd %eax, %xmm0
3498; SSE2-NEXT:    movq %rbx, %rax
3499; SSE2-NEXT:    shlq $57, %r10
3500; SSE2-NEXT:    sarq $63, %r10
3501; SSE2-NEXT:    movd %r10d, %xmm15
3502; SSE2-NEXT:    movq %rbx, %r10
3503; SSE2-NEXT:    movsbq %bl, %rbx
3504; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
3505; SSE2-NEXT:    shlq $53, %r8
3506; SSE2-NEXT:    sarq $63, %r8
3507; SSE2-NEXT:    movd %r8d, %xmm8
3508; SSE2-NEXT:    shlq $61, %r9
3509; SSE2-NEXT:    sarq $63, %r9
3510; SSE2-NEXT:    movd %r9d, %xmm2
3511; SSE2-NEXT:    shlq $51, %r11
3512; SSE2-NEXT:    sarq $63, %r11
3513; SSE2-NEXT:    movd %r11d, %xmm9
3514; SSE2-NEXT:    shlq $59, %r14
3515; SSE2-NEXT:    sarq $63, %r14
3516; SSE2-NEXT:    movd %r14d, %xmm5
3517; SSE2-NEXT:    shlq $55, %r15
3518; SSE2-NEXT:    sarq $63, %r15
3519; SSE2-NEXT:    movd %r15d, %xmm10
3520; SSE2-NEXT:    shlq $63, %r12
3521; SSE2-NEXT:    sarq $63, %r12
3522; SSE2-NEXT:    movd %r12d, %xmm0
3523; SSE2-NEXT:    shlq $50, %r13
3524; SSE2-NEXT:    sarq $63, %r13
3525; SSE2-NEXT:    movd %r13d, %xmm11
3526; SSE2-NEXT:    shlq $58, %rdx
3527; SSE2-NEXT:    sarq $63, %rdx
3528; SSE2-NEXT:    movd %edx, %xmm4
3529; SSE2-NEXT:    shlq $54, %rsi
3530; SSE2-NEXT:    sarq $63, %rsi
3531; SSE2-NEXT:    movd %esi, %xmm12
3532; SSE2-NEXT:    shlq $62, %rcx
3533; SSE2-NEXT:    sarq $63, %rcx
3534; SSE2-NEXT:    movd %ecx, %xmm6
3535; SSE2-NEXT:    shlq $52, %rbp
3536; SSE2-NEXT:    sarq $63, %rbp
3537; SSE2-NEXT:    movd %ebp, %xmm13
3538; SSE2-NEXT:    shlq $60, %rax
3539; SSE2-NEXT:    sarq $63, %rax
3540; SSE2-NEXT:    movd %eax, %xmm7
3541; SSE2-NEXT:    shrq $15, %r10
3542; SSE2-NEXT:    movd %r10d, %xmm14
3543; SSE2-NEXT:    shrq $7, %rbx
3544; SSE2-NEXT:    movd %ebx, %xmm3
3545; SSE2-NEXT:    movswq 2(%rdi), %rdx
3546; SSE2-NEXT:    movq %rdx, %r8
3547; SSE2-NEXT:    movq %rdx, %r9
3548; SSE2-NEXT:    movq %rdx, %r10
3549; SSE2-NEXT:    movq %rdx, %r11
3550; SSE2-NEXT:    movq %rdx, %r14
3551; SSE2-NEXT:    movq %rdx, %r15
3552; SSE2-NEXT:    movq %rdx, %r12
3553; SSE2-NEXT:    movq %rdx, %r13
3554; SSE2-NEXT:    movq %rdx, %rbx
3555; SSE2-NEXT:    movq %rdx, %rax
3556; SSE2-NEXT:    movq %rdx, %rcx
3557; SSE2-NEXT:    movq %rdx, %rsi
3558; SSE2-NEXT:    movq %rdx, %rdi
3559; SSE2-NEXT:    movq %rdx, %rbp
3560; SSE2-NEXT:    shlq $49, %rbp
3561; SSE2-NEXT:    sarq $63, %rbp
3562; SSE2-NEXT:    movd %ebp, %xmm1
3563; SSE2-NEXT:    movq %rdx, %rbp
3564; SSE2-NEXT:    movsbq %dl, %rdx
3565; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
3566; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
3567; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
3568; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
3569; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
3570; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3571; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
3572; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
3573; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
3574; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
3575; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
3576; SSE2-NEXT:    shlq $57, %r8
3577; SSE2-NEXT:    sarq $63, %r8
3578; SSE2-NEXT:    movd %r8d, %xmm2
3579; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
3580; SSE2-NEXT:    shlq $53, %r9
3581; SSE2-NEXT:    sarq $63, %r9
3582; SSE2-NEXT:    movd %r9d, %xmm3
3583; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3584; SSE2-NEXT:    shlq $61, %r10
3585; SSE2-NEXT:    sarq $63, %r10
3586; SSE2-NEXT:    movd %r10d, %xmm4
3587; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3588; SSE2-NEXT:    shlq $51, %r11
3589; SSE2-NEXT:    sarq $63, %r11
3590; SSE2-NEXT:    movd %r11d, %xmm5
3591; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3592; SSE2-NEXT:    shlq $59, %r14
3593; SSE2-NEXT:    sarq $63, %r14
3594; SSE2-NEXT:    movd %r14d, %xmm6
3595; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3596; SSE2-NEXT:    shlq $55, %r15
3597; SSE2-NEXT:    sarq $63, %r15
3598; SSE2-NEXT:    movd %r15d, %xmm3
3599; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3600; SSE2-NEXT:    shlq $63, %r12
3601; SSE2-NEXT:    sarq $63, %r12
3602; SSE2-NEXT:    movd %r12d, %xmm1
3603; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3604; SSE2-NEXT:    shlq $50, %r13
3605; SSE2-NEXT:    sarq $63, %r13
3606; SSE2-NEXT:    movd %r13d, %xmm2
3607; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3608; SSE2-NEXT:    shlq $58, %rbx
3609; SSE2-NEXT:    sarq $63, %rbx
3610; SSE2-NEXT:    movd %ebx, %xmm3
3611; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
3612; SSE2-NEXT:    shlq $54, %rax
3613; SSE2-NEXT:    sarq $63, %rax
3614; SSE2-NEXT:    movd %eax, %xmm5
3615; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3616; SSE2-NEXT:    shlq $62, %rcx
3617; SSE2-NEXT:    sarq $63, %rcx
3618; SSE2-NEXT:    movd %ecx, %xmm4
3619; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3620; SSE2-NEXT:    shlq $52, %rsi
3621; SSE2-NEXT:    sarq $63, %rsi
3622; SSE2-NEXT:    movd %esi, %xmm2
3623; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3624; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3625; SSE2-NEXT:    shlq $60, %rdi
3626; SSE2-NEXT:    sarq $63, %rdi
3627; SSE2-NEXT:    movd %edi, %xmm3
3628; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3629; SSE2-NEXT:    shrq $15, %rbp
3630; SSE2-NEXT:    movd %ebp, %xmm2
3631; SSE2-NEXT:    shrq $7, %rdx
3632; SSE2-NEXT:    movd %edx, %xmm5
3633; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
3634; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
3635; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3636; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3637; SSE2-NEXT:    popq %rbx
3638; SSE2-NEXT:    popq %r12
3639; SSE2-NEXT:    popq %r13
3640; SSE2-NEXT:    popq %r14
3641; SSE2-NEXT:    popq %r15
3642; SSE2-NEXT:    popq %rbp
3643; SSE2-NEXT:    retq
3644;
3645; SSSE3-LABEL: load_sext_32i1_to_32i8:
3646; SSSE3:       # BB#0: # %entry
3647; SSSE3-NEXT:    pushq %rbp
3648; SSSE3-NEXT:    pushq %r15
3649; SSSE3-NEXT:    pushq %r14
3650; SSSE3-NEXT:    pushq %r13
3651; SSSE3-NEXT:    pushq %r12
3652; SSSE3-NEXT:    pushq %rbx
3653; SSSE3-NEXT:    movswq (%rdi), %rbx
3654; SSSE3-NEXT:    movq %rbx, %r10
3655; SSSE3-NEXT:    movq %rbx, %r8
3656; SSSE3-NEXT:    movq %rbx, %r9
3657; SSSE3-NEXT:    movq %rbx, %r11
3658; SSSE3-NEXT:    movq %rbx, %r14
3659; SSSE3-NEXT:    movq %rbx, %r15
3660; SSSE3-NEXT:    movq %rbx, %r12
3661; SSSE3-NEXT:    movq %rbx, %r13
3662; SSSE3-NEXT:    movq %rbx, %rdx
3663; SSSE3-NEXT:    movq %rbx, %rsi
3664; SSSE3-NEXT:    movq %rbx, %rcx
3665; SSSE3-NEXT:    movq %rbx, %rbp
3666; SSSE3-NEXT:    movq %rbx, %rax
3667; SSSE3-NEXT:    shlq $49, %rax
3668; SSSE3-NEXT:    sarq $63, %rax
3669; SSSE3-NEXT:    movd %eax, %xmm0
3670; SSSE3-NEXT:    movq %rbx, %rax
3671; SSSE3-NEXT:    shlq $57, %r10
3672; SSSE3-NEXT:    sarq $63, %r10
3673; SSSE3-NEXT:    movd %r10d, %xmm15
3674; SSSE3-NEXT:    movq %rbx, %r10
3675; SSSE3-NEXT:    movsbq %bl, %rbx
3676; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
3677; SSSE3-NEXT:    shlq $53, %r8
3678; SSSE3-NEXT:    sarq $63, %r8
3679; SSSE3-NEXT:    movd %r8d, %xmm8
3680; SSSE3-NEXT:    shlq $61, %r9
3681; SSSE3-NEXT:    sarq $63, %r9
3682; SSSE3-NEXT:    movd %r9d, %xmm2
3683; SSSE3-NEXT:    shlq $51, %r11
3684; SSSE3-NEXT:    sarq $63, %r11
3685; SSSE3-NEXT:    movd %r11d, %xmm9
3686; SSSE3-NEXT:    shlq $59, %r14
3687; SSSE3-NEXT:    sarq $63, %r14
3688; SSSE3-NEXT:    movd %r14d, %xmm5
3689; SSSE3-NEXT:    shlq $55, %r15
3690; SSSE3-NEXT:    sarq $63, %r15
3691; SSSE3-NEXT:    movd %r15d, %xmm10
3692; SSSE3-NEXT:    shlq $63, %r12
3693; SSSE3-NEXT:    sarq $63, %r12
3694; SSSE3-NEXT:    movd %r12d, %xmm0
3695; SSSE3-NEXT:    shlq $50, %r13
3696; SSSE3-NEXT:    sarq $63, %r13
3697; SSSE3-NEXT:    movd %r13d, %xmm11
3698; SSSE3-NEXT:    shlq $58, %rdx
3699; SSSE3-NEXT:    sarq $63, %rdx
3700; SSSE3-NEXT:    movd %edx, %xmm4
3701; SSSE3-NEXT:    shlq $54, %rsi
3702; SSSE3-NEXT:    sarq $63, %rsi
3703; SSSE3-NEXT:    movd %esi, %xmm12
3704; SSSE3-NEXT:    shlq $62, %rcx
3705; SSSE3-NEXT:    sarq $63, %rcx
3706; SSSE3-NEXT:    movd %ecx, %xmm6
3707; SSSE3-NEXT:    shlq $52, %rbp
3708; SSSE3-NEXT:    sarq $63, %rbp
3709; SSSE3-NEXT:    movd %ebp, %xmm13
3710; SSSE3-NEXT:    shlq $60, %rax
3711; SSSE3-NEXT:    sarq $63, %rax
3712; SSSE3-NEXT:    movd %eax, %xmm7
3713; SSSE3-NEXT:    shrq $15, %r10
3714; SSSE3-NEXT:    movd %r10d, %xmm14
3715; SSSE3-NEXT:    shrq $7, %rbx
3716; SSSE3-NEXT:    movd %ebx, %xmm3
3717; SSSE3-NEXT:    movswq 2(%rdi), %rdx
3718; SSSE3-NEXT:    movq %rdx, %r8
3719; SSSE3-NEXT:    movq %rdx, %r9
3720; SSSE3-NEXT:    movq %rdx, %r10
3721; SSSE3-NEXT:    movq %rdx, %r11
3722; SSSE3-NEXT:    movq %rdx, %r14
3723; SSSE3-NEXT:    movq %rdx, %r15
3724; SSSE3-NEXT:    movq %rdx, %r12
3725; SSSE3-NEXT:    movq %rdx, %r13
3726; SSSE3-NEXT:    movq %rdx, %rbx
3727; SSSE3-NEXT:    movq %rdx, %rax
3728; SSSE3-NEXT:    movq %rdx, %rcx
3729; SSSE3-NEXT:    movq %rdx, %rsi
3730; SSSE3-NEXT:    movq %rdx, %rdi
3731; SSSE3-NEXT:    movq %rdx, %rbp
3732; SSSE3-NEXT:    shlq $49, %rbp
3733; SSSE3-NEXT:    sarq $63, %rbp
3734; SSSE3-NEXT:    movd %ebp, %xmm1
3735; SSSE3-NEXT:    movq %rdx, %rbp
3736; SSSE3-NEXT:    movsbq %dl, %rdx
3737; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
3738; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
3739; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
3740; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
3741; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
3742; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3743; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
3744; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
3745; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
3746; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
3747; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
3748; SSSE3-NEXT:    shlq $57, %r8
3749; SSSE3-NEXT:    sarq $63, %r8
3750; SSSE3-NEXT:    movd %r8d, %xmm2
3751; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
3752; SSSE3-NEXT:    shlq $53, %r9
3753; SSSE3-NEXT:    sarq $63, %r9
3754; SSSE3-NEXT:    movd %r9d, %xmm3
3755; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3756; SSSE3-NEXT:    shlq $61, %r10
3757; SSSE3-NEXT:    sarq $63, %r10
3758; SSSE3-NEXT:    movd %r10d, %xmm4
3759; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3760; SSSE3-NEXT:    shlq $51, %r11
3761; SSSE3-NEXT:    sarq $63, %r11
3762; SSSE3-NEXT:    movd %r11d, %xmm5
3763; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3764; SSSE3-NEXT:    shlq $59, %r14
3765; SSSE3-NEXT:    sarq $63, %r14
3766; SSSE3-NEXT:    movd %r14d, %xmm6
3767; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3768; SSSE3-NEXT:    shlq $55, %r15
3769; SSSE3-NEXT:    sarq $63, %r15
3770; SSSE3-NEXT:    movd %r15d, %xmm3
3771; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
3772; SSSE3-NEXT:    shlq $63, %r12
3773; SSSE3-NEXT:    sarq $63, %r12
3774; SSSE3-NEXT:    movd %r12d, %xmm1
3775; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3776; SSSE3-NEXT:    shlq $50, %r13
3777; SSSE3-NEXT:    sarq $63, %r13
3778; SSSE3-NEXT:    movd %r13d, %xmm2
3779; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3780; SSSE3-NEXT:    shlq $58, %rbx
3781; SSSE3-NEXT:    sarq $63, %rbx
3782; SSSE3-NEXT:    movd %ebx, %xmm3
3783; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
3784; SSSE3-NEXT:    shlq $54, %rax
3785; SSSE3-NEXT:    sarq $63, %rax
3786; SSSE3-NEXT:    movd %eax, %xmm5
3787; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3788; SSSE3-NEXT:    shlq $62, %rcx
3789; SSSE3-NEXT:    sarq $63, %rcx
3790; SSSE3-NEXT:    movd %ecx, %xmm4
3791; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3792; SSSE3-NEXT:    shlq $52, %rsi
3793; SSSE3-NEXT:    sarq $63, %rsi
3794; SSSE3-NEXT:    movd %esi, %xmm2
3795; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3796; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3797; SSSE3-NEXT:    shlq $60, %rdi
3798; SSSE3-NEXT:    sarq $63, %rdi
3799; SSSE3-NEXT:    movd %edi, %xmm3
3800; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3801; SSSE3-NEXT:    shrq $15, %rbp
3802; SSSE3-NEXT:    movd %ebp, %xmm2
3803; SSSE3-NEXT:    shrq $7, %rdx
3804; SSSE3-NEXT:    movd %edx, %xmm5
3805; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
3806; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
3807; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
3808; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3809; SSSE3-NEXT:    popq %rbx
3810; SSSE3-NEXT:    popq %r12
3811; SSSE3-NEXT:    popq %r13
3812; SSSE3-NEXT:    popq %r14
3813; SSSE3-NEXT:    popq %r15
3814; SSSE3-NEXT:    popq %rbp
3815; SSSE3-NEXT:    retq
3816;
3817; SSE41-LABEL: load_sext_32i1_to_32i8:
3818; SSE41:       # BB#0: # %entry
3819; SSE41-NEXT:    movswq (%rdi), %rax
3820; SSE41-NEXT:    movq %rax, %rcx
3821; SSE41-NEXT:    shlq $62, %rcx
3822; SSE41-NEXT:    sarq $63, %rcx
3823; SSE41-NEXT:    movq %rax, %rdx
3824; SSE41-NEXT:    shlq $63, %rdx
3825; SSE41-NEXT:    sarq $63, %rdx
3826; SSE41-NEXT:    movd %edx, %xmm0
3827; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
3828; SSE41-NEXT:    movq %rax, %rcx
3829; SSE41-NEXT:    shlq $61, %rcx
3830; SSE41-NEXT:    sarq $63, %rcx
3831; SSE41-NEXT:    pinsrb $2, %ecx, %xmm0
3832; SSE41-NEXT:    movq %rax, %rcx
3833; SSE41-NEXT:    shlq $60, %rcx
3834; SSE41-NEXT:    sarq $63, %rcx
3835; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
3836; SSE41-NEXT:    movq %rax, %rcx
3837; SSE41-NEXT:    shlq $59, %rcx
3838; SSE41-NEXT:    sarq $63, %rcx
3839; SSE41-NEXT:    pinsrb $4, %ecx, %xmm0
3840; SSE41-NEXT:    movq %rax, %rcx
3841; SSE41-NEXT:    shlq $58, %rcx
3842; SSE41-NEXT:    sarq $63, %rcx
3843; SSE41-NEXT:    pinsrb $5, %ecx, %xmm0
3844; SSE41-NEXT:    movq %rax, %rcx
3845; SSE41-NEXT:    shlq $57, %rcx
3846; SSE41-NEXT:    sarq $63, %rcx
3847; SSE41-NEXT:    pinsrb $6, %ecx, %xmm0
3848; SSE41-NEXT:    movsbq %al, %rcx
3849; SSE41-NEXT:    shrq $7, %rcx
3850; SSE41-NEXT:    pinsrb $7, %ecx, %xmm0
3851; SSE41-NEXT:    movq %rax, %rcx
3852; SSE41-NEXT:    shlq $55, %rcx
3853; SSE41-NEXT:    sarq $63, %rcx
3854; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
3855; SSE41-NEXT:    movq %rax, %rcx
3856; SSE41-NEXT:    shlq $54, %rcx
3857; SSE41-NEXT:    sarq $63, %rcx
3858; SSE41-NEXT:    pinsrb $9, %ecx, %xmm0
3859; SSE41-NEXT:    movq %rax, %rcx
3860; SSE41-NEXT:    shlq $53, %rcx
3861; SSE41-NEXT:    sarq $63, %rcx
3862; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
3863; SSE41-NEXT:    movq %rax, %rcx
3864; SSE41-NEXT:    shlq $52, %rcx
3865; SSE41-NEXT:    sarq $63, %rcx
3866; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
3867; SSE41-NEXT:    movq %rax, %rcx
3868; SSE41-NEXT:    shlq $51, %rcx
3869; SSE41-NEXT:    sarq $63, %rcx
3870; SSE41-NEXT:    pinsrb $12, %ecx, %xmm0
3871; SSE41-NEXT:    movq %rax, %rcx
3872; SSE41-NEXT:    shlq $50, %rcx
3873; SSE41-NEXT:    sarq $63, %rcx
3874; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
3875; SSE41-NEXT:    movq %rax, %rcx
3876; SSE41-NEXT:    shlq $49, %rcx
3877; SSE41-NEXT:    sarq $63, %rcx
3878; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
3879; SSE41-NEXT:    shrq $15, %rax
3880; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
3881; SSE41-NEXT:    movswq 2(%rdi), %rax
3882; SSE41-NEXT:    movq %rax, %rcx
3883; SSE41-NEXT:    shlq $62, %rcx
3884; SSE41-NEXT:    sarq $63, %rcx
3885; SSE41-NEXT:    movq %rax, %rdx
3886; SSE41-NEXT:    shlq $63, %rdx
3887; SSE41-NEXT:    sarq $63, %rdx
3888; SSE41-NEXT:    movd %edx, %xmm1
3889; SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
3890; SSE41-NEXT:    movq %rax, %rcx
3891; SSE41-NEXT:    shlq $61, %rcx
3892; SSE41-NEXT:    sarq $63, %rcx
3893; SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
3894; SSE41-NEXT:    movq %rax, %rcx
3895; SSE41-NEXT:    shlq $60, %rcx
3896; SSE41-NEXT:    sarq $63, %rcx
3897; SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
3898; SSE41-NEXT:    movq %rax, %rcx
3899; SSE41-NEXT:    shlq $59, %rcx
3900; SSE41-NEXT:    sarq $63, %rcx
3901; SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
3902; SSE41-NEXT:    movq %rax, %rcx
3903; SSE41-NEXT:    shlq $58, %rcx
3904; SSE41-NEXT:    sarq $63, %rcx
3905; SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
3906; SSE41-NEXT:    movq %rax, %rcx
3907; SSE41-NEXT:    shlq $57, %rcx
3908; SSE41-NEXT:    sarq $63, %rcx
3909; SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
3910; SSE41-NEXT:    movsbq %al, %rcx
3911; SSE41-NEXT:    shrq $7, %rcx
3912; SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
3913; SSE41-NEXT:    movq %rax, %rcx
3914; SSE41-NEXT:    shlq $55, %rcx
3915; SSE41-NEXT:    sarq $63, %rcx
3916; SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
3917; SSE41-NEXT:    movq %rax, %rcx
3918; SSE41-NEXT:    shlq $54, %rcx
3919; SSE41-NEXT:    sarq $63, %rcx
3920; SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
3921; SSE41-NEXT:    movq %rax, %rcx
3922; SSE41-NEXT:    shlq $53, %rcx
3923; SSE41-NEXT:    sarq $63, %rcx
3924; SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
3925; SSE41-NEXT:    movq %rax, %rcx
3926; SSE41-NEXT:    shlq $52, %rcx
3927; SSE41-NEXT:    sarq $63, %rcx
3928; SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
3929; SSE41-NEXT:    movq %rax, %rcx
3930; SSE41-NEXT:    shlq $51, %rcx
3931; SSE41-NEXT:    sarq $63, %rcx
3932; SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
3933; SSE41-NEXT:    movq %rax, %rcx
3934; SSE41-NEXT:    shlq $50, %rcx
3935; SSE41-NEXT:    sarq $63, %rcx
3936; SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
3937; SSE41-NEXT:    movq %rax, %rcx
3938; SSE41-NEXT:    shlq $49, %rcx
3939; SSE41-NEXT:    sarq $63, %rcx
3940; SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
3941; SSE41-NEXT:    shrq $15, %rax
3942; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
3943; SSE41-NEXT:    retq
3944;
3945; AVX1-LABEL: load_sext_32i1_to_32i8:
3946; AVX1:       # BB#0: # %entry
3947; AVX1-NEXT:    pushq %rbp
3948; AVX1-NEXT:    pushq %r15
3949; AVX1-NEXT:    pushq %r14
3950; AVX1-NEXT:    pushq %r13
3951; AVX1-NEXT:    pushq %r12
3952; AVX1-NEXT:    pushq %rbx
3953; AVX1-NEXT:    movslq (%rdi), %rax
3954; AVX1-NEXT:    movq %rax, %rcx
3955; AVX1-NEXT:    shlq $47, %rcx
3956; AVX1-NEXT:    sarq $63, %rcx
3957; AVX1-NEXT:    vmovd %ecx, %xmm0
3958; AVX1-NEXT:    movq %rax, %r8
3959; AVX1-NEXT:    movq %rax, %rdx
3960; AVX1-NEXT:    movq %rax, %rcx
3961; AVX1-NEXT:    movq %rax, %rdi
3962; AVX1-NEXT:    movq %rax, %r13
3963; AVX1-NEXT:    movq %rax, %rsi
3964; AVX1-NEXT:    movq %rax, %r10
3965; AVX1-NEXT:    movq %rax, %r11
3966; AVX1-NEXT:    movq %rax, %r9
3967; AVX1-NEXT:    movq %rax, %rbx
3968; AVX1-NEXT:    movq %rax, %r14
3969; AVX1-NEXT:    movq %rax, %r15
3970; AVX1-NEXT:    movq %rax, %r12
3971; AVX1-NEXT:    movq %rax, %rbp
3972; AVX1-NEXT:    shlq $46, %rbp
3973; AVX1-NEXT:    sarq $63, %rbp
3974; AVX1-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
3975; AVX1-NEXT:    movq %rax, %rbp
3976; AVX1-NEXT:    shlq $45, %r8
3977; AVX1-NEXT:    sarq $63, %r8
3978; AVX1-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
3979; AVX1-NEXT:    movq %rax, %r8
3980; AVX1-NEXT:    shlq $44, %rdx
3981; AVX1-NEXT:    sarq $63, %rdx
3982; AVX1-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
3983; AVX1-NEXT:    movq %rax, %rdx
3984; AVX1-NEXT:    shlq $43, %rcx
3985; AVX1-NEXT:    sarq $63, %rcx
3986; AVX1-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
3987; AVX1-NEXT:    movq %rax, %rcx
3988; AVX1-NEXT:    shlq $42, %rdi
3989; AVX1-NEXT:    sarq $63, %rdi
3990; AVX1-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
3991; AVX1-NEXT:    movq %rax, %rdi
3992; AVX1-NEXT:    shlq $41, %r13
3993; AVX1-NEXT:    sarq $63, %r13
3994; AVX1-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
3995; AVX1-NEXT:    movq %rax, %r13
3996; AVX1-NEXT:    shlq $40, %rsi
3997; AVX1-NEXT:    sarq $63, %rsi
3998; AVX1-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
3999; AVX1-NEXT:    movq %rax, %rsi
4000; AVX1-NEXT:    shlq $39, %r10
4001; AVX1-NEXT:    sarq $63, %r10
4002; AVX1-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
4003; AVX1-NEXT:    movq %rax, %r10
4004; AVX1-NEXT:    shlq $38, %r11
4005; AVX1-NEXT:    sarq $63, %r11
4006; AVX1-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
4007; AVX1-NEXT:    movsbq %al, %r11
4008; AVX1-NEXT:    shlq $37, %r9
4009; AVX1-NEXT:    sarq $63, %r9
4010; AVX1-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
4011; AVX1-NEXT:    movq %rax, %r9
4012; AVX1-NEXT:    shlq $36, %rbx
4013; AVX1-NEXT:    sarq $63, %rbx
4014; AVX1-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
4015; AVX1-NEXT:    movq %rax, %rbx
4016; AVX1-NEXT:    shlq $35, %r14
4017; AVX1-NEXT:    sarq $63, %r14
4018; AVX1-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
4019; AVX1-NEXT:    movq %rax, %r14
4020; AVX1-NEXT:    shlq $34, %r15
4021; AVX1-NEXT:    sarq $63, %r15
4022; AVX1-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
4023; AVX1-NEXT:    movq %rax, %r15
4024; AVX1-NEXT:    shlq $33, %r12
4025; AVX1-NEXT:    sarq $63, %r12
4026; AVX1-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
4027; AVX1-NEXT:    movq %rax, %r12
4028; AVX1-NEXT:    shrq $31, %rbp
4029; AVX1-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
4030; AVX1-NEXT:    movq %rax, %rbp
4031; AVX1-NEXT:    shlq $63, %rdx
4032; AVX1-NEXT:    sarq $63, %rdx
4033; AVX1-NEXT:    vmovd %edx, %xmm1
4034; AVX1-NEXT:    movq %rax, %rdx
4035; AVX1-NEXT:    movswq %ax, %rax
4036; AVX1-NEXT:    shlq $62, %r8
4037; AVX1-NEXT:    sarq $63, %r8
4038; AVX1-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
4039; AVX1-NEXT:    shlq $61, %rcx
4040; AVX1-NEXT:    sarq $63, %rcx
4041; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
4042; AVX1-NEXT:    shlq $60, %rdi
4043; AVX1-NEXT:    sarq $63, %rdi
4044; AVX1-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
4045; AVX1-NEXT:    shlq $59, %r13
4046; AVX1-NEXT:    sarq $63, %r13
4047; AVX1-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
4048; AVX1-NEXT:    shlq $58, %rsi
4049; AVX1-NEXT:    sarq $63, %rsi
4050; AVX1-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
4051; AVX1-NEXT:    shlq $57, %r10
4052; AVX1-NEXT:    sarq $63, %r10
4053; AVX1-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
4054; AVX1-NEXT:    shrq $7, %r11
4055; AVX1-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
4056; AVX1-NEXT:    shlq $55, %r9
4057; AVX1-NEXT:    sarq $63, %r9
4058; AVX1-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
4059; AVX1-NEXT:    shlq $54, %rbx
4060; AVX1-NEXT:    sarq $63, %rbx
4061; AVX1-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
4062; AVX1-NEXT:    shlq $53, %r14
4063; AVX1-NEXT:    sarq $63, %r14
4064; AVX1-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
4065; AVX1-NEXT:    shlq $52, %r15
4066; AVX1-NEXT:    sarq $63, %r15
4067; AVX1-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
4068; AVX1-NEXT:    shlq $51, %r12
4069; AVX1-NEXT:    sarq $63, %r12
4070; AVX1-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
4071; AVX1-NEXT:    shlq $50, %rbp
4072; AVX1-NEXT:    sarq $63, %rbp
4073; AVX1-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
4074; AVX1-NEXT:    shlq $49, %rdx
4075; AVX1-NEXT:    sarq $63, %rdx
4076; AVX1-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
4077; AVX1-NEXT:    shrq $15, %rax
4078; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
4079; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4080; AVX1-NEXT:    popq %rbx
4081; AVX1-NEXT:    popq %r12
4082; AVX1-NEXT:    popq %r13
4083; AVX1-NEXT:    popq %r14
4084; AVX1-NEXT:    popq %r15
4085; AVX1-NEXT:    popq %rbp
4086; AVX1-NEXT:    retq
4087;
4088; AVX2-LABEL: load_sext_32i1_to_32i8:
4089; AVX2:       # BB#0: # %entry
4090; AVX2-NEXT:    pushq %rbp
4091; AVX2-NEXT:    pushq %r15
4092; AVX2-NEXT:    pushq %r14
4093; AVX2-NEXT:    pushq %r13
4094; AVX2-NEXT:    pushq %r12
4095; AVX2-NEXT:    pushq %rbx
4096; AVX2-NEXT:    movslq (%rdi), %rax
4097; AVX2-NEXT:    movq %rax, %rcx
4098; AVX2-NEXT:    shlq $47, %rcx
4099; AVX2-NEXT:    sarq $63, %rcx
4100; AVX2-NEXT:    vmovd %ecx, %xmm0
4101; AVX2-NEXT:    movq %rax, %r8
4102; AVX2-NEXT:    movq %rax, %rdx
4103; AVX2-NEXT:    movq %rax, %rcx
4104; AVX2-NEXT:    movq %rax, %rdi
4105; AVX2-NEXT:    movq %rax, %r13
4106; AVX2-NEXT:    movq %rax, %rsi
4107; AVX2-NEXT:    movq %rax, %r10
4108; AVX2-NEXT:    movq %rax, %r11
4109; AVX2-NEXT:    movq %rax, %r9
4110; AVX2-NEXT:    movq %rax, %rbx
4111; AVX2-NEXT:    movq %rax, %r14
4112; AVX2-NEXT:    movq %rax, %r15
4113; AVX2-NEXT:    movq %rax, %r12
4114; AVX2-NEXT:    movq %rax, %rbp
4115; AVX2-NEXT:    shlq $46, %rbp
4116; AVX2-NEXT:    sarq $63, %rbp
4117; AVX2-NEXT:    vpinsrb $1, %ebp, %xmm0, %xmm0
4118; AVX2-NEXT:    movq %rax, %rbp
4119; AVX2-NEXT:    shlq $45, %r8
4120; AVX2-NEXT:    sarq $63, %r8
4121; AVX2-NEXT:    vpinsrb $2, %r8d, %xmm0, %xmm0
4122; AVX2-NEXT:    movq %rax, %r8
4123; AVX2-NEXT:    shlq $44, %rdx
4124; AVX2-NEXT:    sarq $63, %rdx
4125; AVX2-NEXT:    vpinsrb $3, %edx, %xmm0, %xmm0
4126; AVX2-NEXT:    movq %rax, %rdx
4127; AVX2-NEXT:    shlq $43, %rcx
4128; AVX2-NEXT:    sarq $63, %rcx
4129; AVX2-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
4130; AVX2-NEXT:    movq %rax, %rcx
4131; AVX2-NEXT:    shlq $42, %rdi
4132; AVX2-NEXT:    sarq $63, %rdi
4133; AVX2-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
4134; AVX2-NEXT:    movq %rax, %rdi
4135; AVX2-NEXT:    shlq $41, %r13
4136; AVX2-NEXT:    sarq $63, %r13
4137; AVX2-NEXT:    vpinsrb $6, %r13d, %xmm0, %xmm0
4138; AVX2-NEXT:    movq %rax, %r13
4139; AVX2-NEXT:    shlq $40, %rsi
4140; AVX2-NEXT:    sarq $63, %rsi
4141; AVX2-NEXT:    vpinsrb $7, %esi, %xmm0, %xmm0
4142; AVX2-NEXT:    movq %rax, %rsi
4143; AVX2-NEXT:    shlq $39, %r10
4144; AVX2-NEXT:    sarq $63, %r10
4145; AVX2-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
4146; AVX2-NEXT:    movq %rax, %r10
4147; AVX2-NEXT:    shlq $38, %r11
4148; AVX2-NEXT:    sarq $63, %r11
4149; AVX2-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
4150; AVX2-NEXT:    movsbq %al, %r11
4151; AVX2-NEXT:    shlq $37, %r9
4152; AVX2-NEXT:    sarq $63, %r9
4153; AVX2-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
4154; AVX2-NEXT:    movq %rax, %r9
4155; AVX2-NEXT:    shlq $36, %rbx
4156; AVX2-NEXT:    sarq $63, %rbx
4157; AVX2-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
4158; AVX2-NEXT:    movq %rax, %rbx
4159; AVX2-NEXT:    shlq $35, %r14
4160; AVX2-NEXT:    sarq $63, %r14
4161; AVX2-NEXT:    vpinsrb $12, %r14d, %xmm0, %xmm0
4162; AVX2-NEXT:    movq %rax, %r14
4163; AVX2-NEXT:    shlq $34, %r15
4164; AVX2-NEXT:    sarq $63, %r15
4165; AVX2-NEXT:    vpinsrb $13, %r15d, %xmm0, %xmm0
4166; AVX2-NEXT:    movq %rax, %r15
4167; AVX2-NEXT:    shlq $33, %r12
4168; AVX2-NEXT:    sarq $63, %r12
4169; AVX2-NEXT:    vpinsrb $14, %r12d, %xmm0, %xmm0
4170; AVX2-NEXT:    movq %rax, %r12
4171; AVX2-NEXT:    shrq $31, %rbp
4172; AVX2-NEXT:    vpinsrb $15, %ebp, %xmm0, %xmm0
4173; AVX2-NEXT:    movq %rax, %rbp
4174; AVX2-NEXT:    shlq $63, %rdx
4175; AVX2-NEXT:    sarq $63, %rdx
4176; AVX2-NEXT:    vmovd %edx, %xmm1
4177; AVX2-NEXT:    movq %rax, %rdx
4178; AVX2-NEXT:    movswq %ax, %rax
4179; AVX2-NEXT:    shlq $62, %r8
4180; AVX2-NEXT:    sarq $63, %r8
4181; AVX2-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm1
4182; AVX2-NEXT:    shlq $61, %rcx
4183; AVX2-NEXT:    sarq $63, %rcx
4184; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
4185; AVX2-NEXT:    shlq $60, %rdi
4186; AVX2-NEXT:    sarq $63, %rdi
4187; AVX2-NEXT:    vpinsrb $3, %edi, %xmm1, %xmm1
4188; AVX2-NEXT:    shlq $59, %r13
4189; AVX2-NEXT:    sarq $63, %r13
4190; AVX2-NEXT:    vpinsrb $4, %r13d, %xmm1, %xmm1
4191; AVX2-NEXT:    shlq $58, %rsi
4192; AVX2-NEXT:    sarq $63, %rsi
4193; AVX2-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
4194; AVX2-NEXT:    shlq $57, %r10
4195; AVX2-NEXT:    sarq $63, %r10
4196; AVX2-NEXT:    vpinsrb $6, %r10d, %xmm1, %xmm1
4197; AVX2-NEXT:    shrq $7, %r11
4198; AVX2-NEXT:    vpinsrb $7, %r11d, %xmm1, %xmm1
4199; AVX2-NEXT:    shlq $55, %r9
4200; AVX2-NEXT:    sarq $63, %r9
4201; AVX2-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
4202; AVX2-NEXT:    shlq $54, %rbx
4203; AVX2-NEXT:    sarq $63, %rbx
4204; AVX2-NEXT:    vpinsrb $9, %ebx, %xmm1, %xmm1
4205; AVX2-NEXT:    shlq $53, %r14
4206; AVX2-NEXT:    sarq $63, %r14
4207; AVX2-NEXT:    vpinsrb $10, %r14d, %xmm1, %xmm1
4208; AVX2-NEXT:    shlq $52, %r15
4209; AVX2-NEXT:    sarq $63, %r15
4210; AVX2-NEXT:    vpinsrb $11, %r15d, %xmm1, %xmm1
4211; AVX2-NEXT:    shlq $51, %r12
4212; AVX2-NEXT:    sarq $63, %r12
4213; AVX2-NEXT:    vpinsrb $12, %r12d, %xmm1, %xmm1
4214; AVX2-NEXT:    shlq $50, %rbp
4215; AVX2-NEXT:    sarq $63, %rbp
4216; AVX2-NEXT:    vpinsrb $13, %ebp, %xmm1, %xmm1
4217; AVX2-NEXT:    shlq $49, %rdx
4218; AVX2-NEXT:    sarq $63, %rdx
4219; AVX2-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
4220; AVX2-NEXT:    shrq $15, %rax
4221; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
4222; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
4223; AVX2-NEXT:    popq %rbx
4224; AVX2-NEXT:    popq %r12
4225; AVX2-NEXT:    popq %r13
4226; AVX2-NEXT:    popq %r14
4227; AVX2-NEXT:    popq %r15
4228; AVX2-NEXT:    popq %rbp
4229; AVX2-NEXT:    retq
4230;
4231; AVX512-LABEL: load_sext_32i1_to_32i8:
4232; AVX512:       # BB#0: # %entry
4233; AVX512-NEXT:    kmovw (%rdi), %k1
4234; AVX512-NEXT:    kmovw 2(%rdi), %k2
4235; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
4236; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4237; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
4238; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
4239; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4240; AVX512-NEXT:    retq
4241;
4242; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
4243; X32-SSE41:       # BB#0: # %entry
4244; X32-SSE41-NEXT:    pushl %esi
4245; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4246; X32-SSE41-NEXT:    movswl (%eax), %ecx
4247; X32-SSE41-NEXT:    movl %ecx, %edx
4248; X32-SSE41-NEXT:    shll $30, %edx
4249; X32-SSE41-NEXT:    sarl $31, %edx
4250; X32-SSE41-NEXT:    movl %ecx, %esi
4251; X32-SSE41-NEXT:    shll $31, %esi
4252; X32-SSE41-NEXT:    sarl $31, %esi
4253; X32-SSE41-NEXT:    movd %esi, %xmm0
4254; X32-SSE41-NEXT:    pinsrb $1, %edx, %xmm0
4255; X32-SSE41-NEXT:    movl %ecx, %edx
4256; X32-SSE41-NEXT:    shll $29, %edx
4257; X32-SSE41-NEXT:    sarl $31, %edx
4258; X32-SSE41-NEXT:    pinsrb $2, %edx, %xmm0
4259; X32-SSE41-NEXT:    movl %ecx, %edx
4260; X32-SSE41-NEXT:    shll $28, %edx
4261; X32-SSE41-NEXT:    sarl $31, %edx
4262; X32-SSE41-NEXT:    pinsrb $3, %edx, %xmm0
4263; X32-SSE41-NEXT:    movl %ecx, %edx
4264; X32-SSE41-NEXT:    shll $27, %edx
4265; X32-SSE41-NEXT:    sarl $31, %edx
4266; X32-SSE41-NEXT:    pinsrb $4, %edx, %xmm0
4267; X32-SSE41-NEXT:    movl %ecx, %edx
4268; X32-SSE41-NEXT:    shll $26, %edx
4269; X32-SSE41-NEXT:    sarl $31, %edx
4270; X32-SSE41-NEXT:    pinsrb $5, %edx, %xmm0
4271; X32-SSE41-NEXT:    movl %ecx, %edx
4272; X32-SSE41-NEXT:    shll $25, %edx
4273; X32-SSE41-NEXT:    sarl $31, %edx
4274; X32-SSE41-NEXT:    pinsrb $6, %edx, %xmm0
4275; X32-SSE41-NEXT:    movsbl %cl, %edx
4276; X32-SSE41-NEXT:    shrl $7, %edx
4277; X32-SSE41-NEXT:    pinsrb $7, %edx, %xmm0
4278; X32-SSE41-NEXT:    movl %ecx, %edx
4279; X32-SSE41-NEXT:    shll $23, %edx
4280; X32-SSE41-NEXT:    sarl $31, %edx
4281; X32-SSE41-NEXT:    pinsrb $8, %edx, %xmm0
4282; X32-SSE41-NEXT:    movl %ecx, %edx
4283; X32-SSE41-NEXT:    shll $22, %edx
4284; X32-SSE41-NEXT:    sarl $31, %edx
4285; X32-SSE41-NEXT:    pinsrb $9, %edx, %xmm0
4286; X32-SSE41-NEXT:    movl %ecx, %edx
4287; X32-SSE41-NEXT:    shll $21, %edx
4288; X32-SSE41-NEXT:    sarl $31, %edx
4289; X32-SSE41-NEXT:    pinsrb $10, %edx, %xmm0
4290; X32-SSE41-NEXT:    movl %ecx, %edx
4291; X32-SSE41-NEXT:    shll $20, %edx
4292; X32-SSE41-NEXT:    sarl $31, %edx
4293; X32-SSE41-NEXT:    pinsrb $11, %edx, %xmm0
4294; X32-SSE41-NEXT:    movl %ecx, %edx
4295; X32-SSE41-NEXT:    shll $19, %edx
4296; X32-SSE41-NEXT:    sarl $31, %edx
4297; X32-SSE41-NEXT:    pinsrb $12, %edx, %xmm0
4298; X32-SSE41-NEXT:    movl %ecx, %edx
4299; X32-SSE41-NEXT:    shll $18, %edx
4300; X32-SSE41-NEXT:    sarl $31, %edx
4301; X32-SSE41-NEXT:    pinsrb $13, %edx, %xmm0
4302; X32-SSE41-NEXT:    movl %ecx, %edx
4303; X32-SSE41-NEXT:    shll $17, %edx
4304; X32-SSE41-NEXT:    sarl $31, %edx
4305; X32-SSE41-NEXT:    pinsrb $14, %edx, %xmm0
4306; X32-SSE41-NEXT:    shrl $15, %ecx
4307; X32-SSE41-NEXT:    pinsrb $15, %ecx, %xmm0
4308; X32-SSE41-NEXT:    movswl 2(%eax), %eax
4309; X32-SSE41-NEXT:    movl %eax, %ecx
4310; X32-SSE41-NEXT:    shll $30, %ecx
4311; X32-SSE41-NEXT:    sarl $31, %ecx
4312; X32-SSE41-NEXT:    movl %eax, %edx
4313; X32-SSE41-NEXT:    shll $31, %edx
4314; X32-SSE41-NEXT:    sarl $31, %edx
4315; X32-SSE41-NEXT:    movd %edx, %xmm1
4316; X32-SSE41-NEXT:    pinsrb $1, %ecx, %xmm1
4317; X32-SSE41-NEXT:    movl %eax, %ecx
4318; X32-SSE41-NEXT:    shll $29, %ecx
4319; X32-SSE41-NEXT:    sarl $31, %ecx
4320; X32-SSE41-NEXT:    pinsrb $2, %ecx, %xmm1
4321; X32-SSE41-NEXT:    movl %eax, %ecx
4322; X32-SSE41-NEXT:    shll $28, %ecx
4323; X32-SSE41-NEXT:    sarl $31, %ecx
4324; X32-SSE41-NEXT:    pinsrb $3, %ecx, %xmm1
4325; X32-SSE41-NEXT:    movl %eax, %ecx
4326; X32-SSE41-NEXT:    shll $27, %ecx
4327; X32-SSE41-NEXT:    sarl $31, %ecx
4328; X32-SSE41-NEXT:    pinsrb $4, %ecx, %xmm1
4329; X32-SSE41-NEXT:    movl %eax, %ecx
4330; X32-SSE41-NEXT:    shll $26, %ecx
4331; X32-SSE41-NEXT:    sarl $31, %ecx
4332; X32-SSE41-NEXT:    pinsrb $5, %ecx, %xmm1
4333; X32-SSE41-NEXT:    movl %eax, %ecx
4334; X32-SSE41-NEXT:    shll $25, %ecx
4335; X32-SSE41-NEXT:    sarl $31, %ecx
4336; X32-SSE41-NEXT:    pinsrb $6, %ecx, %xmm1
4337; X32-SSE41-NEXT:    movsbl %al, %ecx
4338; X32-SSE41-NEXT:    shrl $7, %ecx
4339; X32-SSE41-NEXT:    pinsrb $7, %ecx, %xmm1
4340; X32-SSE41-NEXT:    movl %eax, %ecx
4341; X32-SSE41-NEXT:    shll $23, %ecx
4342; X32-SSE41-NEXT:    sarl $31, %ecx
4343; X32-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
4344; X32-SSE41-NEXT:    movl %eax, %ecx
4345; X32-SSE41-NEXT:    shll $22, %ecx
4346; X32-SSE41-NEXT:    sarl $31, %ecx
4347; X32-SSE41-NEXT:    pinsrb $9, %ecx, %xmm1
4348; X32-SSE41-NEXT:    movl %eax, %ecx
4349; X32-SSE41-NEXT:    shll $21, %ecx
4350; X32-SSE41-NEXT:    sarl $31, %ecx
4351; X32-SSE41-NEXT:    pinsrb $10, %ecx, %xmm1
4352; X32-SSE41-NEXT:    movl %eax, %ecx
4353; X32-SSE41-NEXT:    shll $20, %ecx
4354; X32-SSE41-NEXT:    sarl $31, %ecx
4355; X32-SSE41-NEXT:    pinsrb $11, %ecx, %xmm1
4356; X32-SSE41-NEXT:    movl %eax, %ecx
4357; X32-SSE41-NEXT:    shll $19, %ecx
4358; X32-SSE41-NEXT:    sarl $31, %ecx
4359; X32-SSE41-NEXT:    pinsrb $12, %ecx, %xmm1
4360; X32-SSE41-NEXT:    movl %eax, %ecx
4361; X32-SSE41-NEXT:    shll $18, %ecx
4362; X32-SSE41-NEXT:    sarl $31, %ecx
4363; X32-SSE41-NEXT:    pinsrb $13, %ecx, %xmm1
4364; X32-SSE41-NEXT:    movl %eax, %ecx
4365; X32-SSE41-NEXT:    shll $17, %ecx
4366; X32-SSE41-NEXT:    sarl $31, %ecx
4367; X32-SSE41-NEXT:    pinsrb $14, %ecx, %xmm1
4368; X32-SSE41-NEXT:    shrl $15, %eax
4369; X32-SSE41-NEXT:    pinsrb $15, %eax, %xmm1
4370; X32-SSE41-NEXT:    popl %esi
4371; X32-SSE41-NEXT:    retl
4372entry:
4373 %X = load <32 x i1>, <32 x i1>* %ptr
4374 %Y = sext <32 x i1> %X to <32 x i8>
4375 ret <32 x i8> %Y
4376}
4377
4378define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
4379; SSE2-LABEL: load_sext_16i8_to_16i16:
4380; SSE2:       # BB#0: # %entry
4381; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4382; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4383; SSE2-NEXT:    psraw $8, %xmm0
4384; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4385; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4386; SSE2-NEXT:    psraw $8, %xmm1
4387; SSE2-NEXT:    retq
4388;
4389; SSSE3-LABEL: load_sext_16i8_to_16i16:
4390; SSSE3:       # BB#0: # %entry
4391; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4392; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4393; SSSE3-NEXT:    psraw $8, %xmm0
4394; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4395; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4396; SSSE3-NEXT:    psraw $8, %xmm1
4397; SSSE3-NEXT:    retq
4398;
4399; SSE41-LABEL: load_sext_16i8_to_16i16:
4400; SSE41:       # BB#0: # %entry
4401; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
4402; SSE41-NEXT:    pmovsxbw 8(%rdi), %xmm1
4403; SSE41-NEXT:    retq
4404;
4405; AVX1-LABEL: load_sext_16i8_to_16i16:
4406; AVX1:       # BB#0: # %entry
4407; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm0
4408; AVX1-NEXT:    vpmovsxbw 8(%rdi), %xmm1
4409; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4410; AVX1-NEXT:    retq
4411;
4412; AVX2-LABEL: load_sext_16i8_to_16i16:
4413; AVX2:       # BB#0: # %entry
4414; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm0
4415; AVX2-NEXT:    retq
4416;
4417; AVX512-LABEL: load_sext_16i8_to_16i16:
4418; AVX512:       # BB#0: # %entry
4419; AVX512-NEXT:    vpmovsxbw (%rdi), %ymm0
4420; AVX512-NEXT:    retq
4421;
4422; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
4423; X32-SSE41:       # BB#0: # %entry
4424; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4425; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
4426; X32-SSE41-NEXT:    pmovsxbw 8(%eax), %xmm1
4427; X32-SSE41-NEXT:    retl
4428entry:
4429 %X = load <16 x i8>, <16 x i8>* %ptr
4430 %Y = sext <16 x i8> %X to <16 x i16>
4431 ret <16 x i16> %Y
4432}
4433
4434define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
4435; SSE2-LABEL: load_sext_2i16_to_2i64:
4436; SSE2:       # BB#0: # %entry
4437; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4438; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4439; SSE2-NEXT:    movdqa %xmm0, %xmm1
4440; SSE2-NEXT:    psrad $31, %xmm1
4441; SSE2-NEXT:    psrad $16, %xmm0
4442; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4443; SSE2-NEXT:    retq
4444;
4445; SSSE3-LABEL: load_sext_2i16_to_2i64:
4446; SSSE3:       # BB#0: # %entry
4447; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
4448; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4449; SSSE3-NEXT:    movdqa %xmm0, %xmm1
4450; SSSE3-NEXT:    psrad $31, %xmm1
4451; SSSE3-NEXT:    psrad $16, %xmm0
4452; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4453; SSSE3-NEXT:    retq
4454;
4455; SSE41-LABEL: load_sext_2i16_to_2i64:
4456; SSE41:       # BB#0: # %entry
4457; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
4458; SSE41-NEXT:    retq
4459;
4460; AVX-LABEL: load_sext_2i16_to_2i64:
4461; AVX:       # BB#0: # %entry
4462; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
4463; AVX-NEXT:    retq
4464;
4465; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
4466; X32-SSE41:       # BB#0: # %entry
4467; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4468; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
4469; X32-SSE41-NEXT:    retl
4470entry:
4471 %X = load <2 x i16>, <2 x i16>* %ptr
4472 %Y = sext <2 x i16> %X to <2 x i64>
4473 ret <2 x i64> %Y
4474}
4475
4476define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
4477; SSE2-LABEL: load_sext_4i16_to_4i32:
4478; SSE2:       # BB#0: # %entry
4479; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4480; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4481; SSE2-NEXT:    psrad $16, %xmm0
4482; SSE2-NEXT:    retq
4483;
4484; SSSE3-LABEL: load_sext_4i16_to_4i32:
4485; SSSE3:       # BB#0: # %entry
4486; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4487; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4488; SSSE3-NEXT:    psrad $16, %xmm0
4489; SSSE3-NEXT:    retq
4490;
4491; SSE41-LABEL: load_sext_4i16_to_4i32:
4492; SSE41:       # BB#0: # %entry
4493; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4494; SSE41-NEXT:    retq
4495;
4496; AVX-LABEL: load_sext_4i16_to_4i32:
4497; AVX:       # BB#0: # %entry
4498; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
4499; AVX-NEXT:    retq
4500;
4501; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
4502; X32-SSE41:       # BB#0: # %entry
4503; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4504; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
4505; X32-SSE41-NEXT:    retl
4506entry:
4507 %X = load <4 x i16>, <4 x i16>* %ptr
4508 %Y = sext <4 x i16> %X to <4 x i32>
4509 ret <4 x i32> %Y
4510}
4511
4512define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
4513; SSE2-LABEL: load_sext_4i16_to_4i64:
4514; SSE2:       # BB#0: # %entry
4515; SSE2-NEXT:    movswq 2(%rdi), %rax
4516; SSE2-NEXT:    movd %rax, %xmm1
4517; SSE2-NEXT:    movswq (%rdi), %rax
4518; SSE2-NEXT:    movd %rax, %xmm0
4519; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4520; SSE2-NEXT:    movswq 6(%rdi), %rax
4521; SSE2-NEXT:    movd %rax, %xmm2
4522; SSE2-NEXT:    movswq 4(%rdi), %rax
4523; SSE2-NEXT:    movd %rax, %xmm1
4524; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4525; SSE2-NEXT:    retq
4526;
4527; SSSE3-LABEL: load_sext_4i16_to_4i64:
4528; SSSE3:       # BB#0: # %entry
4529; SSSE3-NEXT:    movswq 2(%rdi), %rax
4530; SSSE3-NEXT:    movd %rax, %xmm1
4531; SSSE3-NEXT:    movswq (%rdi), %rax
4532; SSSE3-NEXT:    movd %rax, %xmm0
4533; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4534; SSSE3-NEXT:    movswq 6(%rdi), %rax
4535; SSSE3-NEXT:    movd %rax, %xmm2
4536; SSSE3-NEXT:    movswq 4(%rdi), %rax
4537; SSSE3-NEXT:    movd %rax, %xmm1
4538; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
4539; SSSE3-NEXT:    retq
4540;
4541; SSE41-LABEL: load_sext_4i16_to_4i64:
4542; SSE41:       # BB#0: # %entry
4543; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
4544; SSE41-NEXT:    pmovsxwq 4(%rdi), %xmm1
4545; SSE41-NEXT:    retq
4546;
4547; AVX1-LABEL: load_sext_4i16_to_4i64:
4548; AVX1:       # BB#0: # %entry
4549; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
4550; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4551; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4552; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4553; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4554; AVX1-NEXT:    retq
4555;
4556; AVX2-LABEL: load_sext_4i16_to_4i64:
4557; AVX2:       # BB#0: # %entry
4558; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
4559; AVX2-NEXT:    retq
4560;
4561; AVX512-LABEL: load_sext_4i16_to_4i64:
4562; AVX512:       # BB#0: # %entry
4563; AVX512-NEXT:    vpmovsxwq (%rdi), %ymm0
4564; AVX512-NEXT:    retq
4565;
4566; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
4567; X32-SSE41:       # BB#0: # %entry
4568; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4569; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
4570; X32-SSE41-NEXT:    pmovsxwq 4(%eax), %xmm1
4571; X32-SSE41-NEXT:    retl
4572entry:
4573 %X = load <4 x i16>, <4 x i16>* %ptr
4574 %Y = sext <4 x i16> %X to <4 x i64>
4575 ret <4 x i64> %Y
4576}
4577
4578define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
4579; SSE2-LABEL: load_sext_8i16_to_8i32:
4580; SSE2:       # BB#0: # %entry
4581; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4582; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4583; SSE2-NEXT:    psrad $16, %xmm0
4584; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4585; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
4586; SSE2-NEXT:    psrad $16, %xmm1
4587; SSE2-NEXT:    retq
4588;
4589; SSSE3-LABEL: load_sext_8i16_to_8i32:
4590; SSSE3:       # BB#0: # %entry
4591; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4592; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
4593; SSSE3-NEXT:    psrad $16, %xmm0
4594; SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
4595; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
4596; SSSE3-NEXT:    psrad $16, %xmm1
4597; SSSE3-NEXT:    retq
4598;
4599; SSE41-LABEL: load_sext_8i16_to_8i32:
4600; SSE41:       # BB#0: # %entry
4601; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
4602; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
4603; SSE41-NEXT:    retq
4604;
4605; AVX1-LABEL: load_sext_8i16_to_8i32:
4606; AVX1:       # BB#0: # %entry
4607; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
4608; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
4609; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4610; AVX1-NEXT:    retq
4611;
4612; AVX2-LABEL: load_sext_8i16_to_8i32:
4613; AVX2:       # BB#0: # %entry
4614; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
4615; AVX2-NEXT:    retq
4616;
4617; AVX512-LABEL: load_sext_8i16_to_8i32:
4618; AVX512:       # BB#0: # %entry
4619; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
4620; AVX512-NEXT:    retq
4621;
4622; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
4623; X32-SSE41:       # BB#0: # %entry
4624; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4625; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
4626; X32-SSE41-NEXT:    pmovsxwd 8(%eax), %xmm1
4627; X32-SSE41-NEXT:    retl
4628entry:
4629 %X = load <8 x i16>, <8 x i16>* %ptr
4630 %Y = sext <8 x i16> %X to <8 x i32>
4631 ret <8 x i32> %Y
4632}
4633
4634define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
4635; SSE2-LABEL: load_sext_2i32_to_2i64:
4636; SSE2:       # BB#0: # %entry
4637; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4638; SSE2-NEXT:    movdqa %xmm0, %xmm1
4639; SSE2-NEXT:    psrad $31, %xmm1
4640; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4641; SSE2-NEXT:    retq
4642;
4643; SSSE3-LABEL: load_sext_2i32_to_2i64:
4644; SSSE3:       # BB#0: # %entry
4645; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4646; SSSE3-NEXT:    movdqa %xmm0, %xmm1
4647; SSSE3-NEXT:    psrad $31, %xmm1
4648; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4649; SSSE3-NEXT:    retq
4650;
4651; SSE41-LABEL: load_sext_2i32_to_2i64:
4652; SSE41:       # BB#0: # %entry
4653; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
4654; SSE41-NEXT:    retq
4655;
4656; AVX-LABEL: load_sext_2i32_to_2i64:
4657; AVX:       # BB#0: # %entry
4658; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
4659; AVX-NEXT:    retq
4660;
4661; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
4662; X32-SSE41:       # BB#0: # %entry
4663; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4664; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
4665; X32-SSE41-NEXT:    retl
4666entry:
4667 %X = load <2 x i32>, <2 x i32>* %ptr
4668 %Y = sext <2 x i32> %X to <2 x i64>
4669 ret <2 x i64> %Y
4670}
4671
4672define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
4673; SSE2-LABEL: load_sext_4i32_to_4i64:
4674; SSE2:       # BB#0: # %entry
4675; SSE2-NEXT:    movdqa (%rdi), %xmm0
4676; SSE2-NEXT:    movdqa %xmm0, %xmm2
4677; SSE2-NEXT:    psrad $31, %xmm2
4678; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4679; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4680; SSE2-NEXT:    movdqa %xmm1, %xmm2
4681; SSE2-NEXT:    psrad $31, %xmm2
4682; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4683; SSE2-NEXT:    retq
4684;
4685; SSSE3-LABEL: load_sext_4i32_to_4i64:
4686; SSSE3:       # BB#0: # %entry
4687; SSSE3-NEXT:    movdqa (%rdi), %xmm0
4688; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4689; SSSE3-NEXT:    psrad $31, %xmm2
4690; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4691; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4692; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4693; SSSE3-NEXT:    psrad $31, %xmm2
4694; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4695; SSSE3-NEXT:    retq
4696;
4697; SSE41-LABEL: load_sext_4i32_to_4i64:
4698; SSE41:       # BB#0: # %entry
4699; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
4700; SSE41-NEXT:    pmovsxdq 8(%rdi), %xmm1
4701; SSE41-NEXT:    retq
4702;
4703; AVX1-LABEL: load_sext_4i32_to_4i64:
4704; AVX1:       # BB#0: # %entry
4705; AVX1-NEXT:    vpmovsxdq (%rdi), %xmm0
4706; AVX1-NEXT:    vpmovsxdq 8(%rdi), %xmm1
4707; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4708; AVX1-NEXT:    retq
4709;
4710; AVX2-LABEL: load_sext_4i32_to_4i64:
4711; AVX2:       # BB#0: # %entry
4712; AVX2-NEXT:    vpmovsxdq (%rdi), %ymm0
4713; AVX2-NEXT:    retq
4714;
4715; AVX512-LABEL: load_sext_4i32_to_4i64:
4716; AVX512:       # BB#0: # %entry
4717; AVX512-NEXT:    vpmovsxdq (%rdi), %ymm0
4718; AVX512-NEXT:    retq
4719;
4720; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
4721; X32-SSE41:       # BB#0: # %entry
4722; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
4723; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
4724; X32-SSE41-NEXT:    pmovsxdq 8(%eax), %xmm1
4725; X32-SSE41-NEXT:    retl
4726entry:
4727 %X = load <4 x i32>, <4 x i32>* %ptr
4728 %Y = sext <4 x i32> %X to <4 x i64>
4729 ret <4 x i64> %Y
4730}
4731
4732define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
4733; SSE2-LABEL: sext_2i8_to_i32:
4734; SSE2:       # BB#0: # %entry
4735; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4736; SSE2-NEXT:    psraw $8, %xmm0
4737; SSE2-NEXT:    movd %xmm0, %eax
4738; SSE2-NEXT:    retq
4739;
4740; SSSE3-LABEL: sext_2i8_to_i32:
4741; SSSE3:       # BB#0: # %entry
4742; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
4743; SSSE3-NEXT:    psraw $8, %xmm0
4744; SSSE3-NEXT:    movd %xmm0, %eax
4745; SSSE3-NEXT:    retq
4746;
4747; SSE41-LABEL: sext_2i8_to_i32:
4748; SSE41:       # BB#0: # %entry
4749; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
4750; SSE41-NEXT:    movd %xmm0, %eax
4751; SSE41-NEXT:    retq
4752;
4753; AVX-LABEL: sext_2i8_to_i32:
4754; AVX:       # BB#0: # %entry
4755; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
4756; AVX-NEXT:    vmovd %xmm0, %eax
4757; AVX-NEXT:    retq
4758;
4759; X32-SSE41-LABEL: sext_2i8_to_i32:
4760; X32-SSE41:       # BB#0: # %entry
4761; X32-SSE41-NEXT:    pushl %eax
4762; X32-SSE41-NEXT:  .Lcfi0:
4763; X32-SSE41-NEXT:    .cfi_def_cfa_offset 8
4764; X32-SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
4765; X32-SSE41-NEXT:    movd %xmm0, %eax
4766; X32-SSE41-NEXT:    popl %ecx
4767; X32-SSE41-NEXT:    retl
4768entry:
4769  %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
4770  %Ex = sext <2 x i8> %Shuf to <2 x i16>
4771  %Bc = bitcast <2 x i16> %Ex to i32
4772  ret i32 %Bc
4773}
4774
4775define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
4776; SSE2-LABEL: sext_4i1_to_4i64:
4777; SSE2:       # BB#0:
4778; SSE2-NEXT:    pslld $31, %xmm0
4779; SSE2-NEXT:    psrad $31, %xmm0
4780; SSE2-NEXT:    movdqa %xmm0, %xmm2
4781; SSE2-NEXT:    psrad $31, %xmm2
4782; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4783; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4784; SSE2-NEXT:    movdqa %xmm1, %xmm2
4785; SSE2-NEXT:    psrad $31, %xmm2
4786; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4787; SSE2-NEXT:    retq
4788;
4789; SSSE3-LABEL: sext_4i1_to_4i64:
4790; SSSE3:       # BB#0:
4791; SSSE3-NEXT:    pslld $31, %xmm0
4792; SSSE3-NEXT:    psrad $31, %xmm0
4793; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4794; SSSE3-NEXT:    psrad $31, %xmm2
4795; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4796; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4797; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4798; SSSE3-NEXT:    psrad $31, %xmm2
4799; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4800; SSSE3-NEXT:    retq
4801;
4802; SSE41-LABEL: sext_4i1_to_4i64:
4803; SSE41:       # BB#0:
4804; SSE41-NEXT:    pslld $31, %xmm0
4805; SSE41-NEXT:    psrad $31, %xmm0
4806; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4807; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4808; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4809; SSE41-NEXT:    movdqa %xmm2, %xmm0
4810; SSE41-NEXT:    retq
4811;
4812; AVX1-LABEL: sext_4i1_to_4i64:
4813; AVX1:       # BB#0:
4814; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
4815; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
4816; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4817; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4818; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4819; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4820; AVX1-NEXT:    retq
4821;
4822; AVX2-LABEL: sext_4i1_to_4i64:
4823; AVX2:       # BB#0:
4824; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
4825; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
4826; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
4827; AVX2-NEXT:    retq
4828;
4829; AVX512-LABEL: sext_4i1_to_4i64:
4830; AVX512:       # BB#0:
4831; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
4832; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
4833; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
4834; AVX512-NEXT:    retq
4835;
4836; X32-SSE41-LABEL: sext_4i1_to_4i64:
4837; X32-SSE41:       # BB#0:
4838; X32-SSE41-NEXT:    pslld $31, %xmm0
4839; X32-SSE41-NEXT:    psrad $31, %xmm0
4840; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4841; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4842; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4843; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
4844; X32-SSE41-NEXT:    retl
4845  %extmask = sext <4 x i1> %mask to <4 x i64>
4846  ret <4 x i64> %extmask
4847}
4848
4849define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
4850; SSE2-LABEL: sext_4i8_to_4i64:
4851; SSE2:       # BB#0:
4852; SSE2-NEXT:    pslld $24, %xmm0
4853; SSE2-NEXT:    psrad $24, %xmm0
4854; SSE2-NEXT:    movdqa %xmm0, %xmm2
4855; SSE2-NEXT:    psrad $31, %xmm2
4856; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4857; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4858; SSE2-NEXT:    movdqa %xmm1, %xmm2
4859; SSE2-NEXT:    psrad $31, %xmm2
4860; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4861; SSE2-NEXT:    retq
4862;
4863; SSSE3-LABEL: sext_4i8_to_4i64:
4864; SSSE3:       # BB#0:
4865; SSSE3-NEXT:    pslld $24, %xmm0
4866; SSSE3-NEXT:    psrad $24, %xmm0
4867; SSSE3-NEXT:    movdqa %xmm0, %xmm2
4868; SSSE3-NEXT:    psrad $31, %xmm2
4869; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4870; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4871; SSSE3-NEXT:    movdqa %xmm1, %xmm2
4872; SSSE3-NEXT:    psrad $31, %xmm2
4873; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4874; SSSE3-NEXT:    retq
4875;
4876; SSE41-LABEL: sext_4i8_to_4i64:
4877; SSE41:       # BB#0:
4878; SSE41-NEXT:    pslld $24, %xmm0
4879; SSE41-NEXT:    psrad $24, %xmm0
4880; SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4881; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4882; SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4883; SSE41-NEXT:    movdqa %xmm2, %xmm0
4884; SSE41-NEXT:    retq
4885;
4886; AVX1-LABEL: sext_4i8_to_4i64:
4887; AVX1:       # BB#0:
4888; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
4889; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
4890; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
4891; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4892; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
4893; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
4894; AVX1-NEXT:    retq
4895;
4896; AVX2-LABEL: sext_4i8_to_4i64:
4897; AVX2:       # BB#0:
4898; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
4899; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
4900; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
4901; AVX2-NEXT:    retq
4902;
4903; AVX512-LABEL: sext_4i8_to_4i64:
4904; AVX512:       # BB#0:
4905; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
4906; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
4907; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
4908; AVX512-NEXT:    retq
4909;
4910; X32-SSE41-LABEL: sext_4i8_to_4i64:
4911; X32-SSE41:       # BB#0:
4912; X32-SSE41-NEXT:    pslld $24, %xmm0
4913; X32-SSE41-NEXT:    psrad $24, %xmm0
4914; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm2
4915; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
4916; X32-SSE41-NEXT:    pmovsxdq %xmm0, %xmm1
4917; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
4918; X32-SSE41-NEXT:    retl
4919  %extmask = sext <4 x i8> %mask to <4 x i64>
4920  ret <4 x i64> %extmask
4921}
4922
4923define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
4924; SSE-LABEL: sext_32xi1_to_32xi8:
4925; SSE:       # BB#0:
4926; SSE-NEXT:    pcmpeqw %xmm5, %xmm1
4927; SSE-NEXT:    pcmpeqw %xmm4, %xmm0
4928; SSE-NEXT:    packsswb %xmm1, %xmm0
4929; SSE-NEXT:    pcmpeqw %xmm7, %xmm3
4930; SSE-NEXT:    pcmpeqw %xmm6, %xmm2
4931; SSE-NEXT:    packsswb %xmm3, %xmm2
4932; SSE-NEXT:    movdqa %xmm2, %xmm1
4933; SSE-NEXT:    retq
4934;
4935; AVX1-LABEL: sext_32xi1_to_32xi8:
4936; AVX1:       # BB#0:
4937; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
4938; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
4939; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
4940; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
4941; AVX1-NEXT:    vpacksswb %xmm4, %xmm1, %xmm1
4942; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
4943; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
4944; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm4, %xmm3
4945; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
4946; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
4947; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4948; AVX1-NEXT:    retq
4949;
4950; AVX2-LABEL: sext_32xi1_to_32xi8:
4951; AVX2:       # BB#0:
4952; AVX2-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
4953; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
4954; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
4955; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
4956; AVX2-NEXT:    retq
4957;
4958; AVX512F-LABEL: sext_32xi1_to_32xi8:
4959; AVX512F:       # BB#0:
4960; AVX512F-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
4961; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
4962; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4963; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
4964; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
4965; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
4966; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4967; AVX512F-NEXT:    retq
4968;
4969; AVX512BW-LABEL: sext_32xi1_to_32xi8:
4970; AVX512BW:       # BB#0:
4971; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
4972; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
4973; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
4974; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4975; AVX512BW-NEXT:    retq
4976;
4977; X32-SSE41-LABEL: sext_32xi1_to_32xi8:
4978; X32-SSE41:       # BB#0:
4979; X32-SSE41-NEXT:    pushl %ebp
4980; X32-SSE41-NEXT:    movl %esp, %ebp
4981; X32-SSE41-NEXT:    andl $-16, %esp
4982; X32-SSE41-NEXT:    subl $16, %esp
4983; X32-SSE41-NEXT:    movdqa 8(%ebp), %xmm3
4984; X32-SSE41-NEXT:    pcmpeqw 40(%ebp), %xmm1
4985; X32-SSE41-NEXT:    pcmpeqw 24(%ebp), %xmm0
4986; X32-SSE41-NEXT:    packsswb %xmm1, %xmm0
4987; X32-SSE41-NEXT:    pcmpeqw 72(%ebp), %xmm3
4988; X32-SSE41-NEXT:    pcmpeqw 56(%ebp), %xmm2
4989; X32-SSE41-NEXT:    packsswb %xmm3, %xmm2
4990; X32-SSE41-NEXT:    movdqa %xmm2, %xmm1
4991; X32-SSE41-NEXT:    movl %ebp, %esp
4992; X32-SSE41-NEXT:    popl %ebp
4993; X32-SSE41-NEXT:    retl
4994  %a = icmp eq <32 x i16> %c1, %c2
4995  %b = sext <32 x i1> %a to <32 x i8>
4996  ret <32 x i8> %b
4997}
4998