1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2
3; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
4; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
5; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
6; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
7; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
8
9;
10; vXi8
11;
12
; Loads two consecutive i8 values from %p0, zero-extends each to i64, and
; assembles a <2 x i64> one insertelement at a time. Per the autogenerated
; CHECK lines below: SSE2 keeps the scalar sequence (not profitable there),
; while SLM and AVX fold it into a single <2 x i8> vector load + zext.
13define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
14; SSE2-LABEL: @loadext_2i8_to_2i64(
15; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
16; SSE2-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
17; SSE2-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
18; SSE2-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
19; SSE2-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
20; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
21; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
22; SSE2-NEXT:    ret <2 x i64> [[V1]]
23;
24; SLM-LABEL: @loadext_2i8_to_2i64(
25; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <2 x i8>*
26; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
27; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
28; SLM-NEXT:    ret <2 x i64> [[TMP3]]
29;
30; AVX-LABEL: @loadext_2i8_to_2i64(
31; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <2 x i8>*
32; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
33; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
34; AVX-NEXT:    ret <2 x i64> [[TMP3]]
35;
36  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
37  %i0 = load i8, i8* %p0, align 1
38  %i1 = load i8, i8* %p1, align 1
39  %x0 = zext i8 %i0 to i64
40  %x1 = zext i8 %i1 to i64
41  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
42  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
43  ret <2 x i64> %v1
44}
45
; Loads four consecutive i8 values, zero-extends each to i32, and builds a
; <4 x i32> via an insertelement chain. All three configurations (SSE2, SLM,
; AVX) are expected to vectorize this into a <4 x i8> load + vector zext.
46define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
47; SSE2-LABEL: @loadext_4i8_to_4i32(
48; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <4 x i8>*
49; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
50; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
51; SSE2-NEXT:    ret <4 x i32> [[TMP3]]
52;
53; SLM-LABEL: @loadext_4i8_to_4i32(
54; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <4 x i8>*
55; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
56; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
57; SLM-NEXT:    ret <4 x i32> [[TMP3]]
58;
59; AVX-LABEL: @loadext_4i8_to_4i32(
60; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <4 x i8>*
61; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
62; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
63; AVX-NEXT:    ret <4 x i32> [[TMP3]]
64;
65  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
66  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
67  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
68  %i0 = load i8, i8* %p0, align 1
69  %i1 = load i8, i8* %p1, align 1
70  %i2 = load i8, i8* %p2, align 1
71  %i3 = load i8, i8* %p3, align 1
72  %x0 = zext i8 %i0 to i32
73  %x1 = zext i8 %i1 to i32
74  %x2 = zext i8 %i2 to i32
75  %x3 = zext i8 %i3 to i32
76  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
77  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
78  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
79  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
80  ret <4 x i32> %v3
81}
82
; Loads four consecutive i8 values, zero-extends each to i64, and builds a
; <4 x i64> via an insertelement chain. All three configurations are expected
; to vectorize this into a <4 x i8> load + vector zext to <4 x i64>.
83define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
84; SSE2-LABEL: @loadext_4i8_to_4i64(
85; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <4 x i8>*
86; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
87; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
88; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
89;
90; SLM-LABEL: @loadext_4i8_to_4i64(
91; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <4 x i8>*
92; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
93; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
94; SLM-NEXT:    ret <4 x i64> [[TMP3]]
95;
96; AVX-LABEL: @loadext_4i8_to_4i64(
97; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <4 x i8>*
98; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
99; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
100; AVX-NEXT:    ret <4 x i64> [[TMP3]]
101;
102  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
103  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
104  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
105  %i0 = load i8, i8* %p0, align 1
106  %i1 = load i8, i8* %p1, align 1
107  %i2 = load i8, i8* %p2, align 1
108  %i3 = load i8, i8* %p3, align 1
109  %x0 = zext i8 %i0 to i64
110  %x1 = zext i8 %i1 to i64
111  %x2 = zext i8 %i2 to i64
112  %x3 = zext i8 %i3 to i64
113  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
114  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
115  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
116  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
117  ret <4 x i64> %v3
118}
119
; Loads eight consecutive i8 values, zero-extends each to i16, and builds an
; <8 x i16> via an insertelement chain. All three configurations are expected
; to vectorize this into an <8 x i8> load + vector zext.
120define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
121; SSE2-LABEL: @loadext_8i8_to_8i16(
122; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <8 x i8>*
123; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
124; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
125; SSE2-NEXT:    ret <8 x i16> [[TMP3]]
126;
127; SLM-LABEL: @loadext_8i8_to_8i16(
128; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <8 x i8>*
129; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
130; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
131; SLM-NEXT:    ret <8 x i16> [[TMP3]]
132;
133; AVX-LABEL: @loadext_8i8_to_8i16(
134; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <8 x i8>*
135; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
136; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
137; AVX-NEXT:    ret <8 x i16> [[TMP3]]
138;
139  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
140  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
141  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
142  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
143  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
144  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
145  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
146  %i0 = load i8, i8* %p0, align 1
147  %i1 = load i8, i8* %p1, align 1
148  %i2 = load i8, i8* %p2, align 1
149  %i3 = load i8, i8* %p3, align 1
150  %i4 = load i8, i8* %p4, align 1
151  %i5 = load i8, i8* %p5, align 1
152  %i6 = load i8, i8* %p6, align 1
153  %i7 = load i8, i8* %p7, align 1
154  %x0 = zext i8 %i0 to i16
155  %x1 = zext i8 %i1 to i16
156  %x2 = zext i8 %i2 to i16
157  %x3 = zext i8 %i3 to i16
158  %x4 = zext i8 %i4 to i16
159  %x5 = zext i8 %i5 to i16
160  %x6 = zext i8 %i6 to i16
161  %x7 = zext i8 %i7 to i16
162  %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
163  %v1 = insertelement <8 x i16>   %v0, i16 %x1, i32 1
164  %v2 = insertelement <8 x i16>   %v1, i16 %x2, i32 2
165  %v3 = insertelement <8 x i16>   %v2, i16 %x3, i32 3
166  %v4 = insertelement <8 x i16>   %v3, i16 %x4, i32 4
167  %v5 = insertelement <8 x i16>   %v4, i16 %x5, i32 5
168  %v6 = insertelement <8 x i16>   %v5, i16 %x6, i32 6
169  %v7 = insertelement <8 x i16>   %v6, i16 %x7, i32 7
170  ret <8 x i16> %v7
171}
172
; Loads eight consecutive i8 values, zero-extends each to i32, and builds an
; <8 x i32> via an insertelement chain. All three configurations are expected
; to vectorize this into an <8 x i8> load + vector zext to <8 x i32>.
173define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
174; SSE2-LABEL: @loadext_8i8_to_8i32(
175; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <8 x i8>*
176; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
177; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
178; SSE2-NEXT:    ret <8 x i32> [[TMP3]]
179;
180; SLM-LABEL: @loadext_8i8_to_8i32(
181; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <8 x i8>*
182; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
183; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
184; SLM-NEXT:    ret <8 x i32> [[TMP3]]
185;
186; AVX-LABEL: @loadext_8i8_to_8i32(
187; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <8 x i8>*
188; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
189; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
190; AVX-NEXT:    ret <8 x i32> [[TMP3]]
191;
192  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
193  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
194  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
195  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
196  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
197  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
198  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
199  %i0 = load i8, i8* %p0, align 1
200  %i1 = load i8, i8* %p1, align 1
201  %i2 = load i8, i8* %p2, align 1
202  %i3 = load i8, i8* %p3, align 1
203  %i4 = load i8, i8* %p4, align 1
204  %i5 = load i8, i8* %p5, align 1
205  %i6 = load i8, i8* %p6, align 1
206  %i7 = load i8, i8* %p7, align 1
207  %x0 = zext i8 %i0 to i32
208  %x1 = zext i8 %i1 to i32
209  %x2 = zext i8 %i2 to i32
210  %x3 = zext i8 %i3 to i32
211  %x4 = zext i8 %i4 to i32
212  %x5 = zext i8 %i5 to i32
213  %x6 = zext i8 %i6 to i32
214  %x7 = zext i8 %i7 to i32
215  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
216  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
217  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
218  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
219  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
220  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
221  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
222  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
223  ret <8 x i32> %v7
224}
225
; Loads sixteen consecutive i8 values, zero-extends each to i16, and builds a
; <16 x i16> via an insertelement chain. All three configurations are expected
; to vectorize this into a single <16 x i8> load + vector zext.
226define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
227; SSE2-LABEL: @loadext_16i8_to_16i16(
228; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <16 x i8>*
229; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
230; SSE2-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
231; SSE2-NEXT:    ret <16 x i16> [[TMP3]]
232;
233; SLM-LABEL: @loadext_16i8_to_16i16(
234; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <16 x i8>*
235; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
236; SLM-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
237; SLM-NEXT:    ret <16 x i16> [[TMP3]]
238;
239; AVX-LABEL: @loadext_16i8_to_16i16(
240; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0:%.*]] to <16 x i8>*
241; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
242; AVX-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
243; AVX-NEXT:    ret <16 x i16> [[TMP3]]
244;
245  %p1  = getelementptr inbounds i8, i8* %p0, i64 1
246  %p2  = getelementptr inbounds i8, i8* %p0, i64 2
247  %p3  = getelementptr inbounds i8, i8* %p0, i64 3
248  %p4  = getelementptr inbounds i8, i8* %p0, i64 4
249  %p5  = getelementptr inbounds i8, i8* %p0, i64 5
250  %p6  = getelementptr inbounds i8, i8* %p0, i64 6
251  %p7  = getelementptr inbounds i8, i8* %p0, i64 7
252  %p8  = getelementptr inbounds i8, i8* %p0, i64 8
253  %p9  = getelementptr inbounds i8, i8* %p0, i64 9
254  %p10 = getelementptr inbounds i8, i8* %p0, i64 10
255  %p11 = getelementptr inbounds i8, i8* %p0, i64 11
256  %p12 = getelementptr inbounds i8, i8* %p0, i64 12
257  %p13 = getelementptr inbounds i8, i8* %p0, i64 13
258  %p14 = getelementptr inbounds i8, i8* %p0, i64 14
259  %p15 = getelementptr inbounds i8, i8* %p0, i64 15
260  %i0  = load i8, i8* %p0,  align 1
261  %i1  = load i8, i8* %p1,  align 1
262  %i2  = load i8, i8* %p2,  align 1
263  %i3  = load i8, i8* %p3,  align 1
264  %i4  = load i8, i8* %p4,  align 1
265  %i5  = load i8, i8* %p5,  align 1
266  %i6  = load i8, i8* %p6,  align 1
267  %i7  = load i8, i8* %p7,  align 1
268  %i8  = load i8, i8* %p8,  align 1
269  %i9  = load i8, i8* %p9,  align 1
270  %i10 = load i8, i8* %p10, align 1
271  %i11 = load i8, i8* %p11, align 1
272  %i12 = load i8, i8* %p12, align 1
273  %i13 = load i8, i8* %p13, align 1
274  %i14 = load i8, i8* %p14, align 1
275  %i15 = load i8, i8* %p15, align 1
276  %x0  = zext i8 %i0  to i16
277  %x1  = zext i8 %i1  to i16
278  %x2  = zext i8 %i2  to i16
279  %x3  = zext i8 %i3  to i16
280  %x4  = zext i8 %i4  to i16
281  %x5  = zext i8 %i5  to i16
282  %x6  = zext i8 %i6  to i16
283  %x7  = zext i8 %i7  to i16
284  %x8  = zext i8 %i8  to i16
285  %x9  = zext i8 %i9  to i16
286  %x10 = zext i8 %i10 to i16
287  %x11 = zext i8 %i11 to i16
288  %x12 = zext i8 %i12 to i16
289  %x13 = zext i8 %i13 to i16
290  %x14 = zext i8 %i14 to i16
291  %x15 = zext i8 %i15 to i16
292  %v0  = insertelement <16 x i16> undef, i16 %x0,  i32 0
293  %v1  = insertelement <16 x i16>  %v0,  i16 %x1,  i32 1
294  %v2  = insertelement <16 x i16>  %v1,  i16 %x2,  i32 2
295  %v3  = insertelement <16 x i16>  %v2,  i16 %x3,  i32 3
296  %v4  = insertelement <16 x i16>  %v3,  i16 %x4,  i32 4
297  %v5  = insertelement <16 x i16>  %v4,  i16 %x5,  i32 5
298  %v6  = insertelement <16 x i16>  %v5,  i16 %x6,  i32 6
299  %v7  = insertelement <16 x i16>  %v6,  i16 %x7,  i32 7
300  %v8  = insertelement <16 x i16>  %v7,  i16 %x8,  i32 8
301  %v9  = insertelement <16 x i16>  %v8,  i16 %x9,  i32 9
302  %v10 = insertelement <16 x i16>  %v9,  i16 %x10, i32 10
303  %v11 = insertelement <16 x i16>  %v10, i16 %x11, i32 11
304  %v12 = insertelement <16 x i16>  %v11, i16 %x12, i32 12
305  %v13 = insertelement <16 x i16>  %v12, i16 %x13, i32 13
306  %v14 = insertelement <16 x i16>  %v13, i16 %x14, i32 14
307  %v15 = insertelement <16 x i16>  %v14, i16 %x15, i32 15
308  ret <16 x i16> %v15
309}
310
311;
312; vXi16
313;
314
; Loads two consecutive i16 values, zero-extends each to i64, and builds a
; <2 x i64> via an insertelement chain. All three configurations are expected
; to vectorize this into a <2 x i16> load + vector zext. Note the deliberate
; `align 1` on the scalar loads (underaligned relative to i16).
315define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
316; SSE2-LABEL: @loadext_2i16_to_2i64(
317; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <2 x i16>*
318; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
319; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
320; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
321;
322; SLM-LABEL: @loadext_2i16_to_2i64(
323; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <2 x i16>*
324; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
325; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
326; SLM-NEXT:    ret <2 x i64> [[TMP3]]
327;
328; AVX-LABEL: @loadext_2i16_to_2i64(
329; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <2 x i16>*
330; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
331; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
332; AVX-NEXT:    ret <2 x i64> [[TMP3]]
333;
334  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
335  %i0 = load i16, i16* %p0, align 1
336  %i1 = load i16, i16* %p1, align 1
337  %x0 = zext i16 %i0 to i64
338  %x1 = zext i16 %i1 to i64
339  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
340  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
341  ret <2 x i64> %v1
342}
343
; Loads four consecutive i16 values, zero-extends each to i32, and builds a
; <4 x i32> via an insertelement chain. All three configurations are expected
; to vectorize this into a <4 x i16> load + vector zext.
344define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
345; SSE2-LABEL: @loadext_4i16_to_4i32(
346; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <4 x i16>*
347; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
348; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
349; SSE2-NEXT:    ret <4 x i32> [[TMP3]]
350;
351; SLM-LABEL: @loadext_4i16_to_4i32(
352; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <4 x i16>*
353; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
354; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
355; SLM-NEXT:    ret <4 x i32> [[TMP3]]
356;
357; AVX-LABEL: @loadext_4i16_to_4i32(
358; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <4 x i16>*
359; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
360; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
361; AVX-NEXT:    ret <4 x i32> [[TMP3]]
362;
363  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
364  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
365  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
366  %i0 = load i16, i16* %p0, align 1
367  %i1 = load i16, i16* %p1, align 1
368  %i2 = load i16, i16* %p2, align 1
369  %i3 = load i16, i16* %p3, align 1
370  %x0 = zext i16 %i0 to i32
371  %x1 = zext i16 %i1 to i32
372  %x2 = zext i16 %i2 to i32
373  %x3 = zext i16 %i3 to i32
374  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
375  %v1 = insertelement <4 x i32>   %v0, i32 %x1, i32 1
376  %v2 = insertelement <4 x i32>   %v1, i32 %x2, i32 2
377  %v3 = insertelement <4 x i32>   %v2, i32 %x3, i32 3
378  ret <4 x i32> %v3
379}
380
; Loads four consecutive i16 values, zero-extends each to i64, and builds a
; <4 x i64> via an insertelement chain. All three configurations are expected
; to vectorize this into a <4 x i16> load + vector zext to <4 x i64>.
381define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
382; SSE2-LABEL: @loadext_4i16_to_4i64(
383; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <4 x i16>*
384; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
385; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
386; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
387;
388; SLM-LABEL: @loadext_4i16_to_4i64(
389; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <4 x i16>*
390; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
391; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
392; SLM-NEXT:    ret <4 x i64> [[TMP3]]
393;
394; AVX-LABEL: @loadext_4i16_to_4i64(
395; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <4 x i16>*
396; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
397; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
398; AVX-NEXT:    ret <4 x i64> [[TMP3]]
399;
400  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
401  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
402  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
403  %i0 = load i16, i16* %p0, align 1
404  %i1 = load i16, i16* %p1, align 1
405  %i2 = load i16, i16* %p2, align 1
406  %i3 = load i16, i16* %p3, align 1
407  %x0 = zext i16 %i0 to i64
408  %x1 = zext i16 %i1 to i64
409  %x2 = zext i16 %i2 to i64
410  %x3 = zext i16 %i3 to i64
411  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
412  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
413  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
414  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
415  ret <4 x i64> %v3
416}
417
; Loads eight consecutive i16 values, zero-extends each to i32, and builds an
; <8 x i32> via an insertelement chain. All three configurations are expected
; to vectorize this into an <8 x i16> load + vector zext.
418define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
419; SSE2-LABEL: @loadext_8i16_to_8i32(
420; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <8 x i16>*
421; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
422; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
423; SSE2-NEXT:    ret <8 x i32> [[TMP3]]
424;
425; SLM-LABEL: @loadext_8i16_to_8i32(
426; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <8 x i16>*
427; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
428; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
429; SLM-NEXT:    ret <8 x i32> [[TMP3]]
430;
431; AVX-LABEL: @loadext_8i16_to_8i32(
432; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0:%.*]] to <8 x i16>*
433; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
434; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
435; AVX-NEXT:    ret <8 x i32> [[TMP3]]
436;
437  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
438  %p2 = getelementptr inbounds i16, i16* %p0, i64 2
439  %p3 = getelementptr inbounds i16, i16* %p0, i64 3
440  %p4 = getelementptr inbounds i16, i16* %p0, i64 4
441  %p5 = getelementptr inbounds i16, i16* %p0, i64 5
442  %p6 = getelementptr inbounds i16, i16* %p0, i64 6
443  %p7 = getelementptr inbounds i16, i16* %p0, i64 7
444  %i0 = load i16, i16* %p0, align 1
445  %i1 = load i16, i16* %p1, align 1
446  %i2 = load i16, i16* %p2, align 1
447  %i3 = load i16, i16* %p3, align 1
448  %i4 = load i16, i16* %p4, align 1
449  %i5 = load i16, i16* %p5, align 1
450  %i6 = load i16, i16* %p6, align 1
451  %i7 = load i16, i16* %p7, align 1
452  %x0 = zext i16 %i0 to i32
453  %x1 = zext i16 %i1 to i32
454  %x2 = zext i16 %i2 to i32
455  %x3 = zext i16 %i3 to i32
456  %x4 = zext i16 %i4 to i32
457  %x5 = zext i16 %i5 to i32
458  %x6 = zext i16 %i6 to i32
459  %x7 = zext i16 %i7 to i32
460  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
461  %v1 = insertelement <8 x i32>   %v0, i32 %x1, i32 1
462  %v2 = insertelement <8 x i32>   %v1, i32 %x2, i32 2
463  %v3 = insertelement <8 x i32>   %v2, i32 %x3, i32 3
464  %v4 = insertelement <8 x i32>   %v3, i32 %x4, i32 4
465  %v5 = insertelement <8 x i32>   %v4, i32 %x5, i32 5
466  %v6 = insertelement <8 x i32>   %v5, i32 %x6, i32 6
467  %v7 = insertelement <8 x i32>   %v6, i32 %x7, i32 7
468  ret <8 x i32> %v7
469}
470
471;
472; vXi32
473;
474
; Loads two consecutive i32 values, zero-extends each to i64, and builds a
; <2 x i64> via an insertelement chain. All three configurations are expected
; to vectorize this into a <2 x i32> load + vector zext.
475define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
476; SSE2-LABEL: @loadext_2i32_to_2i64(
477; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0:%.*]] to <2 x i32>*
478; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
479; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
480; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
481;
482; SLM-LABEL: @loadext_2i32_to_2i64(
483; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0:%.*]] to <2 x i32>*
484; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
485; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
486; SLM-NEXT:    ret <2 x i64> [[TMP3]]
487;
488; AVX-LABEL: @loadext_2i32_to_2i64(
489; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0:%.*]] to <2 x i32>*
490; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
491; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
492; AVX-NEXT:    ret <2 x i64> [[TMP3]]
493;
494  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
495  %i0 = load i32, i32* %p0, align 1
496  %i1 = load i32, i32* %p1, align 1
497  %x0 = zext i32 %i0 to i64
498  %x1 = zext i32 %i1 to i64
499  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
500  %v1 = insertelement <2 x i64>   %v0, i64 %x1, i32 1
501  ret <2 x i64> %v1
502}
503
; Loads four consecutive i32 values, zero-extends each to i64, and builds a
; <4 x i64> via an insertelement chain. All three configurations are expected
; to vectorize this into a <4 x i32> load + vector zext to <4 x i64>.
504define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
505; SSE2-LABEL: @loadext_4i32_to_4i64(
506; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0:%.*]] to <4 x i32>*
507; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
508; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
509; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
510;
511; SLM-LABEL: @loadext_4i32_to_4i64(
512; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0:%.*]] to <4 x i32>*
513; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
514; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
515; SLM-NEXT:    ret <4 x i64> [[TMP3]]
516;
517; AVX-LABEL: @loadext_4i32_to_4i64(
518; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0:%.*]] to <4 x i32>*
519; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
520; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
521; AVX-NEXT:    ret <4 x i64> [[TMP3]]
522;
523  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
524  %p2 = getelementptr inbounds i32, i32* %p0, i64 2
525  %p3 = getelementptr inbounds i32, i32* %p0, i64 3
526  %i0 = load i32, i32* %p0, align 1
527  %i1 = load i32, i32* %p1, align 1
528  %i2 = load i32, i32* %p2, align 1
529  %i3 = load i32, i32* %p3, align 1
530  %x0 = zext i32 %i0 to i64
531  %x1 = zext i32 %i1 to i64
532  %x2 = zext i32 %i2 to i64
533  %x3 = zext i32 %i3 to i64
534  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
535  %v1 = insertelement <4 x i64>   %v0, i64 %x1, i32 1
536  %v2 = insertelement <4 x i64>   %v1, i64 %x2, i32 2
537  %v3 = insertelement <4 x i64>   %v2, i64 %x3, i32 3
538  ret <4 x i64> %v3
539}
540