; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

;
; Partial Vector Loads - PR16739
;

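; Three contiguous scalar loads from a dereferenceable(16) pointer should merge
; into a single unaligned 16-byte load; only lanes 0-2 of the result are defined.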
define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <4 x float> undef, float %ld0, i32 0
  %r1 = insertelement <4 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <4 x float> %r1,   float %ld2, i32 2
  ret <4 x float> %r2
}

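; As above, but element 2 is reused for lane 3, so the merged load is followed
; by a shuffle that splats element 2 into the upper lanes.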
define <4 x float> @load_float4_float3_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovups (%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <4 x float> undef, float %ld0, i32 0
  %r1 = insertelement <4 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <4 x float> %r1,   float %ld2, i32 2
  %r3 = insertelement <4 x float> %r2,   float %ld2, i32 3
  ret <4 x float> %r3
}

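; Same three scalar loads inserted into a <8 x float>; lanes 3-7 stay undef,
; so a single 16-byte xmm load still suffices.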
define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float8_float3:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float8_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <8 x float> undef, float %ld0, i32 0
  %r1 = insertelement <8 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <8 x float> %r1,   float %ld2, i32 2
  ret <8 x float> %r2
}

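; <8 x float> version of the 0122 pattern; only the low xmm half is defined,
; so codegen matches the 128-bit case.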
define <8 x float> @load_float8_float3_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float8_float3_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float8_float3_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovups (%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <8 x float> undef, float %ld0, i32 0
  %r1 = insertelement <8 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <8 x float> %r1,   float %ld2, i32 2
  %r3 = insertelement <8 x float> %r2,   float %ld2, i32 3
  ret <8 x float> %r3
}

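; The same three elements expressed as a <2 x float> load plus a scalar load;
; this should still merge into one 16-byte load.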
define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_as_float2_float:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_as_float2_float:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to <2 x float>*
  %3 = load <2 x float>, <2 x float>* %2, align 4
  %4 = extractelement <2 x float> %3, i32 0
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = extractelement <2 x float> %3, i32 1
  %7 = insertelement <4 x float> %5, float %6, i32 1
  %8 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %9 = load float, float* %8, align 4
  %10 = insertelement <4 x float> %7, float %9, i32 2
  ret <4 x float> %10
}

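; Reusing the scalar element for lane 3 blocks the full merge here: the
; <2 x float> and scalar loads stay separate and are recombined with a shuffle.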
define <4 x float> @load_float4_float3_as_float2_float_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_as_float2_float_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_as_float2_float_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to <2 x float>*
  %3 = load <2 x float>, <2 x float>* %2, align 4
  %4 = extractelement <2 x float> %3, i32 0
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = extractelement <2 x float> %3, i32 1
  %7 = insertelement <4 x float> %5, float %6, i32 1
  %8 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %9 = load float, float* %8, align 4
  %10 = insertelement <4 x float> %7, float %9, i32 2
  %11 = insertelement <4 x float> %10, float %9, i32 3
  ret <4 x float> %11
}

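; Lanes 0-2 are reassembled from two i64 loads via trunc/lshr/bitcast; with
; the first load aligned to 16 bytes this still folds to a single movaps.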
define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) {
; SSE-LABEL: load_float4_float3_trunc:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_trunc:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = bitcast float* %4 to i64*
  %6 = load i64, i64* %5, align 8
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %9 = insertelement <4 x float> undef, float %8, i32 0
  %10 = lshr i64 %3, 32
  %11 = trunc i64 %10 to i32
  %12 = bitcast i32 %11 to float
  %13 = insertelement <4 x float> %9, float %12, i32 1
  %14 = trunc i64 %6 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %13, float %15, i32 2
  ret <4 x float> %16
}

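; trunc version of the 0122 pattern; splatting element 2 into lane 3 again
; leaves a scalar load plus shuffle after the merged vector load.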
define <4 x float> @load_float4_float3_trunc_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_trunc_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_trunc_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = bitcast float* %4 to i64*
  %6 = load i64, i64* %5, align 8
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %9 = insertelement <4 x float> undef, float %8, i32 0
  %10 = lshr i64 %3, 32
  %11 = trunc i64 %10 to i32
  %12 = bitcast i32 %11 to float
  %13 = insertelement <4 x float> %9, float %12, i32 1
  %14 = trunc i64 %6 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %13, float %15, i32 2
  %17 = insertelement <4 x float> %16, float %15, i32 3
  ret <4 x float> %17
}

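; All four lanes are rebuilt from the two i64 loads; the high half is
; currently reinserted piecewise (movhps/insertps) rather than being merged
; into a single 16-byte load.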
define <4 x float> @load_float4_float3_trunc_0123(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE2-LABEL: load_float4_float3_trunc_0123:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_float4_float3_trunc_0123:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps (%rdi), %xmm0
; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_float4_float3_trunc_0123:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps (%rdi), %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_trunc_0123:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = bitcast float* %4 to i64*
  %6 = load i64, i64* %5, align 8
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %9 = insertelement <4 x float> undef, float %8, i32 0
  %10 = lshr i64 %3, 32
  %11 = trunc i64 %10 to i32
  %12 = bitcast i32 %11 to float
  %13 = insertelement <4 x float> %9, float %12, i32 1
  %14 = trunc i64 %6 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %13, float %15, i32 2
  %17 = lshr i64 %6, 32
  %18 = trunc i64 %17 to i32
  %19 = bitcast i32 %18 to float
  %20 = insertelement <4 x float> %16, float %19, i32 3
  ret <4 x float> %20
}

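; Same as above but with align 1 loads, so the unaligned movups form is used.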
define <4 x float> @load_float4_float3_trunc_0123_unaligned(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE2-LABEL: load_float4_float3_trunc_0123_unaligned:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_float4_float3_trunc_0123_unaligned:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movups (%rdi), %xmm0
; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_float4_float3_trunc_0123_unaligned:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_trunc_0123_unaligned:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 1
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = bitcast float* %4 to i64*
  %6 = load i64, i64* %5, align 1
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %9 = insertelement <4 x float> undef, float %8, i32 0
  %10 = lshr i64 %3, 32
  %11 = trunc i64 %10 to i32
  %12 = bitcast i32 %11 to float
  %13 = insertelement <4 x float> %9, float %12, i32 1
  %14 = trunc i64 %6 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %13, float %15, i32 2
  %17 = lshr i64 %6, 32
  %18 = trunc i64 %17 to i32
  %19 = bitcast i32 %18 to float
  %20 = insertelement <4 x float> %16, float %19, i32 3
  ret <4 x float> %20
}

; PR21780
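; Two scalar double loads splatted into lanes {0,0,2,2}: AVX folds this into
; a single ymm vmovddup from memory, SSSE3/SSE41 use two movddup loads, and
; SSE2 falls back to movsd+movlhps.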
define <4 x double> @load_double4_0u2u(double* nocapture readonly dereferenceable(32)) nofree nosync {
; SSE2-LABEL: load_double4_0u2u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_double4_0u2u:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSSE3-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_double4_0u2u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_double4_0u2u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; AVX-NEXT:    retq
  %2 = load double, double* %0, align 8
  %3 = insertelement <4 x double> undef, double %2, i32 0
  %4 = getelementptr inbounds double, double* %0, i64 2
  %5 = load double, double* %4, align 8
  %6 = insertelement <4 x double> %3, double %5, i32 2
  %7 = shufflevector <4 x double> %6, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %7
}

; Test case identified in rL366501
@h = dso_local local_unnamed_addr global i8 0, align 1
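; <2 x i8> is an illegal type: the load is widened to a zero-extended word
; load, the byte at index 2 is set to the constant 2, and the low 32 bits are
; returned as an i32.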
define dso_local i32 @load_partial_illegal_type() {
; SSE2-LABEL: load_partial_illegal_type:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movzwl h(%rip), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_partial_illegal_type:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movzwl h(%rip), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_partial_illegal_type:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movzwl h(%rip), %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    movl $2, %eax
; SSE41-NEXT:    pinsrb $2, %eax, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_partial_illegal_type:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl h(%rip), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    movl $2, %eax
; AVX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = load <2 x i8>, <2 x i8>* bitcast (i8* @h to <2 x i8>*), align 1
  %2 = shufflevector <2 x i8> %1, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = bitcast <4 x i8> %3 to i32
  ret i32 %4
}

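; Elements 1-2 of an unaligned <3 x i32> load are blended into a mostly-zero
; <8 x i32> and stored at byte offset 672 of %explicit_1.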
define dso_local void @PR43227(i32* %explicit_0, <8 x i32>* %explicit_1) {
; SSE-LABEL: PR43227:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    movdqa %xmm1, 672(%rsi)
; SSE-NEXT:    movdqa %xmm0, 688(%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR43227:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, 672(%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR43227:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 672(%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %1 = getelementptr i32, i32* %explicit_0, i64 63
  %2 = bitcast i32* %1 to <3 x i32>*
  %3 = load <3 x i32>, <3 x i32>* %2, align 1
  %4 = shufflevector <3 x i32> %3, <3 x i32> undef, <2 x i32> <i32 1, i32 2>
  %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 undef, i32 0>, <8 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 9, i32 7>
  %7 = getelementptr inbounds <8 x i32>, <8 x i32>* %explicit_1, i64 21
  store <8 x i32> %6, <8 x i32>* %7, align 32
  ret void
}