; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP

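; Insert a <2 x i16> load, bitcast to <4 x i8>, into bytes 2-5 of a <7 x i8>
; vector in memory; bytes 0, 1 and 6 of the original vector are preserved.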
define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind {
; SSE-LABEL: insert_v7i8_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rsi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    movq (%rdi), %rcx
; SSE-NEXT:    movq %rcx, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    shrq $48, %rcx
; SSE-NEXT:    movb %cl, 6(%rdi)
; SSE-NEXT:    shrl $16, %eax
; SSE-NEXT:    movw %ax, 4(%rdi)
; SSE-NEXT:    movd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_v7i8_v2i16_2:
; AVX:       # %bb.0:
; AVX-NEXT:    movl (%rsi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    movq (%rdi), %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    shrq $48, %rcx
; AVX-NEXT:    movb %cl, 6(%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movw %ax, 4(%rdi)
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl (%rsi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    movq (%rdi), %rcx
; AVX512-NEXT:    vmovq %rcx, %xmm1
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movb %cl, 6(%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movw %ax, 4(%rdi)
; AVX512-NEXT:    vmovd %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = load <2 x i16>, <2 x i16> *%a1
  %2 = bitcast <2 x i16> %1 to <4 x i8>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
  %4 = load <7 x i8>, <7 x i8> *%a0
  %5 = shufflevector <7 x i8> %4, <7 x i8> %3, <7 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9, i32 10, i32 6>
  store <7 x i8> %5, <7 x i8>* %a0
  ret void
}

%struct.Mat4 = type { %struct.storage }
%struct.storage = type { [16 x float] }

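; PR40815 - copy a 4x4 float matrix to a second matrix with its four
; 16-byte rows stored in reverse order.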
define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %struct.Mat4* nocapture dereferenceable(64)) {
; SSE-LABEL: PR40815:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps %xmm2, 16(%rsi)
; SSE-NEXT:    movaps %xmm1, 32(%rsi)
; SSE-NEXT:    movaps %xmm0, 48(%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40815:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vmovaps 48(%rdi), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rsi)
; AVX-NEXT:    vmovaps %xmm3, (%rsi)
; AVX-NEXT:    vmovaps %xmm0, 48(%rsi)
; AVX-NEXT:    vmovaps %xmm1, 32(%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR40815:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX512-NEXT:    vmovaps 48(%rdi), %xmm1
; AVX512-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; AVX512-NEXT:    vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT:    vmovups %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %3 = bitcast %struct.Mat4* %0 to <16 x float>*
  %4 = load <16 x float>, <16 x float>* %3, align 64
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 4
  %7 = bitcast <16 x float> %4 to <4 x i128>
  %8 = extractelement <4 x i128> %7, i32 1
  %9 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 8
  %10 = bitcast <16 x float> %4 to <4 x i128>
  %11 = extractelement <4 x i128> %10, i32 2
  %12 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 12
  %13 = bitcast float* %12 to <4 x float>*
  %14 = bitcast <16 x float> %4 to <4 x i128>
  %15 = extractelement <4 x i128> %14, i32 3
  %16 = bitcast %struct.Mat4* %1 to i128*
  store i128 %15, i128* %16, align 16
  %17 = bitcast float* %6 to i128*
  store i128 %11, i128* %17, align 16
  %18 = bitcast float* %9 to i128*
  store i128 %8, i128* %18, align 16
  store <4 x float> %5, <4 x float>* %13, align 16
  ret void
}

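; PR42819 - a zero vector shuffled with a widened <8 x i32> load: only
; elements 0-2 of the load survive, in the top 3 lanes of the <16 x i32>
; result (hence the 0xE000 expand mask on AVX512).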
define <16 x i32> @PR42819(<8 x i32>* %a0) {
; SSE-LABEL: PR42819:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm3
; SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    retq
;
; AVX-LABEL: PR42819:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,1,2]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR42819:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512-NEXT:    movw $-8192, %ax # imm = 0xE000
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %a0, align 4
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
  ret <16 x i32> %3
}

@b = dso_local local_unnamed_addr global i32 0, align 4
@c = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
@d = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16

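; PR42833 - lane 0 mixes a scalar add of @b with element 0 of @c; the
; remaining lanes are vector add/shl of @c, followed by <16 x i32>
; subtractions stored back to @d.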
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    addl {{.*}}(%rip), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm4
; SSE2-NEXT:    psubd %xmm1, %xmm4
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    paddd %xmm0, %xmm5
; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT:    movdqa %xmm1, c+{{.*}}(%rip)
; SSE2-NEXT:    movaps %xmm5, c+{{.*}}(%rip)
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm3
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm5
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm6
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm7
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    psubd %xmm0, %xmm7
; SSE2-NEXT:    psubd %xmm3, %xmm6
; SSE2-NEXT:    psubd %xmm1, %xmm5
; SSE2-NEXT:    movdqa %xmm5, d+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm6, d+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm4, d+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm7, d+{{.*}}(%rip)
; SSE2-NEXT:    paddd %xmm3, %xmm3
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, c+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm3, c+{{.*}}(%rip)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: PR42833:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
; SSE42-NEXT:    movd %xmm1, %eax
; SSE42-NEXT:    addl {{.*}}(%rip), %eax
; SSE42-NEXT:    movd %eax, %xmm2
; SSE42-NEXT:    paddd %xmm1, %xmm2
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm3
; SSE42-NEXT:    psubd %xmm0, %xmm3
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm1, %xmm4
; SSE42-NEXT:    paddd %xmm1, %xmm4
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT:    movdqa %xmm0, c+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm4, c+{{.*}}(%rip)
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm2
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm4
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm5
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm6
; SSE42-NEXT:    pinsrd $0, %eax, %xmm1
; SSE42-NEXT:    psubd %xmm1, %xmm6
; SSE42-NEXT:    psubd %xmm2, %xmm5
; SSE42-NEXT:    psubd %xmm0, %xmm4
; SSE42-NEXT:    movdqa %xmm4, d+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm5, d+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm3, d+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm6, d+{{.*}}(%rip)
; SSE42-NEXT:    paddd %xmm2, %xmm2
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm0, c+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm2, c+{{.*}}(%rip)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: PR42833:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    addl {{.*}}(%rip), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm2
; AVX1-NEXT:    vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovups %ymm1, c+{{.*}}(%rip)
; AVX1-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm4
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm5
; AVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa %xmm2, d+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm4, d+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm1, d+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, d+{{.*}}(%rip)
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; AVX1-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, c+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, c+{{.*}}(%rip)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR42833:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl {{.*}}(%rip), %eax
; AVX2-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
; AVX2-NEXT:    addl c+{{.*}}(%rip), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm2, c+{{.*}}(%rip)
; AVX2-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmovdqu d+{{.*}}(%rip), %ymm3
; AVX2-NEXT:    vmovdqu d+{{.*}}(%rip), %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsubd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, d+{{.*}}(%rip)
; AVX2-NEXT:    vmovdqu %ymm0, d+{{.*}}(%rip)
; AVX2-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR42833:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl {{.*}}(%rip), %eax
; AVX512-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 c+{{.*}}(%rip), %zmm1
; AVX512-NEXT:    addl c+{{.*}}(%rip), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
; AVX512-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm2
; AVX512-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
; AVX512-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 d+{{.*}}(%rip), %zmm3
; AVX512-NEXT:    vpinsrd $0, %eax, %xmm2, %xmm2
; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm1
; AVX512-NEXT:    vpsubd %zmm1, %zmm3, %zmm1
; AVX512-NEXT:    vmovdqu64 %zmm1, d+{{.*}}(%rip)
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; XOP-LABEL: PR42833:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    addl {{.*}}(%rip), %eax
; XOP-NEXT:    vmovd %eax, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm2
; XOP-NEXT:    vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vmovups %ymm1, c+{{.*}}(%rip)
; XOP-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; XOP-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm4
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm5
; XOP-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; XOP-NEXT:    vmovdqa %xmm2, d+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm4, d+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm1, d+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm0, d+{{.*}}(%rip)
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; XOP-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; XOP-NEXT:    vmovdqa %xmm1, c+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm0, c+{{.*}}(%rip)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %1 = load i32, i32* @b, align 4
  %2 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = extractelement <8 x i32> %2, i32 0
  %5 = add i32 %1, %4
  %6 = insertelement <8 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %5, i32 0
  %7 = add <8 x i32> %2, %6
  %8 = shl <8 x i32> %2, %6
  %9 = shufflevector <8 x i32> %7, <8 x i32> %8, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <8 x i32> %9, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %10 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  %11 = shufflevector <8 x i32> %10, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %12 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %13 = insertelement <16 x i32> %3, i32 %5, i32 0
  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %15 = sub <16 x i32> %12, %14
  store <16 x i32> %15, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %16 = shl <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %16, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  ret void
}