; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP

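; Insert a <2 x i16> load, bitcast to <4 x i8>, into elements 2-5 of a <7 x i8>
; vector loaded from %a0, and store the result back through %a0.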
define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind {
; SSE-LABEL: insert_v7i8_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rsi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    movq (%rdi), %rcx
; SSE-NEXT:    movq %rcx, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    shrq $48, %rcx
; SSE-NEXT:    movb %cl, 6(%rdi)
; SSE-NEXT:    shrl $16, %eax
; SSE-NEXT:    movw %ax, 4(%rdi)
; SSE-NEXT:    movd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_v7i8_v2i16_2:
; AVX:       # %bb.0:
; AVX-NEXT:    movl (%rsi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    movq (%rdi), %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    shrq $48, %rcx
; AVX-NEXT:    movb %cl, 6(%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movw %ax, 4(%rdi)
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl (%rsi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    movq (%rdi), %rcx
; AVX512-NEXT:    vmovq %rcx, %xmm1
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movb %cl, 6(%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movw %ax, 4(%rdi)
; AVX512-NEXT:    vmovd %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = load <2 x i16>, <2 x i16> *%a1
  %2 = bitcast <2 x i16> %1 to <4 x i8>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
  %4 = load <7 x i8>, <7 x i8> *%a0
  %5 = shufflevector <7 x i8> %4, <7 x i8> %3, <7 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9, i32 10, i32 6>
  store <7 x i8> %5, <7 x i8>* %a0
  ret void
}

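; PR40815: copy a 16 x float matrix, storing its four 128-bit rows to the
; destination in reverse order.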
%struct.Mat4 = type { %struct.storage }
%struct.storage = type { [16 x float] }

define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %struct.Mat4* nocapture dereferenceable(64)) {
; SSE-LABEL: PR40815:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps %xmm2, 16(%rsi)
; SSE-NEXT:    movaps %xmm1, 32(%rsi)
; SSE-NEXT:    movaps %xmm0, 48(%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40815:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vmovaps 48(%rdi), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rsi)
; AVX-NEXT:    vmovaps %xmm3, (%rsi)
; AVX-NEXT:    vmovaps %xmm0, 48(%rsi)
; AVX-NEXT:    vmovaps %xmm1, 32(%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR40815:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX512-NEXT:    vmovaps 48(%rdi), %xmm1
; AVX512-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; AVX512-NEXT:    vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT:    vmovups %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %3 = bitcast %struct.Mat4* %0 to <16 x float>*
  %4 = load <16 x float>, <16 x float>* %3, align 64
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 4
  %7 = bitcast <16 x float> %4 to <4 x i128>
  %8 = extractelement <4 x i128> %7, i32 1
  %9 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 8
  %10 = bitcast <16 x float> %4 to <4 x i128>
  %11 = extractelement <4 x i128> %10, i32 2
  %12 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 12
  %13 = bitcast float* %12 to <4 x float>*
  %14 = bitcast <16 x float> %4 to <4 x i128>
  %15 = extractelement <4 x i128> %14, i32 3
  %16 = bitcast %struct.Mat4* %1 to i128*
  store i128 %15, i128* %16, align 16
  %17 = bitcast float* %6 to i128*
  store i128 %11, i128* %17, align 16
  %18 = bitcast float* %9 to i128*
  store i128 %8, i128* %18, align 16
  store <4 x float> %5, <4 x float>* %13, align 16
  ret void
}

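; PR42819: the first three elements of an <8 x i32> load end up in the top
; lanes of an otherwise zero <16 x i32>.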
define <16 x i32> @PR42819(<8 x i32>* %a0) {
; SSE-LABEL: PR42819:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm3
; SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    retq
;
; AVX-LABEL: PR42819:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,1,2]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR42819:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512-NEXT:    movw $-8192, %ax # imm = 0xE000
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %a0, align 4
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
  ret <16 x i32> %3
}

@b = dso_local local_unnamed_addr global i32 0, align 4
@c = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
@d = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16

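; PR42833: the scalar sum b + c[32] is added into lane 0 of c[32..39] while the
; remaining lanes are shifted left by one; a copy of c[32..47] with lane 0
; replaced by that sum is subtracted from d[32..47], and c[40..47] is shifted
; left by one as well.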
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa c+144(%rip), %xmm1
; SSE2-NEXT:    movdqa c+128(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    addl b(%rip), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    movdqa d+144(%rip), %xmm4
; SSE2-NEXT:    psubd %xmm1, %xmm4
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    paddd %xmm0, %xmm5
; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT:    movdqa %xmm1, c+144(%rip)
; SSE2-NEXT:    movaps %xmm5, c+128(%rip)
; SSE2-NEXT:    movdqa c+160(%rip), %xmm1
; SSE2-NEXT:    movdqa c+176(%rip), %xmm3
; SSE2-NEXT:    movdqa d+160(%rip), %xmm5
; SSE2-NEXT:    movdqa d+176(%rip), %xmm6
; SSE2-NEXT:    movdqa d+128(%rip), %xmm7
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    psubd %xmm0, %xmm7
; SSE2-NEXT:    psubd %xmm3, %xmm6
; SSE2-NEXT:    psubd %xmm1, %xmm5
; SSE2-NEXT:    movdqa %xmm5, d+160(%rip)
; SSE2-NEXT:    movdqa %xmm6, d+176(%rip)
; SSE2-NEXT:    movdqa %xmm4, d+144(%rip)
; SSE2-NEXT:    movdqa %xmm7, d+128(%rip)
; SSE2-NEXT:    paddd %xmm3, %xmm3
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, c+160(%rip)
; SSE2-NEXT:    movdqa %xmm3, c+176(%rip)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: PR42833:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa c+144(%rip), %xmm0
; SSE42-NEXT:    movdqa c+128(%rip), %xmm1
; SSE42-NEXT:    movd %xmm1, %eax
; SSE42-NEXT:    addl b(%rip), %eax
; SSE42-NEXT:    movd %eax, %xmm2
; SSE42-NEXT:    paddd %xmm1, %xmm2
; SSE42-NEXT:    movdqa d+144(%rip), %xmm3
; SSE42-NEXT:    psubd %xmm0, %xmm3
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm1, %xmm4
; SSE42-NEXT:    paddd %xmm1, %xmm4
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT:    movdqa %xmm0, c+144(%rip)
; SSE42-NEXT:    movdqa %xmm4, c+128(%rip)
; SSE42-NEXT:    movdqa c+160(%rip), %xmm0
; SSE42-NEXT:    movdqa c+176(%rip), %xmm2
; SSE42-NEXT:    movdqa d+160(%rip), %xmm4
; SSE42-NEXT:    movdqa d+176(%rip), %xmm5
; SSE42-NEXT:    movdqa d+128(%rip), %xmm6
; SSE42-NEXT:    pinsrd $0, %eax, %xmm1
; SSE42-NEXT:    psubd %xmm1, %xmm6
; SSE42-NEXT:    psubd %xmm2, %xmm5
; SSE42-NEXT:    psubd %xmm0, %xmm4
; SSE42-NEXT:    movdqa %xmm4, d+160(%rip)
; SSE42-NEXT:    movdqa %xmm5, d+176(%rip)
; SSE42-NEXT:    movdqa %xmm3, d+144(%rip)
; SSE42-NEXT:    movdqa %xmm6, d+128(%rip)
; SSE42-NEXT:    paddd %xmm2, %xmm2
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm0, c+160(%rip)
; SSE42-NEXT:    movdqa %xmm2, c+176(%rip)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: PR42833:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa c+128(%rip), %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    addl b(%rip), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa c+144(%rip), %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovdqa d+144(%rip), %xmm2
; AVX1-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovups %ymm1, c+128(%rip)
; AVX1-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa d+128(%rip), %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa d+176(%rip), %xmm1
; AVX1-NEXT:    vmovdqa c+176(%rip), %xmm3
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa d+160(%rip), %xmm4
; AVX1-NEXT:    vmovdqa c+160(%rip), %xmm5
; AVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa %xmm2, d+144(%rip)
; AVX1-NEXT:    vmovdqa %xmm4, d+160(%rip)
; AVX1-NEXT:    vmovdqa %xmm1, d+176(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, d+128(%rip)
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; AVX1-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, c+160(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, c+176(%rip)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR42833:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl b(%rip), %eax
; AVX2-NEXT:    vmovdqu c+128(%rip), %ymm0
; AVX2-NEXT:    addl c+128(%rip), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm2, c+128(%rip)
; AVX2-NEXT:    vmovdqu c+160(%rip), %ymm2
; AVX2-NEXT:    vmovdqu d+160(%rip), %ymm3
; AVX2-NEXT:    vmovdqu d+128(%rip), %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsubd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, d+160(%rip)
; AVX2-NEXT:    vmovdqu %ymm0, d+128(%rip)
; AVX2-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, c+160(%rip)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR42833:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl b(%rip), %eax
; AVX512-NEXT:    vmovdqu c+128(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 c+128(%rip), %zmm1
; AVX512-NEXT:    addl c+128(%rip), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
; AVX512-NEXT:    vmovdqa c+128(%rip), %xmm2
; AVX512-NEXT:    vmovdqu %ymm0, c+128(%rip)
; AVX512-NEXT:    vmovdqu c+160(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 d+128(%rip), %zmm3
; AVX512-NEXT:    vpinsrd $0, %eax, %xmm2, %xmm2
; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm1
; AVX512-NEXT:    vpsubd %zmm1, %zmm3, %zmm1
; AVX512-NEXT:    vmovdqu64 %zmm1, d+128(%rip)
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, c+160(%rip)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; XOP-LABEL: PR42833:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa c+128(%rip), %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    addl b(%rip), %eax
; XOP-NEXT:    vmovd %eax, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; XOP-NEXT:    vmovdqa c+144(%rip), %xmm3
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vmovdqa d+144(%rip), %xmm2
; XOP-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
; XOP-NEXT:    vmovups %ymm1, c+128(%rip)
; XOP-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; XOP-NEXT:    vmovdqa d+128(%rip), %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vmovdqa d+176(%rip), %xmm1
; XOP-NEXT:    vmovdqa c+176(%rip), %xmm3
; XOP-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa d+160(%rip), %xmm4
; XOP-NEXT:    vmovdqa c+160(%rip), %xmm5
; XOP-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; XOP-NEXT:    vmovdqa %xmm2, d+144(%rip)
; XOP-NEXT:    vmovdqa %xmm4, d+160(%rip)
; XOP-NEXT:    vmovdqa %xmm1, d+176(%rip)
; XOP-NEXT:    vmovdqa %xmm0, d+128(%rip)
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; XOP-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; XOP-NEXT:    vmovdqa %xmm1, c+160(%rip)
; XOP-NEXT:    vmovdqa %xmm0, c+176(%rip)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %1 = load i32, i32* @b, align 4
  %2 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = extractelement <8 x i32> %2, i32 0
  %5 = add i32 %1, %4
  %6 = insertelement <8 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %5, i32 0
  %7 = add <8 x i32> %2, %6
  %8 = shl <8 x i32> %2, %6
  %9 = shufflevector <8 x i32> %7, <8 x i32> %8, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <8 x i32> %9, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %10 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  %11 = shufflevector <8 x i32> %10, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %12 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %13 = insertelement <16 x i32> %3, i32 %5, i32 0
  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %15 = sub <16 x i32> %12, %14
  store <16 x i32> %15, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %16 = shl <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %16, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  ret void
}