; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP

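; Insert a <2 x i16> load from %a1 (bitcast to <4 x i8>) into elements 2-5 of the <7 x i8> vector loaded from and stored back to %a0.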
define void @insert_v7i8_v2i16_2(ptr %a0, ptr %a1) nounwind {
; SSE-LABEL: insert_v7i8_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rsi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    movq (%rdi), %rcx
; SSE-NEXT:    movq %rcx, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    shrq $48, %rcx
; SSE-NEXT:    movb %cl, 6(%rdi)
; SSE-NEXT:    shrl $16, %eax
; SSE-NEXT:    movw %ax, 4(%rdi)
; SSE-NEXT:    movd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_v7i8_v2i16_2:
; AVX:       # %bb.0:
; AVX-NEXT:    movl (%rsi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    movq (%rdi), %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    shrq $48, %rcx
; AVX-NEXT:    movb %cl, 6(%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movw %ax, 4(%rdi)
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl (%rsi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    movq (%rdi), %rcx
; AVX512-NEXT:    vmovq %rcx, %xmm1
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movb %cl, 6(%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movw %ax, 4(%rdi)
; AVX512-NEXT:    vmovd %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = load <2 x i16>, ptr %a1
  %2 = bitcast <2 x i16> %1 to <4 x i8>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
  %4 = load <7 x i8>, ptr %a0
  %5 = shufflevector <7 x i8> %4, <7 x i8> %3, <7 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9, i32 10, i32 6>
  store <7 x i8> %5, ptr %a0
  ret void
}

%struct.Mat4 = type { %struct.storage }
%struct.storage = type { [16 x float] }

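; PR40815: the four 128-bit lanes of the <16 x float> matrix load are stored to the destination in reverse order.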
define void @PR40815(ptr nocapture readonly dereferenceable(64), ptr nocapture dereferenceable(64)) {
; SSE-LABEL: PR40815:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps %xmm2, 16(%rsi)
; SSE-NEXT:    movaps %xmm1, 32(%rsi)
; SSE-NEXT:    movaps %xmm0, 48(%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40815:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vmovaps 48(%rdi), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rsi)
; AVX-NEXT:    vmovaps %xmm3, (%rsi)
; AVX-NEXT:    vmovaps %xmm0, 48(%rsi)
; AVX-NEXT:    vmovaps %xmm1, 32(%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR40815:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 48(%rdi), %xmm0
; AVX512-NEXT:    vmovups 16(%rdi), %ymm1
; AVX512-NEXT:    vinsertf128 $1, (%rdi), %ymm1, %ymm1
; AVX512-NEXT:    vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
; AVX512-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovups %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %3 = load <16 x float>, ptr %0, align 64
  %4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = getelementptr inbounds %struct.Mat4, ptr %1, i64 0, i32 0, i32 0, i64 4
  %6 = bitcast <16 x float> %3 to <4 x i128>
  %7 = extractelement <4 x i128> %6, i32 1
  %8 = getelementptr inbounds %struct.Mat4, ptr %1, i64 0, i32 0, i32 0, i64 8
  %9 = bitcast <16 x float> %3 to <4 x i128>
  %10 = extractelement <4 x i128> %9, i32 2
  %11 = getelementptr inbounds %struct.Mat4, ptr %1, i64 0, i32 0, i32 0, i64 12
  %12 = bitcast <16 x float> %3 to <4 x i128>
  %13 = extractelement <4 x i128> %12, i32 3
  store i128 %13, ptr %1, align 16
  store i128 %10, ptr %5, align 16
  store i128 %7, ptr %8, align 16
  store <4 x float> %4, ptr %11, align 16
  ret void
}

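; PR42819: widen an <8 x i32> load so that its first three elements land in the top lanes of an otherwise zero <16 x i32> result.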
define <16 x i32> @PR42819(ptr %a0) {
; SSE-LABEL: PR42819:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm3
; SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    retq
;
; AVX-LABEL: PR42819:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,1,2]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR42819:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512-NEXT:    movw $-8192, %ax # imm = 0xE000
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %1 = load <8 x i32>, ptr %a0, align 4
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
  ret <16 x i32> %3
}

@b = dso_local local_unnamed_addr global i32 0, align 4
@c = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
@d = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16

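; PR42833: element 0 of the first <8 x i32> of @c receives the scalar sum of @b and c[32] while the remaining elements are shifted left by one; the widened <16 x i32> is then subtracted from @d.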
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movl b(%rip), %eax
; SSE2-NEXT:    movdqa c+144(%rip), %xmm0
; SSE2-NEXT:    movdqa c+128(%rip), %xmm1
; SSE2-NEXT:    addl c+128(%rip), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    paddd %xmm1, %xmm3
; SSE2-NEXT:    movdqa d+144(%rip), %xmm4
; SSE2-NEXT:    psubd %xmm0, %xmm4
; SSE2-NEXT:    paddd %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    paddd %xmm1, %xmm5
; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT:    movdqa %xmm0, c+144(%rip)
; SSE2-NEXT:    movaps %xmm5, c+128(%rip)
; SSE2-NEXT:    movdqa c+160(%rip), %xmm0
; SSE2-NEXT:    movdqa c+176(%rip), %xmm3
; SSE2-NEXT:    movdqa d+160(%rip), %xmm5
; SSE2-NEXT:    movdqa d+176(%rip), %xmm6
; SSE2-NEXT:    movdqa d+128(%rip), %xmm7
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT:    psubd %xmm1, %xmm7
; SSE2-NEXT:    psubd %xmm3, %xmm6
; SSE2-NEXT:    psubd %xmm0, %xmm5
; SSE2-NEXT:    movdqa %xmm5, d+160(%rip)
; SSE2-NEXT:    movdqa %xmm6, d+176(%rip)
; SSE2-NEXT:    movdqa %xmm4, d+144(%rip)
; SSE2-NEXT:    movdqa %xmm7, d+128(%rip)
; SSE2-NEXT:    paddd %xmm3, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm0, c+160(%rip)
; SSE2-NEXT:    movdqa %xmm3, c+176(%rip)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: PR42833:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movl b(%rip), %eax
; SSE42-NEXT:    movdqa c+144(%rip), %xmm0
; SSE42-NEXT:    movdqa c+128(%rip), %xmm1
; SSE42-NEXT:    addl c+128(%rip), %eax
; SSE42-NEXT:    movd %eax, %xmm2
; SSE42-NEXT:    paddd %xmm1, %xmm2
; SSE42-NEXT:    movdqa d+144(%rip), %xmm3
; SSE42-NEXT:    psubd %xmm0, %xmm3
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm1, %xmm4
; SSE42-NEXT:    paddd %xmm1, %xmm4
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT:    movdqa %xmm0, c+144(%rip)
; SSE42-NEXT:    movdqa %xmm4, c+128(%rip)
; SSE42-NEXT:    movdqa c+160(%rip), %xmm0
; SSE42-NEXT:    movdqa c+176(%rip), %xmm2
; SSE42-NEXT:    movdqa d+160(%rip), %xmm4
; SSE42-NEXT:    movdqa d+176(%rip), %xmm5
; SSE42-NEXT:    movdqa d+128(%rip), %xmm6
; SSE42-NEXT:    pinsrd $0, %eax, %xmm1
; SSE42-NEXT:    psubd %xmm1, %xmm6
; SSE42-NEXT:    psubd %xmm2, %xmm5
; SSE42-NEXT:    psubd %xmm0, %xmm4
; SSE42-NEXT:    movdqa %xmm4, d+160(%rip)
; SSE42-NEXT:    movdqa %xmm5, d+176(%rip)
; SSE42-NEXT:    movdqa %xmm3, d+144(%rip)
; SSE42-NEXT:    movdqa %xmm6, d+128(%rip)
; SSE42-NEXT:    paddd %xmm2, %xmm2
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm0, c+160(%rip)
; SSE42-NEXT:    movdqa %xmm2, c+176(%rip)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: PR42833:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movl b(%rip), %eax
; AVX1-NEXT:    addl c+128(%rip), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vmovdqa c+128(%rip), %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa c+144(%rip), %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovdqa d+144(%rip), %xmm2
; AVX1-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovups %ymm0, c+128(%rip)
; AVX1-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa d+128(%rip), %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa d+176(%rip), %xmm1
; AVX1-NEXT:    vmovdqa c+176(%rip), %xmm3
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa d+160(%rip), %xmm4
; AVX1-NEXT:    vmovdqa c+160(%rip), %xmm5
; AVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa %xmm2, d+144(%rip)
; AVX1-NEXT:    vmovdqa %xmm4, d+160(%rip)
; AVX1-NEXT:    vmovdqa %xmm1, d+176(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, d+128(%rip)
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; AVX1-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, c+160(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, c+176(%rip)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR42833:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl b(%rip), %eax
; AVX2-NEXT:    vmovdqu c+128(%rip), %ymm0
; AVX2-NEXT:    addl c+128(%rip), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm2, c+128(%rip)
; AVX2-NEXT:    vmovdqu c+160(%rip), %ymm2
; AVX2-NEXT:    vmovdqu d+160(%rip), %ymm3
; AVX2-NEXT:    vmovdqu d+128(%rip), %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsubd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, d+160(%rip)
; AVX2-NEXT:    vmovdqu %ymm0, d+128(%rip)
; AVX2-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, c+160(%rip)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR42833:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl b(%rip), %eax
; AVX512-NEXT:    vmovdqu c+128(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 c+128(%rip), %zmm1
; AVX512-NEXT:    addl c+128(%rip), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
; AVX512-NEXT:    vmovdqa c+128(%rip), %xmm2
; AVX512-NEXT:    vmovdqu %ymm0, c+128(%rip)
; AVX512-NEXT:    vmovdqu c+160(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 d+128(%rip), %zmm3
; AVX512-NEXT:    vpinsrd $0, %eax, %xmm2, %xmm2
; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm1
; AVX512-NEXT:    vpsubd %zmm1, %zmm3, %zmm1
; AVX512-NEXT:    vmovdqu64 %zmm1, d+128(%rip)
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, c+160(%rip)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; XOP-LABEL: PR42833:
; XOP:       # %bb.0:
; XOP-NEXT:    movl b(%rip), %eax
; XOP-NEXT:    addl c+128(%rip), %eax
; XOP-NEXT:    vmovd %eax, %xmm0
; XOP-NEXT:    vmovdqa c+128(%rip), %xmm1
; XOP-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm2
; XOP-NEXT:    vmovdqa c+144(%rip), %xmm3
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vmovdqa d+144(%rip), %xmm2
; XOP-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
; XOP-NEXT:    vmovups %ymm0, c+128(%rip)
; XOP-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm0
; XOP-NEXT:    vmovdqa d+128(%rip), %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vmovdqa d+176(%rip), %xmm1
; XOP-NEXT:    vmovdqa c+176(%rip), %xmm3
; XOP-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa d+160(%rip), %xmm4
; XOP-NEXT:    vmovdqa c+160(%rip), %xmm5
; XOP-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; XOP-NEXT:    vmovdqa %xmm2, d+144(%rip)
; XOP-NEXT:    vmovdqa %xmm4, d+160(%rip)
; XOP-NEXT:    vmovdqa %xmm1, d+176(%rip)
; XOP-NEXT:    vmovdqa %xmm0, d+128(%rip)
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; XOP-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; XOP-NEXT:    vmovdqa %xmm1, c+160(%rip)
; XOP-NEXT:    vmovdqa %xmm0, c+176(%rip)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %1 = load i32, ptr @b, align 4
  %2 = load <8 x i32>, ptr getelementptr inbounds ([49 x i32], ptr @c, i64 0, i64 32), align 16
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = extractelement <8 x i32> %2, i32 0
  %5 = add i32 %1, %4
  %6 = insertelement <8 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %5, i32 0
  %7 = add <8 x i32> %2, %6
  %8 = shl <8 x i32> %2, %6
  %9 = shufflevector <8 x i32> %7, <8 x i32> %8, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <8 x i32> %9, ptr getelementptr inbounds ([49 x i32], ptr @c, i64 0, i64 32), align 16
  %10 = load <8 x i32>, ptr getelementptr inbounds ([49 x i32], ptr @c, i64 0, i64 40), align 16
  %11 = shufflevector <8 x i32> %10, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %12 = load <16 x i32>, ptr getelementptr inbounds ([49 x i32], ptr @d, i64 0, i64 32), align 16
  %13 = insertelement <16 x i32> %3, i32 %5, i32 0
  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %15 = sub <16 x i32> %12, %14
  store <16 x i32> %15, ptr getelementptr inbounds ([49 x i32], ptr @d, i64 0, i64 32), align 16
  %16 = shl <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %16, ptr getelementptr inbounds ([49 x i32], ptr @c, i64 0, i64 40), align 16
  ret void
}