; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=X64,X64-SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx  | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

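; Extract element 2 of a loaded <2 x i64> bitcast to <4 x i32>; with AVX the
; extract folds into a narrow scalar load.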
define i32 @t(ptr %val) nounwind {
; X32-SSE2-LABEL: t:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-SSE2-NEXT:    movd %xmm0, %eax
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: t:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-SSSE3-NEXT:    movd %xmm0, %eax
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: t:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movl 8(%rdi), %eax
; X64-AVX-NEXT:    retq
  %tmp2 = load <2 x i64>, ptr %val, align 16		; <<2 x i64>> [#uses=1]
  %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32>		; <<4 x i32>> [#uses=1]
  %tmp4 = extractelement <4 x i32> %tmp3, i32 2		; <i32> [#uses=1]
  ret i32 %tmp4
}

; Case where extractelement of load ends up as undef.
; (Making sure this doesn't crash.)
define i32 @t2(ptr %xp) {
; X32-SSE2-LABEL: t2:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    retl
;
; X64-LABEL: t2:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %x = load <8 x i32>, ptr %xp
  %Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
  %y = extractelement <8 x i32> %Shuff68, i32 0
  ret i32 %y
}

; This case could easily end up inf-looping in the DAG combiner due to a
; low-alignment load of the vector, which prevents us from reliably forming a
; narrow load.

define void @t3(ptr %a0) {
; X32-SSE2-LABEL: t3:
; X32-SSE2:       # %bb.0: # %bb
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movups (%eax), %xmm0
; X32-SSE2-NEXT:    movhps %xmm0, (%eax)
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: t3:
; X64-SSSE3:       # %bb.0: # %bb
; X64-SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X64-SSSE3-NEXT:    movsd %xmm0, (%rax)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: t3:
; X64-AVX:       # %bb.0: # %bb
; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-AVX-NEXT:    vmovsd %xmm0, (%rax)
; X64-AVX-NEXT:    retq
bb:
  %tmp13 = load <2 x double>, ptr %a0, align 1
  %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
  store double %.sroa.3.24.vec.extract, ptr undef, align 8
  ret void
}

; Case where a load is unary shuffled, then bitcast (to a type with the same
; number of elements) before extractelement.
; This is testing for an assertion - the extraction was assuming that the undef
; second shuffle operand was a post-bitcast type instead of a pre-bitcast type.
define i64 @t4(ptr %a) {
; X32-SSE2-LABEL: t4:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movdqa (%eax), %xmm0
; X32-SSE2-NEXT:    movd %xmm0, %eax
; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-SSE2-NEXT:    movd %xmm0, %edx
; X32-SSE2-NEXT:    retl
;
; X64-LABEL: t4:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    retq
  %b = load <2 x double>, ptr %a, align 16
  %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
  %d = bitcast <2 x double> %c to <2 x i64>
  %e = extractelement <2 x i64> %d, i32 1
  ret i64 %e
}

; Don't extract from a volatile load.
define void @t5(ptr%a0, ptr%a1) {
; X32-SSE2-LABEL: t5:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT:    movaps (%ecx), %xmm0
; X32-SSE2-NEXT:    movhps %xmm0, (%eax)
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: t5:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movaps (%rdi), %xmm0
; X64-SSSE3-NEXT:    movhps %xmm0, (%rsi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: t5:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovhps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
  %vecload = load volatile <2 x double>, ptr %a0, align 16
  %vecext = extractelement <2 x double> %vecload, i32 1
  store volatile double %vecext, ptr %a1, align 8
  ret void
}

; Check for multiple uses of the extracted element.
define float @t6(ptr%a0) {
; X32-SSE2-LABEL: t6:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    pushl %eax
; X32-SSE2-NEXT:    .cfi_def_cfa_offset 8
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movaps (%eax), %xmm0
; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
; X32-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT:    andps %xmm1, %xmm2
; X32-SSE2-NEXT:    andnps %xmm0, %xmm1
; X32-SSE2-NEXT:    orps %xmm2, %xmm1
; X32-SSE2-NEXT:    movss %xmm1, (%esp)
; X32-SSE2-NEXT:    flds (%esp)
; X32-SSE2-NEXT:    popl %eax
; X32-SSE2-NEXT:    .cfi_def_cfa_offset 4
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: t6:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    andps %xmm0, %xmm2
; X64-SSSE3-NEXT:    andnps %xmm1, %xmm0
; X64-SSSE3-NEXT:    orps %xmm2, %xmm0
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: t6:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX1-NEXT:    vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: t6:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
  %vecload = load <8 x float>, ptr %a0, align 32
  %vecext = extractelement <8 x float> %vecload, i32 1
  %cmp = fcmp oeq float %vecext, 0.000000e+00
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

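; PR43971: the extracted element feeds a compare and a select whose result is
; stored back through %a1.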
define void @PR43971(ptr%a0, ptr%a1) {
; X32-SSE2-LABEL: PR43971:
; X32-SSE2:       # %bb.0: # %entry
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT:    movaps 16(%ecx), %xmm0
; X32-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
; X32-SSE2-NEXT:    cmpltss %xmm0, %xmm1
; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT:    andps %xmm1, %xmm2
; X32-SSE2-NEXT:    andnps %xmm0, %xmm1
; X32-SSE2-NEXT:    orps %xmm2, %xmm1
; X32-SSE2-NEXT:    movss %xmm1, (%eax)
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: PR43971:
; X64-SSSE3:       # %bb.0: # %entry
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    xorps %xmm1, %xmm1
; X64-SSSE3-NEXT:    cmpltss %xmm0, %xmm1
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    andps %xmm1, %xmm2
; X64-SSSE3-NEXT:    andnps %xmm0, %xmm1
; X64-SSSE3-NEXT:    orps %xmm2, %xmm1
; X64-SSSE3-NEXT:    movss %xmm1, (%rsi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: PR43971:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1
; X64-AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovss %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
entry:
  %0 = load <8 x float>, ptr %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 6
  %cmp = fcmp ogt float %vecext, 0.000000e+00
  %1 = load float, ptr %a1, align 4
  %cond = select i1 %cmp, float %1, float %vecext
  store float %cond, ptr %a1, align 4
  ret void
}

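; Another reduction from PR43971; same select-of-extracted-element pattern as
; @t6 above, returning the result directly instead of storing it.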
define float @PR43971_1(ptr%a0) nounwind {
; X32-SSE2-LABEL: PR43971_1:
; X32-SSE2:       # %bb.0: # %entry
; X32-SSE2-NEXT:    pushl %eax
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movaps (%eax), %xmm0
; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
; X32-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT:    andps %xmm1, %xmm2
; X32-SSE2-NEXT:    andnps %xmm0, %xmm1
; X32-SSE2-NEXT:    orps %xmm2, %xmm1
; X32-SSE2-NEXT:    movss %xmm1, (%esp)
; X32-SSE2-NEXT:    flds (%esp)
; X32-SSE2-NEXT:    popl %eax
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: PR43971_1:
; X64-SSSE3:       # %bb.0: # %entry
; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    andps %xmm0, %xmm2
; X64-SSSE3-NEXT:    andnps %xmm1, %xmm0
; X64-SSSE3-NEXT:    orps %xmm2, %xmm0
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: PR43971_1:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX1-NEXT:    vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: PR43971_1:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
entry:
  %0 = load <8 x float>, ptr %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 1
  %cmp = fcmp oeq float %vecext, 0.000000e+00
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

; Test for bad extractions from a VBROADCAST_LOAD of a non-uniform <2 x i16> constant bitcast as <4 x i32>.
define void @subextract_broadcast_load_constant(ptr nocapture %0, ptr nocapture %1, ptr nocapture %2) nounwind {
; X32-SSE2-LABEL: subextract_broadcast_load_constant:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT:    movl $-1583308898, (%edx) # imm = 0xA1A09F9E
; X32-SSE2-NEXT:    movw $-24674, (%ecx) # imm = 0x9F9E
; X32-SSE2-NEXT:    movw $-24160, (%eax) # imm = 0xA1A0
; X32-SSE2-NEXT:    retl
;
; X64-LABEL: subextract_broadcast_load_constant:
; X64:       # %bb.0:
; X64-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
; X64-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
; X64-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
; X64-NEXT:    retq
  store i8 -98, ptr %0, align 1
  %4 = getelementptr inbounds i8, ptr %0, i64 1
  store i8 -97, ptr %4, align 1
  %5 = getelementptr inbounds i8, ptr %0, i64 2
  store i8 -96, ptr %5, align 1
  %6 = getelementptr inbounds i8, ptr %0, i64 3
  store i8 -95, ptr %6, align 1
  %7 = load <2 x i16>, ptr %0, align 4
  %8 = extractelement <2 x i16> %7, i32 0
  store i16 %8, ptr %1, align 2
  %9 = extractelement <2 x i16> %7, i32 1
  store i16 %9, ptr %2, align 2
  ret void
}

; A scalar load is favored over an XMM->GPR register transfer in this example.

define i32 @multi_use_load_scalarization(ptr %p) nounwind {
; X32-SSE2-LABEL: multi_use_load_scalarization:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT:    movl (%ecx), %eax
; X32-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X32-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; X32-SSE2-NEXT:    psubd %xmm1, %xmm0
; X32-SSE2-NEXT:    movdqa %xmm0, (%ecx)
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: multi_use_load_scalarization:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movl (%rdi), %eax
; X64-SSSE3-NEXT:    movdqu (%rdi), %xmm0
; X64-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-SSSE3-NEXT:    psubd %xmm1, %xmm0
; X64-SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: multi_use_load_scalarization:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movl (%rdi), %eax
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %v = load <4 x i32>, ptr %p, align 1
  %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %v1, ptr %p
  %r = extractelement <4 x i32> %v, i64 0
  ret i32 %r
}

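; Same as above, but the vector load is volatile, so it must not be re-read as
; a separate scalar load; the element is extracted from the loaded register.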
define i32 @multi_use_volatile_load_scalarization(ptr %p) nounwind {
; X32-SSE2-LABEL: multi_use_volatile_load_scalarization:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X32-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; X32-SSE2-NEXT:    movd %xmm0, %eax
; X32-SSE2-NEXT:    psubd %xmm1, %xmm0
; X32-SSE2-NEXT:    movdqa %xmm0, (%ecx)
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: multi_use_volatile_load_scalarization:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movdqu (%rdi), %xmm0
; X64-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-SSSE3-NEXT:    movd %xmm0, %eax
; X64-SSSE3-NEXT:    psubd %xmm1, %xmm0
; X64-SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: multi_use_volatile_load_scalarization:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; X64-AVX-NEXT:    vmovd %xmm0, %eax
; X64-AVX-NEXT:    retq
  %v = load volatile <4 x i32>, ptr %p, align 1
  %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %v1, ptr %p
  %r = extractelement <4 x i32> %v, i64 0
  ret i32 %r
}

; This test is reduced from a C source example that showed a miscompile:
; https://github.com/llvm/llvm-project/issues/53695
; The scalarized loads from 'zero' in the AVX asm must occur before
; the vector store to 'zero' overwrites the values.
; If compiled to a binary, this test should return 0 if correct.

@n1 = local_unnamed_addr global <8 x i32> <i32 0, i32 42, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0>, align 32
@zero = internal unnamed_addr global <8 x i32> zeroinitializer, align 32

define i32 @main() nounwind {
; X32-SSE2-LABEL: main:
; X32-SSE2:       # %bb.0:
; X32-SSE2-NEXT:    pushl %ebp
; X32-SSE2-NEXT:    movl %esp, %ebp
; X32-SSE2-NEXT:    pushl %esi
; X32-SSE2-NEXT:    andl $-32, %esp
; X32-SSE2-NEXT:    subl $64, %esp
; X32-SSE2-NEXT:    movdqa zero, %xmm0
; X32-SSE2-NEXT:    movaps n1+16, %xmm1
; X32-SSE2-NEXT:    movaps n1, %xmm2
; X32-SSE2-NEXT:    movaps %xmm2, zero
; X32-SSE2-NEXT:    movaps %xmm1, zero+16
; X32-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [2,2,2,2]
; X32-SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE2-NEXT:    movaps %xmm1, (%esp)
; X32-SSE2-NEXT:    movdqa (%esp), %xmm1
; X32-SSE2-NEXT:    movaps {{[0-9]+}}(%esp), %xmm2
; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X32-SSE2-NEXT:    movd %xmm2, %eax
; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X32-SSE2-NEXT:    movd %xmm2, %ecx
; X32-SSE2-NEXT:    xorl %edx, %edx
; X32-SSE2-NEXT:    divl %ecx
; X32-SSE2-NEXT:    movl %eax, %ecx
; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-SSE2-NEXT:    movd %xmm0, %eax
; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X32-SSE2-NEXT:    movd %xmm0, %esi
; X32-SSE2-NEXT:    xorl %edx, %edx
; X32-SSE2-NEXT:    divl %esi
; X32-SSE2-NEXT:    addl %ecx, %eax
; X32-SSE2-NEXT:    leal -4(%ebp), %esp
; X32-SSE2-NEXT:    popl %esi
; X32-SSE2-NEXT:    popl %ebp
; X32-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: main:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    pushq %rbp
; X64-SSSE3-NEXT:    movq %rsp, %rbp
; X64-SSSE3-NEXT:    andq $-32, %rsp
; X64-SSSE3-NEXT:    subq $64, %rsp
; X64-SSSE3-NEXT:    movdqa zero(%rip), %xmm0
; X64-SSSE3-NEXT:    movq n1@GOTPCREL(%rip), %rax
; X64-SSSE3-NEXT:    movaps (%rax), %xmm1
; X64-SSSE3-NEXT:    movaps 16(%rax), %xmm2
; X64-SSSE3-NEXT:    movaps %xmm1, zero(%rip)
; X64-SSSE3-NEXT:    movaps %xmm2, zero+16(%rip)
; X64-SSSE3-NEXT:    movaps {{.*#+}} xmm1 = [2,2,2,2]
; X64-SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT:    movaps %xmm1, (%rsp)
; X64-SSSE3-NEXT:    movdqa (%rsp), %xmm1
; X64-SSSE3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-SSSE3-NEXT:    movd %xmm2, %eax
; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-SSSE3-NEXT:    movd %xmm2, %ecx
; X64-SSSE3-NEXT:    xorl %edx, %edx
; X64-SSSE3-NEXT:    divl %ecx
; X64-SSSE3-NEXT:    movl %eax, %ecx
; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-SSSE3-NEXT:    movd %xmm0, %eax
; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X64-SSSE3-NEXT:    movd %xmm0, %esi
; X64-SSSE3-NEXT:    xorl %edx, %edx
; X64-SSSE3-NEXT:    divl %esi
; X64-SSSE3-NEXT:    addl %ecx, %eax
; X64-SSSE3-NEXT:    movq %rbp, %rsp
; X64-SSSE3-NEXT:    popq %rbp
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: main:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    pushq %rbp
; X64-AVX1-NEXT:    movq %rsp, %rbp
; X64-AVX1-NEXT:    andq $-32, %rsp
; X64-AVX1-NEXT:    subq $64, %rsp
; X64-AVX1-NEXT:    movq n1@GOTPCREL(%rip), %rax
; X64-AVX1-NEXT:    vmovaps (%rax), %ymm0
; X64-AVX1-NEXT:    movl zero+4(%rip), %ecx
; X64-AVX1-NEXT:    movl zero+8(%rip), %eax
; X64-AVX1-NEXT:    vmovaps %ymm0, zero(%rip)
; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; X64-AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; X64-AVX1-NEXT:    vmovaps (%rsp), %ymm0
; X64-AVX1-NEXT:    vextractps $2, %xmm0, %esi
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %esi
; X64-AVX1-NEXT:    movl %eax, %esi
; X64-AVX1-NEXT:    vextractps $1, %xmm0, %edi
; X64-AVX1-NEXT:    movl %ecx, %eax
; X64-AVX1-NEXT:    xorl %edx, %edx
; X64-AVX1-NEXT:    divl %edi
; X64-AVX1-NEXT:    addl %esi, %eax
; X64-AVX1-NEXT:    movq %rbp, %rsp
; X64-AVX1-NEXT:    popq %rbp
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: main:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    pushq %rbp
; X64-AVX2-NEXT:    movq %rsp, %rbp
; X64-AVX2-NEXT:    andq $-32, %rsp
; X64-AVX2-NEXT:    subq $64, %rsp
; X64-AVX2-NEXT:    movq n1@GOTPCREL(%rip), %rax
; X64-AVX2-NEXT:    vmovaps (%rax), %ymm0
; X64-AVX2-NEXT:    movl zero+4(%rip), %ecx
; X64-AVX2-NEXT:    movl zero+8(%rip), %eax
; X64-AVX2-NEXT:    vmovaps %ymm0, zero(%rip)
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; X64-AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; X64-AVX2-NEXT:    vmovaps (%rsp), %ymm0
; X64-AVX2-NEXT:    vextractps $2, %xmm0, %esi
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %esi
; X64-AVX2-NEXT:    movl %eax, %esi
; X64-AVX2-NEXT:    vextractps $1, %xmm0, %edi
; X64-AVX2-NEXT:    movl %ecx, %eax
; X64-AVX2-NEXT:    xorl %edx, %edx
; X64-AVX2-NEXT:    divl %edi
; X64-AVX2-NEXT:    addl %esi, %eax
; X64-AVX2-NEXT:    movq %rbp, %rsp
; X64-AVX2-NEXT:    popq %rbp
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
  %stackptr = alloca <8 x i32>, align 32
  %z = load <8 x i32>, ptr @zero, align 32
  %t1 = load <8 x i32>, ptr @n1, align 32
  store <8 x i32> %t1, ptr @zero, align 32
  store volatile <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, ptr %stackptr, align 32
  %stackload = load volatile <8 x i32>, ptr %stackptr, align 32
  %div = udiv <8 x i32> %z, %stackload
  %e1 = extractelement <8 x i32> %div, i64 1
  %e2 = extractelement <8 x i32> %div, i64 2
  %r = add i32 %e1, %e2
  ret i32 %r
}