1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
3
; Element-wise multiply-add over [2 x double] operands reached through pointers.
; The two scalar results are packed into a [2 x double] aggregate with an
; insertvalue chain and stored through the sret pointer.
;   %0      - sret pointer that receives the resulting [2 x double]
;   %1      - addend array:  result[i] = %2[i] * %3[i] + %1[i]
;   %2, %3  - factor arrays
; The expected output below replaces the scalar loads, fmul and fadd with
; <2 x double> operations (the vector loads keep align 4) and feeds the
; original insertvalue chain from one extractelement per lane.
4define void @julia_2xdouble([2 x double]* sret([2 x double]), [2 x double]*, [2 x double]*, [2 x double]*) {
5; CHECK-LABEL: @julia_2xdouble(
6; CHECK-NEXT:  top:
7; CHECK-NEXT:    [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0
8; CHECK-NEXT:    [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0
9; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
10; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>*
11; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
12; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>*
13; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
14; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
15; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>*
16; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4
17; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
18; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
19; CHECK-NEXT:    [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0
20; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
21; CHECK-NEXT:    [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1
22; CHECK-NEXT:    store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4
23; CHECK-NEXT:    ret void
24;
25top:
  ; Scalar input: both products (%m0, %m1) are formed first; the adds are then
  ; interleaved with the insertvalue chain that builds the stored aggregate.
26  %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
27  %x0 = load double, double* %px0, align 4
28  %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
29  %y0 = load double, double* %py0, align 4
30  %m0 = fmul double %x0, %y0
31  %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
32  %x1 = load double, double* %px1, align 4
33  %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
34  %y1 = load double, double* %py1, align 4
35  %m1 = fmul double %x1, %y1
36  %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
37  %z0 = load double, double* %pz0, align 4
38  %a0 = fadd double %m0, %z0
39  %i0 = insertvalue [2 x double] undef, double %a0, 0
40  %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
41  %z1 = load double, double* %pz1, align 4
42  %a1 = fadd double %m1, %z1
43  %i1 = insertvalue [2 x double] %i0, double %a1, 1
44  store [2 x double] %i1, [2 x double]* %0, align 4
45  ret void
46}
47
; Four-lane float variant of the multiply-add pattern: result[i] =
; %2[i] * %3[i] + %1[i] for i = 0..3, packed into a [4 x float] aggregate with
; an insertvalue chain and stored through the sret pointer %0.
;   %0      - sret pointer that receives the resulting [4 x float]
;   %1      - addend array
;   %2, %3  - factor arrays
; The expected output below uses <4 x float> loads, one vector fmul and one
; vector fadd, then one extractelement per lane to feed the insertvalue chain.
48define void @julia_4xfloat([4 x float]* sret([4 x float]), [4 x float]*, [4 x float]*, [4 x float]*) {
49; CHECK-LABEL: @julia_4xfloat(
50; CHECK-NEXT:  top:
51; CHECK-NEXT:    [[PX0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2:%.*]], i64 0, i64 0
52; CHECK-NEXT:    [[PY0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3:%.*]], i64 0, i64 0
53; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
54; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>*
55; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
56; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>*
57; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
58; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
59; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>*
60; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
61; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
62; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
63; CHECK-NEXT:    [[I0:%.*]] = insertvalue [4 x float] undef, float [[TMP12]], 0
64; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
65; CHECK-NEXT:    [[I1:%.*]] = insertvalue [4 x float] [[I0]], float [[TMP13]], 1
66; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
67; CHECK-NEXT:    [[I2:%.*]] = insertvalue [4 x float] [[I1]], float [[TMP14]], 2
68; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
69; CHECK-NEXT:    [[I3:%.*]] = insertvalue [4 x float] [[I2]], float [[TMP15]], 3
70; CHECK-NEXT:    store [4 x float] [[I3]], [4 x float]* [[TMP0:%.*]], align 4
71; CHECK-NEXT:    ret void
72;
73top:
  ; Scalar input: the four products (%m0..%m3) are formed first; the adds are
  ; then interleaved with the insertvalue chain that builds the stored aggregate.
74  %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
75  %x0 = load float, float* %px0, align 4
76  %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
77  %y0 = load float, float* %py0, align 4
78  %m0 = fmul float %x0, %y0
79  %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
80  %x1 = load float, float* %px1, align 4
81  %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
82  %y1 = load float, float* %py1, align 4
83  %m1 = fmul float %x1, %y1
84  %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
85  %x2 = load float, float* %px2, align 4
86  %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
87  %y2 = load float, float* %py2, align 4
88  %m2 = fmul float %x2, %y2
89  %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
90  %x3 = load float, float* %px3, align 4
91  %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
92  %y3 = load float, float* %py3, align 4
93  %m3 = fmul float %x3, %y3
94  %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
95  %z0 = load float, float* %pz0, align 4
96  %a0 = fadd float %m0, %z0
97  %i0 = insertvalue [4 x float] undef, float %a0, 0
98  %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
99  %z1 = load float, float* %pz1, align 4
100  %a1 = fadd float %m1, %z1
101  %i1 = insertvalue [4 x float] %i0, float %a1, 1
102  %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
103  %z2 = load float, float* %pz2, align 4
104  %a2 = fadd float %m2, %z2
105  %i2 = insertvalue [4 x float] %i1, float %a2, 2
106  %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
107  %z3 = load float, float* %pz3, align 4
108  %a3 = fadd float %m3, %z3
109  %i3 = insertvalue [4 x float] %i2, float %a3, 3
110  store [4 x float] %i3, [4 x float]* %0, align 4
111  ret void
112}
113
; Element-wise float subtraction where each operand is loaded as a whole
; [4 x float] aggregate and taken apart with extractvalue.
;   %a, %b - pointers to the minuend and subtrahend arrays
;   %c     - pointer that receives the [4 x float] of differences
; The expected output below replaces each aggregate load with a bitcast of the
; array pointer plus a <4 x float> load, performs a single vector fsub, and
; feeds the insertvalue chain from one extractelement per lane.
114define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
115; CHECK-LABEL: @julia_load_array_of_float(
116; CHECK-NEXT:  top:
117; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [4 x float]* [[A:%.*]] to <4 x float>*
118; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
119; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [4 x float]* [[B:%.*]] to <4 x float>*
120; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
121; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
122; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
123; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x float] undef, float [[TMP5]], 0
124; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
125; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x float] [[C_ARR0]], float [[TMP6]], 1
126; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
127; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x float] [[C_ARR1]], float [[TMP7]], 2
128; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
129; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x float] [[C_ARR2]], float [[TMP8]], 3
130; CHECK-NEXT:    store [4 x float] [[C_ARR3]], [4 x float]* [[C:%.*]], align 4
131; CHECK-NEXT:    ret void
132;
133top:
  ; Scalar input: the extractvalue and fsub instructions appear out of index
  ; order (for example %c1 is computed before %c0), while the insertvalue chain
  ; fills indices 0..3 in order.
134  %a_arr = load [4 x float], [4 x float]* %a, align 4
135  %a0 = extractvalue [4 x float] %a_arr, 0
136  %a2 = extractvalue [4 x float] %a_arr, 2
137  %a1 = extractvalue [4 x float] %a_arr, 1
138  %b_arr = load [4 x float], [4 x float]* %b, align 4
139  %b0 = extractvalue [4 x float] %b_arr, 0
140  %b2 = extractvalue [4 x float] %b_arr, 2
141  %b1 = extractvalue [4 x float] %b_arr, 1
142  %a3 = extractvalue [4 x float] %a_arr, 3
143  %c1 = fsub float %a1, %b1
144  %b3 = extractvalue [4 x float] %b_arr, 3
145  %c0 = fsub float %a0, %b0
146  %c2 = fsub float %a2, %b2
147  %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
148  %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
149  %c3 = fsub float %a3, %b3
150  %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
151  %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
152  store [4 x float] %c_arr3, [4 x float]* %c, align 4
153  ret void
154}
155
; Element-wise i32 subtraction where each operand is loaded as a whole
; [4 x i32] aggregate and taken apart with extractvalue.
;   %a, %b - pointers to the minuend and subtrahend arrays
;   %c     - pointer that receives the [4 x i32] of differences
; The expected output below replaces each aggregate load with a bitcast of the
; array pointer plus a <4 x i32> load, performs a single vector sub, and feeds
; the insertvalue chain from one extractelement per lane.
156define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
157; CHECK-LABEL: @julia_load_array_of_i32(
158; CHECK-NEXT:  top:
159; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [4 x i32]* [[A:%.*]] to <4 x i32>*
160; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
161; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [4 x i32]* [[B:%.*]] to <4 x i32>*
162; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
163; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
164; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
165; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x i32] undef, i32 [[TMP5]], 0
166; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
167; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x i32] [[C_ARR0]], i32 [[TMP6]], 1
168; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
169; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x i32] [[C_ARR1]], i32 [[TMP7]], 2
170; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
171; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x i32] [[C_ARR2]], i32 [[TMP8]], 3
172; CHECK-NEXT:    store [4 x i32] [[C_ARR3]], [4 x i32]* [[C:%.*]], align 4
173; CHECK-NEXT:    ret void
174;
175top:
  ; Scalar input: the extractvalue and sub instructions appear out of index
  ; order (for example %c1 is computed before %c0), while the insertvalue chain
  ; fills indices 0..3 in order.
176  %a_arr = load [4 x i32], [4 x i32]* %a, align 4
177  %a0 = extractvalue [4 x i32] %a_arr, 0
178  %a2 = extractvalue [4 x i32] %a_arr, 2
179  %a1 = extractvalue [4 x i32] %a_arr, 1
180  %b_arr = load [4 x i32], [4 x i32]* %b, align 4
181  %b0 = extractvalue [4 x i32] %b_arr, 0
182  %b2 = extractvalue [4 x i32] %b_arr, 2
183  %b1 = extractvalue [4 x i32] %b_arr, 1
184  %a3 = extractvalue [4 x i32] %a_arr, 3
185  %c1 = sub i32 %a1, %b1
186  %b3 = extractvalue [4 x i32] %b_arr, 3
187  %c0 = sub i32 %a0, %b0
188  %c2 = sub i32 %a2, %b2
189  %c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
190  %c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
191  %c3 = sub i32 %a3, %b3
192  %c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
193  %c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
194  store [4 x i32] %c_arr3, [4 x i32]* %c, align 4
195  ret void
196}
197
198; Almost identical to the previous test, but for a type that should NOT be vectorized.
199;
; Negative test: element-wise i16 subtraction on operands loaded as whole
; [4 x i16] aggregates and taken apart with extractvalue.
;   %a, %b - pointers to the minuend and subtrahend arrays
;   %c     - pointer that receives the [4 x i16] of differences
; The expected output below is the input IR unchanged: the aggregate loads,
; the scalar subs and the insertvalue chain all remain, with no vector
; instructions introduced.
; NOTE(review): presumably the pattern is rejected because a <4 x i16> vector
; would be only 64 bits wide, narrower than what the pass accepts for this
; target - confirm against the SLP vectorizer implementation.
200define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
201; CHECK-LABEL: @julia_load_array_of_i16(
202; CHECK-NEXT:  top:
203; CHECK-NEXT:    [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4
204; CHECK-NEXT:    [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0
205; CHECK-NEXT:    [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2
206; CHECK-NEXT:    [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1
207; CHECK-NEXT:    [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4
208; CHECK-NEXT:    [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0
209; CHECK-NEXT:    [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2
210; CHECK-NEXT:    [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1
211; CHECK-NEXT:    [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3
212; CHECK-NEXT:    [[C1:%.*]] = sub i16 [[A1]], [[B1]]
213; CHECK-NEXT:    [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3
214; CHECK-NEXT:    [[C0:%.*]] = sub i16 [[A0]], [[B0]]
215; CHECK-NEXT:    [[C2:%.*]] = sub i16 [[A2]], [[B2]]
216; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0
217; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1
218; CHECK-NEXT:    [[C3:%.*]] = sub i16 [[A3]], [[B3]]
219; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2
220; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3
221; CHECK-NEXT:    store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4
222; CHECK-NEXT:    ret void
223;
224top:
  ; Scalar input: same instruction shape and ordering as the i32 variant of
  ; this test, with i16 as the element type.
225  %a_arr = load [4 x i16], [4 x i16]* %a, align 4
226  %a0 = extractvalue [4 x i16] %a_arr, 0
227  %a2 = extractvalue [4 x i16] %a_arr, 2
228  %a1 = extractvalue [4 x i16] %a_arr, 1
229  %b_arr = load [4 x i16], [4 x i16]* %b, align 4
230  %b0 = extractvalue [4 x i16] %b_arr, 0
231  %b2 = extractvalue [4 x i16] %b_arr, 2
232  %b1 = extractvalue [4 x i16] %b_arr, 1
233  %a3 = extractvalue [4 x i16] %a_arr, 3
234  %c1 = sub i16 %a1, %b1
235  %b3 = extractvalue [4 x i16] %b_arr, 3
236  %c0 = sub i16 %a0, %b0
237  %c2 = sub i16 %a2, %b2
238  %c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
239  %c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
240  %c3 = sub i16 %a3, %b3
241  %c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
242  %c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
243  store [4 x i16] %c_arr3, [4 x i16]* %c, align 4
244  ret void
245}
246
; Struct of four float fields; @julia_load_struct_of_float loads and stores it
; as a whole aggregate, and the expected output there accesses it as <4 x float>.
247%pseudovec = type { float, float, float, float }
248
; Element-wise float subtraction where each operand is loaded as a whole
; %pseudovec struct (four float fields) and taken apart with extractvalue.
;   %a, %b - pointers to the minuend and subtrahend structs
;   %c     - pointer that receives the %pseudovec of differences
; The expected output below replaces each struct load with a bitcast of the
; struct pointer plus a <4 x float> load, performs a single vector fsub, and
; feeds the insertvalue chain from one extractelement per lane.
249define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
250; CHECK-LABEL: @julia_load_struct_of_float(
251; CHECK-NEXT:  top:
252; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %pseudovec* [[A:%.*]] to <4 x float>*
253; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
254; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %pseudovec* [[B:%.*]] to <4 x float>*
255; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
256; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
257; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
258; CHECK-NEXT:    [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC:%.*]] undef, float [[TMP5]], 0
259; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
260; CHECK-NEXT:    [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT0]], float [[TMP6]], 1
261; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
262; CHECK-NEXT:    [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT1]], float [[TMP7]], 2
263; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
264; CHECK-NEXT:    [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT2]], float [[TMP8]], 3
265; CHECK-NEXT:    store [[PSEUDOVEC]] [[C_STRUCT3]], %pseudovec* [[C:%.*]], align 4
266; CHECK-NEXT:    ret void
267;
268top:
  ; Scalar input: extracts, subtractions and inserts are interleaved, and %c3
  ; is computed before %c2, while the insertvalue chain fills fields 0..3 in
  ; order.
269  %a_struct = load %pseudovec, %pseudovec* %a, align 4
270  %a0 = extractvalue %pseudovec %a_struct, 0
271  %a1 = extractvalue %pseudovec %a_struct, 1
272  %b_struct = load %pseudovec, %pseudovec* %b, align 4
273  %a2 = extractvalue %pseudovec %a_struct, 2
274  %b0 = extractvalue %pseudovec %b_struct, 0
275  %a3 = extractvalue %pseudovec %a_struct, 3
276  %c0 = fsub float %a0, %b0
277  %b1 = extractvalue %pseudovec %b_struct, 1
278  %b2 = extractvalue %pseudovec %b_struct, 2
279  %c1 = fsub float %a1, %b1
280  %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
281  %b3 = extractvalue %pseudovec %b_struct, 3
282  %c3 = fsub float %a3, %b3
283  %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
284  %c2 = fsub float %a2, %b2
285  %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
286  %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
287  store %pseudovec %c_struct3, %pseudovec* %c, align 4
288  ret void
289}
290