1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -verify | FileCheck %s -check-prefix=ENABLED
3;
4; Without supernode operand reordering, this does not get fully vectorized.
5; S[0] = (A[0] + B[0]) + C[0]
6; S[1] = (B[1] + C[1]) + A[1]
define void @test_supernode_add(double* %Aarray, double* %Barray, double *%Carray, double *%Sarray) {
; ENABLED-LABEL: @test_supernode_add(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; ENABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; ENABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Addresses of elements 0 and 1 of each of the four arrays.
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double *%idxA0, align 8
  %A1 = load double, double *%idxA1, align 8

  %B0 = load double, double *%idxB0, align 8
  %B1 = load double, double *%idxB1, align 8

  %C0 = load double, double *%idxC0, align 8
  %C1 = load double, double *%idxC1, align 8

  ; The inner adds pair B with {A0, C1}; the supernode reordering must swap
  ; the non-matching operands (A0/C1 vs. C0/A1) so both lanes vectorize.
  %addA0B0 = fadd fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %add0 = fadd fast double %addA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  store double %add0, double *%idxS0, align 8
  store double %add1, double *%idxS1, align 8
  ret void
}
59
60
61; Without supernode operand reordering, this does not get fully vectorized.
62; S[0] = (A[0] - B[0]) + C[0]
63; S[1] = (C[1] - B[1]) + A[1]
define void @test_supernode_addsub(double* %Aarray, double* %Barray, double *%Carray, double *%Sarray) {
; ENABLED-LABEL: @test_supernode_addsub(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; ENABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; ENABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Addresses of elements 0 and 1 of each of the four arrays.
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double *%idxA0, align 8
  %A1 = load double, double *%idxA1, align 8

  %B0 = load double, double *%idxB0, align 8
  %B1 = load double, double *%idxB1, align 8

  %C0 = load double, double *%idxC0, align 8
  %C1 = load double, double *%idxC1, align 8

  ; Both inner ops are fsub with B on the RHS; the minuends (A0/C1) come from
  ; different arrays, so operand reordering is needed to form the vector lanes.
  %subA0B0 = fsub fast double %A0, %B0
  %subC1B1 = fsub fast double %C1, %B1
  %add0 = fadd fast double %subA0B0, %C0
  %add1 = fadd fast double %subC1B1, %A1
  store double %add0, double *%idxS0, align 8
  store double %add1, double *%idxS1, align 8
  ret void
}
116
117; Without supernode operand reordering, this does not get fully vectorized.
118; This checks that the super-node works with alternate sequences.
119;
120; S[0] = (A[0] - B[0]) - C[0]
121; S[1] = (B[1] + C[1]) + A[1]
define void @test_supernode_addsub_alt(double* %Aarray, double* %Barray, double *%Carray, double *%Sarray) {
; ENABLED-LABEL: @test_supernode_addsub_alt(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; ENABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; ENABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
; ENABLED-NEXT:    [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
; ENABLED-NEXT:    [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]]
; ENABLED-NEXT:    [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]]
; ENABLED-NEXT:    [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]]
; ENABLED-NEXT:    store double [[SUB0]], double* [[IDXS0]], align 8
; ENABLED-NEXT:    store double [[ADD1]], double* [[IDXS1]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Addresses of elements 0 and 1 of each of the four arrays.
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  %A0 = load double, double *%idxA0, align 8
  %A1 = load double, double *%idxA1, align 8

  %B0 = load double, double *%idxB0, align 8
  %B1 = load double, double *%idxB1, align 8

  %C0 = load double, double *%idxC0, align 8
  %C1 = load double, double *%idxC1, align 8

  ; Alternating opcodes (fsub in lane 0, fadd in lane 1) at both levels.
  ; The checks above show this currently stays scalar.
  %subA0B0 = fsub fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %sub0 = fsub fast double %subA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  store double %sub0, double *%idxS0, align 8
  store double %add1, double *%idxS1, align 8
  ret void
}
174
175; This checks that vectorizeTree() works correctly with the supernode
176; and does not generate uses before defs.
177; If all of the operands of the supernode are vectorizable, then the scheduler
178; will fix their position in the program. If not, then the scheduler may not
179; touch them, leading to uses before defs.
180;
181; A0 = ...
182; C = ...
183; t1 = A0 + C
184; B0 = ...
185; t2 = t1 + B0
186; A1 = ...
187; B1 = ...
188; t3 = A1 + B1
189; D = ...
190; t4 = t3 + D
191;
192;
193;  A0  C   A1  B1              A0  C    A1  D            A0:1  C,D
194;   \ /      \ /    Reorder      \ /      \ /    Bundles     \ /
195; t1 + B0  t3 + D   ------->   t1 + B0  t3 + B1  ------> t1:3 + B0:1
196;    |/       |/                  |/       |/                 |/
197; t2 +     t4 +                t2 +     t4 +             t2:4 +
198;
199; After reordering, 'D' conceptually becomes an operand of t3:
200; t3 = A1 + D
201; But D is defined *after* its use.
202;
define void @supernode_scheduling(double* %Aarray, double* %Barray, double *%Carray, double *%Darray, double *%Sarray) {
; ENABLED-LABEL: @supernode_scheduling(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
; ENABLED-NEXT:    [[IDXC:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXD:%.*]] = getelementptr inbounds double, double* [[DARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[C:%.*]] = load double, double* [[IDXC]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1
; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; ENABLED-NEXT:    [[D:%.*]] = load double, double* [[IDXD]], align 8
; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1
; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxC = getelementptr inbounds double, double* %Carray, i64 0
  %idxD = getelementptr inbounds double, double* %Darray, i64 0
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1


  ; The loads are deliberately interleaved with the adds (see the diagram
  ; above): %D is defined after %t3, its post-reordering conceptual user,
  ; so the scheduler must move instructions to avoid a use before its def.
  %A0 = load double, double *%idxA0, align 8
  %C = load double, double *%idxC, align 8
  %t1 = fadd fast double %A0, %C
  %B0 = load double, double *%idxB0, align 8
  %t2 = fadd fast double %t1, %B0
  %A1 = load double, double *%idxA1, align 8
  %B1 = load double, double *%idxB1, align 8
  %t3 = fadd fast double %A1, %B1
  %D = load double, double *%idxD, align 8
  %t4 = fadd fast double %t3, %D

  store double %t2, double *%idxS0, align 8
  store double %t4, double *%idxS1, align 8
  ret void
}
254
255
256; The SLP scheduler has trouble moving instructions across blocks.
257; Even though we can build a SuperNode for this example, we should not because the scheduler
258; cannot handle the cross-block instruction motion that is required once the operands of the
259; SuperNode are reordered.
260;
261; bb1:
262;  A0 = ...
263;  B1 = ...
264;  Tmp0 = A0 + 2.0
265;  Tmp1 = B1 + 2.0
266;
267; bb2:
268;  A1 = ...
269;  B0 = ...
270;  S[0] = Tmp0 + B0
271;  S[1] = Tmp1 + A1
define void @supernode_scheduling_cross_block(double* %Aarray, double* %Barray, double *%Sarray) {
; ENABLED-LABEL: @supernode_scheduling_cross_block(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1
; ENABLED-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], <double 2.000000e+00, double 2.000000e+00>
; ENABLED-NEXT:    br label [[BB:%.*]]
; ENABLED:       bb:
; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
; ENABLED-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; ENABLED-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1

  ; %Tmp0/%Tmp1 are computed in 'entry' ...
  %A0 = load double, double *%idxA0, align 8
  %B1 = load double, double *%idxB1, align 8
  %Tmp0 = fadd fast double %A0, 2.0
  %Tmp1 = fadd fast double %B1, 2.0
br label %bb

bb:
  ; ... but consumed in 'bb'. Reordering the supernode operands would
  ; require moving instructions across the block boundary, which the SLP
  ; scheduler cannot do; the checks show the cross-block trees stay as-is.
  %A1 = load double, double *%idxA1, align 8
  %B0 = load double, double *%idxB0, align 8

  %Sum0 = fadd fast double %Tmp0, %B0
  %Sum1 = fadd fast double %Tmp1, %A1

  store double %Sum0, double *%idxS0, align 8
  store double %Sum1, double *%idxS1, align 8
  ret void
}
321