; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s

; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
;
; chains:
; 1: base: base1 + offset, offsets: (0, offset)
; 2: base: base1 + 3*offset, offsets: (0, offset)
;
; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   long long o4 = base1 + 4 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
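; A minimal C-level sketch (an illustration of the transformation, not literal pass
; output): after commoning, the loop keeps only the two chain bases live and reuses
; the single offset `offset` for the second element of each chain, which matches the
; per-chain pair of loads (one at offset 0, one indexed by `offset`) and the two
; pointer increments in the CHECK lines below.
;
;   char *b1 = p + base1 + offset;      // chain 1 base
;   char *b2 = p + base1 + 3 * offset;  // chain 2 base
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)b1;
;     unsigned long x2 = *(unsigned long *)(b1 + offset);
;     unsigned long x3 = *(unsigned long *)b2;
;     unsigned long x4 = *(unsigned long *)(b2 + offset);
;     sum += x1 * x2 * x3 * x4;
;     ++b1;
;     ++b2;
;   }
;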
35define i64 @two_chain_same_offset_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
36; CHECK-LABEL: two_chain_same_offset_succ:
37; CHECK:       # %bb.0: # %entry
38; CHECK-NEXT:    cmpdi r6, 1
39; CHECK-NEXT:    blt cr0, .LBB0_4
40; CHECK-NEXT:  # %bb.1: # %for.body.preheader
41; CHECK-NEXT:    sldi r7, r4, 1
42; CHECK-NEXT:    mtctr r6
43; CHECK-NEXT:    add r8, r4, r7
44; CHECK-NEXT:    add r7, r5, r4
45; CHECK-NEXT:    add r5, r5, r8
46; CHECK-NEXT:    add r7, r3, r7
47; CHECK-NEXT:    add r5, r3, r5
48; CHECK-NEXT:    li r3, 0
49; CHECK-NEXT:    .p2align 4
50; CHECK-NEXT:  .LBB0_2: # %for.body
51; CHECK-NEXT:    #
52; CHECK-NEXT:    ld r6, 0(r7)
53; CHECK-NEXT:    ldx r8, r7, r4
54; CHECK-NEXT:    ld r9, 0(r5)
55; CHECK-NEXT:    ldx r10, r5, r4
56; CHECK-NEXT:    addi r7, r7, 1
57; CHECK-NEXT:    addi r5, r5, 1
58; CHECK-NEXT:    mulld r6, r8, r6
59; CHECK-NEXT:    mulld r6, r6, r9
60; CHECK-NEXT:    maddld r3, r6, r10, r3
61; CHECK-NEXT:    bdnz .LBB0_2
62; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
63; CHECK-NEXT:    blr
64; CHECK-NEXT:  .LBB0_4:
65; CHECK-NEXT:    li r3, 0
66; CHECK-NEXT:    blr
67entry:
68  %mul = shl nsw i64 %offset, 1
69  %mul2 = mul nsw i64 %offset, 3
70  %mul4 = shl nsw i64 %offset, 2
71  %cmp46 = icmp sgt i64 %n, 0
72  br i1 %cmp46, label %for.body, label %for.cond.cleanup
73
74for.cond.cleanup:                                 ; preds = %for.body, %entry
75  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
76  ret i64 %sum.0.lcssa
77
78for.body:                                         ; preds = %entry, %for.body
79  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
80  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
81  %add = add i64 %i.047, %base1
82  %add.ptr9.idx = add i64 %add, %offset
83  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
84  %0 = bitcast i8* %add.ptr9 to i64*
85  %1 = load i64, i64* %0, align 8
86  %add.ptr10.idx = add i64 %add, %mul
87  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
88  %2 = bitcast i8* %add.ptr10 to i64*
89  %3 = load i64, i64* %2, align 8
90  %add.ptr11.idx = add i64 %add, %mul2
91  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
92  %4 = bitcast i8* %add.ptr11 to i64*
93  %5 = load i64, i64* %4, align 8
94  %add.ptr12.idx = add i64 %add, %mul4
95  %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
96  %6 = bitcast i8* %add.ptr12 to i64*
97  %7 = load i64, i64* %6, align 8
98  %mul13 = mul i64 %3, %1
99  %mul14 = mul i64 %mul13, %5
100  %mul15 = mul i64 %mul14, %7
101  %add16 = add i64 %mul15, %sum.048
102  %inc = add nuw nsw i64 %i.047, 1
103  %exitcond.not = icmp eq i64 %inc, %n
104  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
105}
106
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
; 5: + offset
;
; These addresses cannot all be commoned into chains: with five addresses, one of them
; would be left as a chain with a single address. Commoning chains is not profitable
; unless every address belongs to a chain.
;
; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   long long o4 = base1 + 4 * offset;
;   long long o5 = base1 + 5 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   char *p5 = p + o5;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     unsigned long x5 = *(unsigned long *)(p5 + i);
;     sum += x1 * x2 * x3 * x4 * x5;
;   }
;   return sum;
; }
;
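; For illustration only (assuming two-element chains), the best grouping under the
; same-offset rule would leave one address without a partner, so commoning is rejected:
; 1: base: base1 + offset, offsets: (0, offset)
; 2: base: base1 + 3*offset, offsets: (0, offset)
; leftover: base1 + 5*offset
;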
140define i64 @not_perfect_chain_all_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
141; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
142; CHECK:       # %bb.0: # %entry
143; CHECK-NEXT:    cmpdi r6, 1
144; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
145; CHECK-NEXT:    blt cr0, .LBB1_4
146; CHECK-NEXT:  # %bb.1: # %for.body.preheader
147; CHECK-NEXT:    sldi r7, r4, 1
148; CHECK-NEXT:    sldi r9, r4, 2
149; CHECK-NEXT:    add r5, r3, r5
150; CHECK-NEXT:    li r3, 0
151; CHECK-NEXT:    add r8, r4, r7
152; CHECK-NEXT:    mtctr r6
153; CHECK-NEXT:    add r10, r4, r9
154; CHECK-NEXT:    .p2align 4
155; CHECK-NEXT:  .LBB1_2: # %for.body
156; CHECK-NEXT:    #
157; CHECK-NEXT:    ldx r6, r5, r4
158; CHECK-NEXT:    ldx r11, r5, r7
159; CHECK-NEXT:    ldx r12, r5, r8
160; CHECK-NEXT:    ldx r0, r5, r9
161; CHECK-NEXT:    mulld r6, r11, r6
162; CHECK-NEXT:    ldx r30, r5, r10
163; CHECK-NEXT:    addi r5, r5, 1
164; CHECK-NEXT:    mulld r6, r6, r12
165; CHECK-NEXT:    mulld r6, r6, r0
166; CHECK-NEXT:    maddld r3, r6, r30, r3
167; CHECK-NEXT:    bdnz .LBB1_2
168; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
169; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
170; CHECK-NEXT:    blr
171; CHECK-NEXT:  .LBB1_4:
172; CHECK-NEXT:    li r3, 0
173; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
174; CHECK-NEXT:    blr
175entry:
176  %mul = shl nsw i64 %offset, 1
177  %mul2 = mul nsw i64 %offset, 3
178  %mul4 = shl nsw i64 %offset, 2
179  %mul6 = mul nsw i64 %offset, 5
180  %cmp58 = icmp sgt i64 %n, 0
181  br i1 %cmp58, label %for.body, label %for.cond.cleanup
182
183for.cond.cleanup:                                 ; preds = %for.body, %entry
184  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
185  ret i64 %sum.0.lcssa
186
187for.body:                                         ; preds = %entry, %for.body
188  %sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
189  %i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
190  %add = add i64 %i.059, %base1
191  %add.ptr12.idx = add i64 %add, %offset
192  %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
193  %0 = bitcast i8* %add.ptr12 to i64*
194  %1 = load i64, i64* %0, align 8
195  %add.ptr13.idx = add i64 %add, %mul
196  %add.ptr13 = getelementptr inbounds i8, i8* %p, i64 %add.ptr13.idx
197  %2 = bitcast i8* %add.ptr13 to i64*
198  %3 = load i64, i64* %2, align 8
199  %add.ptr14.idx = add i64 %add, %mul2
200  %add.ptr14 = getelementptr inbounds i8, i8* %p, i64 %add.ptr14.idx
201  %4 = bitcast i8* %add.ptr14 to i64*
202  %5 = load i64, i64* %4, align 8
203  %add.ptr15.idx = add i64 %add, %mul4
204  %add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
205  %6 = bitcast i8* %add.ptr15 to i64*
206  %7 = load i64, i64* %6, align 8
207  %add.ptr16.idx = add i64 %add, %mul6
208  %add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
209  %8 = bitcast i8* %add.ptr16 to i64*
210  %9 = load i64, i64* %8, align 8
211  %mul17 = mul i64 %3, %1
212  %mul18 = mul i64 %mul17, %5
213  %mul19 = mul i64 %mul18, %7
214  %mul20 = mul i64 %mul19, %9
215  %add21 = add i64 %mul20, %sum.060
216  %inc = add nuw nsw i64 %i.059, 1
217  %exitcond.not = icmp eq i64 %inc, %n
218  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
219}
220
; addresses:
; 1: base1
; 2: + 2*offset
; 3: + offset
;
; We need at least 4 addresses to common 2 chains and reuse at least 1 offset.
;
; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     sum += x1 * x2 * x3;
;   }
;   return sum;
; }
;
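; For illustration: the minimum profitable configuration is 2 chains x 2 addresses
; per chain = 4 addresses, but only 3 addresses are available here, so no commoning
; is done.
;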
245define i64 @no_enough_elements_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
246; CHECK-LABEL: no_enough_elements_fail:
247; CHECK:       # %bb.0: # %entry
248; CHECK-NEXT:    cmpdi r6, 1
249; CHECK-NEXT:    blt cr0, .LBB2_4
250; CHECK-NEXT:  # %bb.1: # %for.body.preheader
251; CHECK-NEXT:    sldi r7, r4, 1
252; CHECK-NEXT:    mtctr r6
253; CHECK-NEXT:    add r5, r3, r5
254; CHECK-NEXT:    li r3, 0
255; CHECK-NEXT:    add r4, r4, r7
256; CHECK-NEXT:    .p2align 5
257; CHECK-NEXT:  .LBB2_2: # %for.body
258; CHECK-NEXT:    #
259; CHECK-NEXT:    ld r6, 0(r5)
260; CHECK-NEXT:    ldx r8, r5, r7
261; CHECK-NEXT:    ldx r9, r5, r4
262; CHECK-NEXT:    addi r5, r5, 1
263; CHECK-NEXT:    mulld r6, r8, r6
264; CHECK-NEXT:    maddld r3, r6, r9, r3
265; CHECK-NEXT:    bdnz .LBB2_2
266; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
267; CHECK-NEXT:    blr
268; CHECK-NEXT:  .LBB2_4:
269; CHECK-NEXT:    li r3, 0
270; CHECK-NEXT:    blr
271entry:
272  %mul = shl nsw i64 %offset, 1
273  %mul1 = mul nsw i64 %offset, 3
274  %cmp32 = icmp sgt i64 %n, 0
275  br i1 %cmp32, label %for.body, label %for.cond.cleanup
276
277for.cond.cleanup:                                 ; preds = %for.body, %entry
278  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
279  ret i64 %sum.0.lcssa
280
281for.body:                                         ; preds = %entry, %for.body
282  %sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
283  %i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
284  %add.ptr5.idx = add i64 %i.033, %base1
285  %add.ptr5 = getelementptr inbounds i8, i8* %p, i64 %add.ptr5.idx
286  %0 = bitcast i8* %add.ptr5 to i64*
287  %1 = load i64, i64* %0, align 8
288  %add.ptr6.idx = add i64 %add.ptr5.idx, %mul
289  %add.ptr6 = getelementptr inbounds i8, i8* %p, i64 %add.ptr6.idx
290  %2 = bitcast i8* %add.ptr6 to i64*
291  %3 = load i64, i64* %2, align 8
292  %add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
293  %add.ptr7 = getelementptr inbounds i8, i8* %p, i64 %add.ptr7.idx
294  %4 = bitcast i8* %add.ptr7 to i64*
295  %5 = load i64, i64* %4, align 8
296  %mul8 = mul i64 %3, %1
297  %mul9 = mul i64 %mul8, %5
298  %add10 = add i64 %mul9, %sum.034
299  %inc = add nuw nsw i64 %i.033, 1
300  %exitcond.not = icmp eq i64 %inc, %n
301  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
302}
303
; addresses:
; 1: base1
; 2: + 2*offset
; 3: + 2*offset
; 4: + 3*offset
;
; The diff between address 2 and address 1 is 2*offset, but that offset is not reused by
; the other candidate chain (the diff between address 4 and address 3 is 3*offset), so we
; cannot common any chains.
;
; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 4 * offset;
;   long long o4 = base1 + 7 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
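; For illustration only, the two candidate chains would have mismatched offset sets
; and are therefore rejected:
; 1: base: base1, offsets: (0, 2*offset)
; 2: base: base1 + 4*offset, offsets: (0, 3*offset)
;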
333define i64 @no_reuseable_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
334; CHECK-LABEL: no_reuseable_offset_fail:
335; CHECK:       # %bb.0: # %entry
336; CHECK-NEXT:    cmpdi r6, 1
337; CHECK-NEXT:    blt cr0, .LBB3_4
338; CHECK-NEXT:  # %bb.1: # %for.body.preheader
339; CHECK-NEXT:    sldi r9, r4, 3
340; CHECK-NEXT:    mtctr r6
341; CHECK-NEXT:    add r5, r3, r5
342; CHECK-NEXT:    li r3, 0
343; CHECK-NEXT:    sldi r7, r4, 1
344; CHECK-NEXT:    sldi r8, r4, 2
345; CHECK-NEXT:    sub r4, r9, r4
346; CHECK-NEXT:    .p2align 4
347; CHECK-NEXT:  .LBB3_2: # %for.body
348; CHECK-NEXT:    #
349; CHECK-NEXT:    ld r6, 0(r5)
350; CHECK-NEXT:    ldx r9, r5, r7
351; CHECK-NEXT:    ldx r10, r5, r8
352; CHECK-NEXT:    ldx r11, r5, r4
353; CHECK-NEXT:    addi r5, r5, 1
354; CHECK-NEXT:    mulld r6, r9, r6
355; CHECK-NEXT:    mulld r6, r6, r10
356; CHECK-NEXT:    maddld r3, r6, r11, r3
357; CHECK-NEXT:    bdnz .LBB3_2
358; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
359; CHECK-NEXT:    blr
360; CHECK-NEXT:  .LBB3_4:
361; CHECK-NEXT:    li r3, 0
362; CHECK-NEXT:    blr
363entry:
364  %mul = shl nsw i64 %offset, 1
365  %mul1 = shl nsw i64 %offset, 2
366  %mul3 = mul nsw i64 %offset, 7
367  %cmp44 = icmp sgt i64 %n, 0
368  br i1 %cmp44, label %for.body, label %for.cond.cleanup
369
370for.cond.cleanup:                                 ; preds = %for.body, %entry
371  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
372  ret i64 %sum.0.lcssa
373
374for.body:                                         ; preds = %entry, %for.body
375  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
376  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
377  %add.ptr8.idx = add i64 %i.045, %base1
378  %add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
379  %0 = bitcast i8* %add.ptr8 to i64*
380  %1 = load i64, i64* %0, align 8
381  %add.ptr9.idx = add i64 %add.ptr8.idx, %mul
382  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
383  %2 = bitcast i8* %add.ptr9 to i64*
384  %3 = load i64, i64* %2, align 8
385  %add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
386  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
387  %4 = bitcast i8* %add.ptr10 to i64*
388  %5 = load i64, i64* %4, align 8
389  %add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
390  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
391  %6 = bitcast i8* %add.ptr11 to i64*
392  %7 = load i64, i64* %6, align 8
393  %mul12 = mul i64 %3, %1
394  %mul13 = mul i64 %mul12, %5
395  %mul14 = mul i64 %mul13, %7
396  %add15 = add i64 %mul14, %sum.046
397  %inc = add nuw nsw i64 %i.045, 1
398  %exitcond.not = icmp eq i64 %inc, %n
399  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
400}
401
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + 3*offset
; 4: + 2*offset
; 5: + 1*offset
; 6: + 2*offset
;
; The diff between address 2 and address 1 is 1*offset, and this offset is reused between
; address 4 and address 5, but the diff between address 3 and address 2 (3*offset) is not
; the same as the diff between address 6 and address 5 (2*offset), so we cannot common
; chains for these addresses.
;
; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 5 * offset;
;   long long o4 = base1 + 7 * offset;
;   long long o5 = base1 + 8 * offset;
;   long long o6 = base1 + 10 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   char *p5 = p + o5;
;   char *p6 = p + o6;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     unsigned long x5 = *(unsigned long *)(p5 + i);
;     unsigned long x6 = *(unsigned long *)(p6 + i);
;     sum += x1 * x2 * x3 * x4 * x5 * x6;
;   }
;   return sum;
; }
;
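; For illustration only, the two candidate three-element chains differ in their last
; offset and are therefore rejected:
; 1: base: base1 + offset, offsets: (0, offset, 4*offset)
; 2: base: base1 + 7*offset, offsets: (0, offset, 3*offset)
;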
440define i64 @not_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
441; CHECK-LABEL: not_same_offset_fail:
442; CHECK:       # %bb.0: # %entry
443; CHECK-NEXT:    cmpdi r6, 1
444; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
445; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
446; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
447; CHECK-NEXT:    blt cr0, .LBB4_3
448; CHECK-NEXT:  # %bb.1: # %for.body.preheader
449; CHECK-NEXT:    mulli r11, r4, 10
450; CHECK-NEXT:    sldi r8, r4, 2
451; CHECK-NEXT:    add r5, r3, r5
452; CHECK-NEXT:    li r3, 0
453; CHECK-NEXT:    add r8, r4, r8
454; CHECK-NEXT:    sldi r9, r4, 3
455; CHECK-NEXT:    mtctr r6
456; CHECK-NEXT:    sldi r7, r4, 1
457; CHECK-NEXT:    sub r10, r9, r4
458; CHECK-NEXT:    .p2align 4
459; CHECK-NEXT:  .LBB4_2: # %for.body
460; CHECK-NEXT:    #
461; CHECK-NEXT:    ldx r6, r5, r4
462; CHECK-NEXT:    ldx r12, r5, r7
463; CHECK-NEXT:    ldx r0, r5, r8
464; CHECK-NEXT:    ldx r30, r5, r10
465; CHECK-NEXT:    mulld r6, r12, r6
466; CHECK-NEXT:    ldx r29, r5, r9
467; CHECK-NEXT:    ldx r28, r5, r11
468; CHECK-NEXT:    addi r5, r5, 1
469; CHECK-NEXT:    mulld r6, r6, r0
470; CHECK-NEXT:    mulld r6, r6, r30
471; CHECK-NEXT:    mulld r6, r6, r29
472; CHECK-NEXT:    maddld r3, r6, r28, r3
473; CHECK-NEXT:    bdnz .LBB4_2
474; CHECK-NEXT:    b .LBB4_4
475; CHECK-NEXT:  .LBB4_3:
476; CHECK-NEXT:    li r3, 0
477; CHECK-NEXT:  .LBB4_4: # %for.cond.cleanup
478; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
479; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
480; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
481; CHECK-NEXT:    blr
482entry:
483  %mul = shl nsw i64 %offset, 1
484  %mul2 = mul nsw i64 %offset, 5
485  %mul4 = mul nsw i64 %offset, 7
486  %mul6 = shl nsw i64 %offset, 3
487  %mul8 = mul nsw i64 %offset, 10
488  %cmp70 = icmp sgt i64 %n, 0
489  br i1 %cmp70, label %for.body, label %for.cond.cleanup
490
491for.cond.cleanup:                                 ; preds = %for.body, %entry
492  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
493  ret i64 %sum.0.lcssa
494
495for.body:                                         ; preds = %entry, %for.body
496  %sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
497  %i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
498  %add = add i64 %i.071, %base1
499  %add.ptr15.idx = add i64 %add, %offset
500  %add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
501  %0 = bitcast i8* %add.ptr15 to i64*
502  %1 = load i64, i64* %0, align 8
503  %add.ptr16.idx = add i64 %add, %mul
504  %add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
505  %2 = bitcast i8* %add.ptr16 to i64*
506  %3 = load i64, i64* %2, align 8
507  %add.ptr17.idx = add i64 %add, %mul2
508  %add.ptr17 = getelementptr inbounds i8, i8* %p, i64 %add.ptr17.idx
509  %4 = bitcast i8* %add.ptr17 to i64*
510  %5 = load i64, i64* %4, align 8
511  %add.ptr18.idx = add i64 %add, %mul4
512  %add.ptr18 = getelementptr inbounds i8, i8* %p, i64 %add.ptr18.idx
513  %6 = bitcast i8* %add.ptr18 to i64*
514  %7 = load i64, i64* %6, align 8
515  %add.ptr19.idx = add i64 %add, %mul6
516  %add.ptr19 = getelementptr inbounds i8, i8* %p, i64 %add.ptr19.idx
517  %8 = bitcast i8* %add.ptr19 to i64*
518  %9 = load i64, i64* %8, align 8
519  %add.ptr20.idx = add i64 %add, %mul8
520  %add.ptr20 = getelementptr inbounds i8, i8* %p, i64 %add.ptr20.idx
521  %10 = bitcast i8* %add.ptr20 to i64*
522  %11 = load i64, i64* %10, align 8
523  %mul21 = mul i64 %3, %1
524  %mul22 = mul i64 %mul21, %5
525  %mul23 = mul i64 %mul22, %7
526  %mul24 = mul i64 %mul23, %9
527  %mul25 = mul i64 %mul24, %11
528  %add26 = add i64 %mul25, %sum.072
529  %inc = add nuw nsw i64 %i.071, 1
530  %exitcond.not = icmp eq i64 %inc, %n
531  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
532}
533
; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base1 + 4*offset, offsets: (0, 2*offset)
;
; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 3 * offset;
;   long long o3 = base1 + 4 * offset;
;   long long o4 = base1 + 6 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
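; In the generated code below, r7 and r5 hold the two commoned chain bases
; (p + base1 + offset and p + base1 + 4*offset) and r4 holds the single reused
; offset 2*offset, so each iteration only needs two pointer increments.
;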
564define i64 @two_chain_different_offsets_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
565; CHECK-LABEL: two_chain_different_offsets_succ:
566; CHECK:       # %bb.0: # %entry
567; CHECK-NEXT:    cmpdi r6, 1
568; CHECK-NEXT:    blt cr0, .LBB5_4
569; CHECK-NEXT:  # %bb.1: # %for.body.preheader
570; CHECK-NEXT:    sldi r8, r4, 2
571; CHECK-NEXT:    add r7, r5, r4
572; CHECK-NEXT:    mtctr r6
573; CHECK-NEXT:    add r5, r5, r8
574; CHECK-NEXT:    add r7, r3, r7
575; CHECK-NEXT:    sldi r4, r4, 1
576; CHECK-NEXT:    add r5, r3, r5
577; CHECK-NEXT:    li r3, 0
578; CHECK-NEXT:    .p2align 4
579; CHECK-NEXT:  .LBB5_2: # %for.body
580; CHECK-NEXT:    #
581; CHECK-NEXT:    ld r6, 0(r7)
582; CHECK-NEXT:    ldx r8, r7, r4
583; CHECK-NEXT:    ld r9, 0(r5)
584; CHECK-NEXT:    ldx r10, r5, r4
585; CHECK-NEXT:    addi r7, r7, 1
586; CHECK-NEXT:    addi r5, r5, 1
587; CHECK-NEXT:    mulld r6, r8, r6
588; CHECK-NEXT:    mulld r6, r6, r9
589; CHECK-NEXT:    maddld r3, r6, r10, r3
590; CHECK-NEXT:    bdnz .LBB5_2
591; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
592; CHECK-NEXT:    blr
593; CHECK-NEXT:  .LBB5_4:
594; CHECK-NEXT:    li r3, 0
595; CHECK-NEXT:    blr
596entry:
597  %mul = mul nsw i64 %offset, 3
598  %mul2 = shl nsw i64 %offset, 2
599  %mul4 = mul nsw i64 %offset, 6
600  %cmp46 = icmp sgt i64 %n, 0
601  br i1 %cmp46, label %for.body, label %for.cond.cleanup
602
603for.cond.cleanup:                                 ; preds = %for.body, %entry
604  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
605  ret i64 %sum.0.lcssa
606
607for.body:                                         ; preds = %entry, %for.body
608  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
609  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
610  %add = add i64 %i.047, %base1
611  %add.ptr9.idx = add i64 %add, %offset
612  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
613  %0 = bitcast i8* %add.ptr9 to i64*
614  %1 = load i64, i64* %0, align 8
615  %add.ptr10.idx = add i64 %add, %mul
616  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
617  %2 = bitcast i8* %add.ptr10 to i64*
618  %3 = load i64, i64* %2, align 8
619  %add.ptr11.idx = add i64 %add, %mul2
620  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
621  %4 = bitcast i8* %add.ptr11 to i64*
622  %5 = load i64, i64* %4, align 8
623  %add.ptr12.idx = add i64 %add, %mul4
624  %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
625  %6 = bitcast i8* %add.ptr12 to i64*
626  %7 = load i64, i64* %6, align 8
627  %mul13 = mul i64 %3, %1
628  %mul14 = mul i64 %mul13, %5
629  %mul15 = mul i64 %mul14, %7
630  %add16 = add i64 %mul15, %sum.048
631  %inc = add nuw nsw i64 %i.047, 1
632  %exitcond.not = icmp eq i64 %inc, %n
633  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
634}
635
; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + base2 - base1 - 2*offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base2 + offset, offsets: (0, 2*offset)
;
; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 3 * offset;
;   long long o3 = base2 + offset;
;   long long o4 = base2 + 3 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
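; In the generated code below, r5 and r6 hold the two commoned chain bases
; (p + base1 + offset and p + base2 + offset) and r4 holds the shared reused
; offset 2*offset, so each chain advances with a single addi per iteration.
;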
666define i64 @two_chain_two_bases_succ(i8* %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
667; CHECK-LABEL: two_chain_two_bases_succ:
668; CHECK:       # %bb.0: # %entry
669; CHECK-NEXT:    cmpdi r7, 1
670; CHECK-NEXT:    blt cr0, .LBB6_4
671; CHECK-NEXT:  # %bb.1: # %for.body.preheader
672; CHECK-NEXT:    add r6, r6, r4
673; CHECK-NEXT:    add r5, r5, r4
674; CHECK-NEXT:    mtctr r7
675; CHECK-NEXT:    sldi r4, r4, 1
676; CHECK-NEXT:    add r5, r3, r5
677; CHECK-NEXT:    add r6, r3, r6
678; CHECK-NEXT:    li r3, 0
679; CHECK-NEXT:    .p2align 4
680; CHECK-NEXT:  .LBB6_2: # %for.body
681; CHECK-NEXT:    #
682; CHECK-NEXT:    ld r7, 0(r5)
683; CHECK-NEXT:    ldx r8, r5, r4
684; CHECK-NEXT:    ld r9, 0(r6)
685; CHECK-NEXT:    ldx r10, r6, r4
686; CHECK-NEXT:    addi r5, r5, 1
687; CHECK-NEXT:    addi r6, r6, 1
688; CHECK-NEXT:    mulld r7, r8, r7
689; CHECK-NEXT:    mulld r7, r7, r9
690; CHECK-NEXT:    maddld r3, r7, r10, r3
691; CHECK-NEXT:    bdnz .LBB6_2
692; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
693; CHECK-NEXT:    blr
694; CHECK-NEXT:  .LBB6_4:
695; CHECK-NEXT:    li r3, 0
696; CHECK-NEXT:    blr
697entry:
698  %mul = mul nsw i64 %offset, 3
699  %cmp44 = icmp sgt i64 %n, 0
700  br i1 %cmp44, label %for.body, label %for.cond.cleanup
701
702for.cond.cleanup:                                 ; preds = %for.body, %entry
703  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
704  ret i64 %sum.0.lcssa
705
706for.body:                                         ; preds = %entry, %for.body
707  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
708  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
709  %add = add i64 %i.045, %base1
710  %add.ptr8.idx = add i64 %add, %offset
711  %add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
712  %0 = bitcast i8* %add.ptr8 to i64*
713  %1 = load i64, i64* %0, align 8
714  %add1 = add i64 %i.045, %mul
715  %add.ptr9.idx = add i64 %add1, %base1
716  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
717  %2 = bitcast i8* %add.ptr9 to i64*
718  %3 = load i64, i64* %2, align 8
719  %add2 = add i64 %i.045, %base2
720  %add.ptr10.idx = add i64 %add2, %offset
721  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
722  %4 = bitcast i8* %add.ptr10 to i64*
723  %5 = load i64, i64* %4, align 8
724  %add.ptr11.idx = add i64 %add2, %mul
725  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
726  %6 = bitcast i8* %add.ptr11 to i64*
727  %7 = load i64, i64* %6, align 8
728  %mul12 = mul i64 %3, %1
729  %mul13 = mul i64 %mul12, %5
730  %mul14 = mul i64 %mul13, %7
731  %add15 = add i64 %mul14, %sum.046
732  %inc = add nuw nsw i64 %i.045, 1
733  %exitcond.not = icmp eq i64 %inc, %n
734  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
735}
;
; Check that chain commoning can reduce register pressure and save register spills and reloads.
;
; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
;   inc = inc4;
; #pragma unroll 4
;   for (long long i = 0; i < 4 * m; i++) {
;     output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
;     output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
;     output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
;     inc = inc + inc4;
;   }
;   return 0;
; }
;
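; A rough count from the source above: the 4x-unrolled body touches 3 arrays x
; 3 index sums x 4 unrolled iterations = 36 addresses per trip through the loop;
; commoning those computations into a small set of chain bases is what helps the
; generated loop below avoid spill and reload code inside its body.
;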
751define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
752; CHECK-LABEL: spill_reduce_succ:
753; CHECK:       # %bb.0: # %entry
754; CHECK-NEXT:    cmpdi r6, 1
755; CHECK-NEXT:    std r14, -144(r1) # 8-byte Folded Spill
756; CHECK-NEXT:    std r15, -136(r1) # 8-byte Folded Spill
757; CHECK-NEXT:    std r16, -128(r1) # 8-byte Folded Spill
758; CHECK-NEXT:    std r17, -120(r1) # 8-byte Folded Spill
759; CHECK-NEXT:    std r18, -112(r1) # 8-byte Folded Spill
760; CHECK-NEXT:    std r19, -104(r1) # 8-byte Folded Spill
761; CHECK-NEXT:    std r20, -96(r1) # 8-byte Folded Spill
762; CHECK-NEXT:    std r21, -88(r1) # 8-byte Folded Spill
763; CHECK-NEXT:    std r22, -80(r1) # 8-byte Folded Spill
764; CHECK-NEXT:    std r23, -72(r1) # 8-byte Folded Spill
765; CHECK-NEXT:    std r24, -64(r1) # 8-byte Folded Spill
766; CHECK-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
767; CHECK-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
768; CHECK-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
769; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
770; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
771; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
772; CHECK-NEXT:    std r31, -8(r1) # 8-byte Folded Spill
773; CHECK-NEXT:    std r2, -152(r1) # 8-byte Folded Spill
774; CHECK-NEXT:    std r9, -160(r1) # 8-byte Folded Spill
775; CHECK-NEXT:    std r8, -176(r1) # 8-byte Folded Spill
776; CHECK-NEXT:    std r7, -168(r1) # 8-byte Folded Spill
777; CHECK-NEXT:    blt cr0, .LBB7_7
778; CHECK-NEXT:  # %bb.1: # %for.body.preheader
779; CHECK-NEXT:    sldi r6, r6, 2
780; CHECK-NEXT:    li r7, 1
781; CHECK-NEXT:    mr r12, r10
782; CHECK-NEXT:    cmpdi r6, 1
783; CHECK-NEXT:    iselgt r7, r6, r7
784; CHECK-NEXT:    addi r8, r7, -1
785; CHECK-NEXT:    clrldi r6, r7, 63
786; CHECK-NEXT:    cmpldi r8, 3
787; CHECK-NEXT:    blt cr0, .LBB7_4
788; CHECK-NEXT:  # %bb.2: # %for.body.preheader.new
789; CHECK-NEXT:    rldicl r7, r7, 62, 2
790; CHECK-NEXT:    sldi r10, r12, 2
791; CHECK-NEXT:    ld r2, -168(r1) # 8-byte Folded Reload
792; CHECK-NEXT:    rldicl r7, r7, 2, 1
793; CHECK-NEXT:    std r7, -184(r1) # 8-byte Folded Spill
794; CHECK-NEXT:    ld r7, -160(r1) # 8-byte Folded Reload
795; CHECK-NEXT:    add r8, r7, r10
796; CHECK-NEXT:    mr r22, r7
797; CHECK-NEXT:    mr r7, r4
798; CHECK-NEXT:    mr r4, r3
799; CHECK-NEXT:    ld r3, -176(r1) # 8-byte Folded Reload
800; CHECK-NEXT:    sldi r8, r8, 3
801; CHECK-NEXT:    add r9, r5, r8
802; CHECK-NEXT:    add r8, r3, r10
803; CHECK-NEXT:    add r10, r2, r10
804; CHECK-NEXT:    sldi r10, r10, 3
805; CHECK-NEXT:    sldi r8, r8, 3
806; CHECK-NEXT:    add r30, r5, r10
807; CHECK-NEXT:    add r29, r7, r10
808; CHECK-NEXT:    add r28, r4, r10
809; CHECK-NEXT:    sldi r10, r12, 1
810; CHECK-NEXT:    add r8, r5, r8
811; CHECK-NEXT:    add r11, r12, r10
812; CHECK-NEXT:    add r0, r22, r11
813; CHECK-NEXT:    sldi r0, r0, 3
814; CHECK-NEXT:    add r27, r5, r0
815; CHECK-NEXT:    add r0, r3, r11
816; CHECK-NEXT:    add r11, r2, r11
817; CHECK-NEXT:    sldi r11, r11, 3
818; CHECK-NEXT:    sldi r0, r0, 3
819; CHECK-NEXT:    add r25, r5, r11
820; CHECK-NEXT:    add r24, r7, r11
821; CHECK-NEXT:    add r23, r4, r11
822; CHECK-NEXT:    add r11, r22, r10
823; CHECK-NEXT:    add r26, r5, r0
824; CHECK-NEXT:    mr r0, r22
825; CHECK-NEXT:    sldi r11, r11, 3
826; CHECK-NEXT:    add r22, r5, r11
827; CHECK-NEXT:    add r11, r3, r10
828; CHECK-NEXT:    add r10, r2, r10
829; CHECK-NEXT:    sldi r10, r10, 3
830; CHECK-NEXT:    sldi r11, r11, 3
831; CHECK-NEXT:    add r20, r5, r10
832; CHECK-NEXT:    add r19, r7, r10
833; CHECK-NEXT:    add r18, r4, r10
834; CHECK-NEXT:    add r10, r12, r0
835; CHECK-NEXT:    add r21, r5, r11
836; CHECK-NEXT:    sldi r11, r2, 3
837; CHECK-NEXT:    sldi r10, r10, 3
838; CHECK-NEXT:    add r17, r5, r10
839; CHECK-NEXT:    add r10, r12, r3
840; CHECK-NEXT:    sldi r10, r10, 3
841; CHECK-NEXT:    add r16, r5, r10
842; CHECK-NEXT:    add r10, r12, r2
843; CHECK-NEXT:    sldi r10, r10, 3
844; CHECK-NEXT:    add r15, r5, r10
845; CHECK-NEXT:    add r14, r7, r10
846; CHECK-NEXT:    add r31, r4, r10
847; CHECK-NEXT:    sldi r10, r3, 3
848; CHECK-NEXT:    mr r3, r4
849; CHECK-NEXT:    mr r4, r7
850; CHECK-NEXT:    ld r7, -160(r1) # 8-byte Folded Reload
851; CHECK-NEXT:    sub r0, r10, r11
852; CHECK-NEXT:    sldi r10, r7, 3
853; CHECK-NEXT:    ld r7, -184(r1) # 8-byte Folded Reload
854; CHECK-NEXT:    sub r2, r10, r11
855; CHECK-NEXT:    li r11, 0
856; CHECK-NEXT:    mr r10, r12
857; CHECK-NEXT:    addi r7, r7, -4
858; CHECK-NEXT:    rldicl r7, r7, 62, 2
859; CHECK-NEXT:    addi r7, r7, 1
860; CHECK-NEXT:    mtctr r7
861; CHECK-NEXT:    sldi r7, r12, 5
862; CHECK-NEXT:    .p2align 4
863; CHECK-NEXT:  .LBB7_3: # %for.body
864; CHECK-NEXT:    #
865; CHECK-NEXT:    lfd f0, 0(r31)
866; CHECK-NEXT:    lfd f1, 0(r14)
867; CHECK-NEXT:    add r10, r10, r12
868; CHECK-NEXT:    add r10, r10, r12
869; CHECK-NEXT:    xsmuldp f0, f0, f1
870; CHECK-NEXT:    lfd f1, 0(r15)
871; CHECK-NEXT:    add r10, r10, r12
872; CHECK-NEXT:    add r10, r10, r12
873; CHECK-NEXT:    xsadddp f0, f1, f0
874; CHECK-NEXT:    stfd f0, 0(r15)
875; CHECK-NEXT:    add r15, r15, r7
876; CHECK-NEXT:    lfdx f0, r31, r0
877; CHECK-NEXT:    lfdx f1, r14, r0
878; CHECK-NEXT:    xsmuldp f0, f0, f1
879; CHECK-NEXT:    lfdx f1, r16, r11
880; CHECK-NEXT:    xsadddp f0, f1, f0
881; CHECK-NEXT:    stfdx f0, r16, r11
882; CHECK-NEXT:    lfdx f0, r31, r2
883; CHECK-NEXT:    lfdx f1, r14, r2
884; CHECK-NEXT:    add r31, r31, r7
885; CHECK-NEXT:    add r14, r14, r7
886; CHECK-NEXT:    xsmuldp f0, f0, f1
887; CHECK-NEXT:    lfdx f1, r17, r11
888; CHECK-NEXT:    xsadddp f0, f1, f0
889; CHECK-NEXT:    stfdx f0, r17, r11
890; CHECK-NEXT:    lfd f0, 0(r18)
891; CHECK-NEXT:    lfd f1, 0(r19)
892; CHECK-NEXT:    xsmuldp f0, f0, f1
893; CHECK-NEXT:    lfdx f1, r20, r11
894; CHECK-NEXT:    xsadddp f0, f1, f0
895; CHECK-NEXT:    stfdx f0, r20, r11
896; CHECK-NEXT:    lfdx f0, r18, r0
897; CHECK-NEXT:    lfdx f1, r19, r0
898; CHECK-NEXT:    xsmuldp f0, f0, f1
899; CHECK-NEXT:    lfdx f1, r21, r11
900; CHECK-NEXT:    xsadddp f0, f1, f0
901; CHECK-NEXT:    stfdx f0, r21, r11
902; CHECK-NEXT:    lfdx f0, r18, r2
903; CHECK-NEXT:    lfdx f1, r19, r2
904; CHECK-NEXT:    add r18, r18, r7
905; CHECK-NEXT:    add r19, r19, r7
906; CHECK-NEXT:    xsmuldp f0, f0, f1
907; CHECK-NEXT:    lfdx f1, r22, r11
908; CHECK-NEXT:    xsadddp f0, f1, f0
909; CHECK-NEXT:    stfdx f0, r22, r11
910; CHECK-NEXT:    lfd f0, 0(r23)
911; CHECK-NEXT:    lfd f1, 0(r24)
912; CHECK-NEXT:    xsmuldp f0, f0, f1
913; CHECK-NEXT:    lfdx f1, r25, r11
914; CHECK-NEXT:    xsadddp f0, f1, f0
915; CHECK-NEXT:    stfdx f0, r25, r11
916; CHECK-NEXT:    lfdx f0, r23, r0
917; CHECK-NEXT:    lfdx f1, r24, r0
918; CHECK-NEXT:    xsmuldp f0, f0, f1
919; CHECK-NEXT:    lfdx f1, r26, r11
920; CHECK-NEXT:    xsadddp f0, f1, f0
921; CHECK-NEXT:    stfdx f0, r26, r11
922; CHECK-NEXT:    lfdx f0, r23, r2
923; CHECK-NEXT:    lfdx f1, r24, r2
924; CHECK-NEXT:    add r23, r23, r7
925; CHECK-NEXT:    add r24, r24, r7
926; CHECK-NEXT:    xsmuldp f0, f0, f1
927; CHECK-NEXT:    lfdx f1, r27, r11
928; CHECK-NEXT:    xsadddp f0, f1, f0
929; CHECK-NEXT:    stfdx f0, r27, r11
930; CHECK-NEXT:    lfd f0, 0(r28)
931; CHECK-NEXT:    lfd f1, 0(r29)
932; CHECK-NEXT:    xsmuldp f0, f0, f1
933; CHECK-NEXT:    lfdx f1, r30, r11
934; CHECK-NEXT:    xsadddp f0, f1, f0
935; CHECK-NEXT:    stfdx f0, r30, r11
936; CHECK-NEXT:    lfdx f0, r28, r0
937; CHECK-NEXT:    lfdx f1, r29, r0
938; CHECK-NEXT:    xsmuldp f0, f0, f1
939; CHECK-NEXT:    lfdx f1, r8, r11
940; CHECK-NEXT:    xsadddp f0, f1, f0
941; CHECK-NEXT:    stfdx f0, r8, r11
942; CHECK-NEXT:    lfdx f0, r28, r2
943; CHECK-NEXT:    lfdx f1, r29, r2
944; CHECK-NEXT:    add r28, r28, r7
945; CHECK-NEXT:    add r29, r29, r7
946; CHECK-NEXT:    xsmuldp f0, f0, f1
947; CHECK-NEXT:    lfdx f1, r9, r11
948; CHECK-NEXT:    xsadddp f0, f1, f0
949; CHECK-NEXT:    stfdx f0, r9, r11
950; CHECK-NEXT:    add r11, r11, r7
951; CHECK-NEXT:    bdnz .LBB7_3
952; CHECK-NEXT:  .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
953; CHECK-NEXT:    cmpldi r6, 0
954; CHECK-NEXT:    beq cr0, .LBB7_7
955; CHECK-NEXT:  # %bb.5: # %for.body.epil.preheader
956; CHECK-NEXT:    sldi r8, r12, 3
957; CHECK-NEXT:    ld r12, -176(r1) # 8-byte Folded Reload
958; CHECK-NEXT:    ld r7, -160(r1) # 8-byte Folded Reload
959; CHECK-NEXT:    add r12, r10, r12
960; CHECK-NEXT:    add r7, r10, r7
961; CHECK-NEXT:    sldi r0, r12, 3
962; CHECK-NEXT:    sldi r11, r7, 3
963; CHECK-NEXT:    add r12, r5, r0
964; CHECK-NEXT:    add r30, r4, r0
965; CHECK-NEXT:    add r29, r3, r0
966; CHECK-NEXT:    ld r0, -168(r1) # 8-byte Folded Reload
967; CHECK-NEXT:    add r7, r5, r11
968; CHECK-NEXT:    add r9, r4, r11
969; CHECK-NEXT:    add r11, r3, r11
970; CHECK-NEXT:    add r10, r10, r0
971; CHECK-NEXT:    sldi r10, r10, 3
972; CHECK-NEXT:    add r5, r5, r10
973; CHECK-NEXT:    add r4, r4, r10
974; CHECK-NEXT:    add r3, r3, r10
975; CHECK-NEXT:    li r10, 0
976; CHECK-NEXT:    .p2align 4
977; CHECK-NEXT:  .LBB7_6: # %for.body.epil
978; CHECK-NEXT:    #
979; CHECK-NEXT:    lfdx f0, r3, r10
980; CHECK-NEXT:    lfdx f1, r4, r10
981; CHECK-NEXT:    addi r6, r6, -1
982; CHECK-NEXT:    cmpldi r6, 0
983; CHECK-NEXT:    xsmuldp f0, f0, f1
984; CHECK-NEXT:    lfd f1, 0(r5)
985; CHECK-NEXT:    xsadddp f0, f1, f0
986; CHECK-NEXT:    stfd f0, 0(r5)
987; CHECK-NEXT:    add r5, r5, r8
988; CHECK-NEXT:    lfdx f0, r29, r10
989; CHECK-NEXT:    lfdx f1, r30, r10
990; CHECK-NEXT:    xsmuldp f0, f0, f1
991; CHECK-NEXT:    lfdx f1, r12, r10
992; CHECK-NEXT:    xsadddp f0, f1, f0
993; CHECK-NEXT:    stfdx f0, r12, r10
994; CHECK-NEXT:    lfdx f0, r11, r10
995; CHECK-NEXT:    lfdx f1, r9, r10
996; CHECK-NEXT:    xsmuldp f0, f0, f1
997; CHECK-NEXT:    lfdx f1, r7, r10
998; CHECK-NEXT:    xsadddp f0, f1, f0
999; CHECK-NEXT:    stfdx f0, r7, r10
1000; CHECK-NEXT:    add r10, r10, r8
1001; CHECK-NEXT:    bne cr0, .LBB7_6
1002; CHECK-NEXT:  .LBB7_7: # %for.cond.cleanup
1003; CHECK-NEXT:    ld r2, -152(r1) # 8-byte Folded Reload
1004; CHECK-NEXT:    ld r31, -8(r1) # 8-byte Folded Reload
1005; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
1006; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
1007; CHECK-NEXT:    li r3, 0
1008; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
1009; CHECK-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
1010; CHECK-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
1011; CHECK-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
1012; CHECK-NEXT:    ld r24, -64(r1) # 8-byte Folded Reload
1013; CHECK-NEXT:    ld r23, -72(r1) # 8-byte Folded Reload
1014; CHECK-NEXT:    ld r22, -80(r1) # 8-byte Folded Reload
1015; CHECK-NEXT:    ld r21, -88(r1) # 8-byte Folded Reload
1016; CHECK-NEXT:    ld r20, -96(r1) # 8-byte Folded Reload
1017; CHECK-NEXT:    ld r19, -104(r1) # 8-byte Folded Reload
1018; CHECK-NEXT:    ld r18, -112(r1) # 8-byte Folded Reload
1019; CHECK-NEXT:    ld r17, -120(r1) # 8-byte Folded Reload
1020; CHECK-NEXT:    ld r16, -128(r1) # 8-byte Folded Reload
1021; CHECK-NEXT:    ld r15, -136(r1) # 8-byte Folded Reload
1022; CHECK-NEXT:    ld r14, -144(r1) # 8-byte Folded Reload
1023; CHECK-NEXT:    blr
1024entry:
1025  %cmp49 = icmp sgt i64 %m, 0
1026  br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup
1027
1028for.body.preheader:                               ; preds = %entry
1029  %0 = shl i64 %m, 2
1030  %smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
1031  %1 = add nsw i64 %smax52, -1
1032  %xtraiter = and i64 %smax52, 1
1033  %2 = icmp ult i64 %1, 3
1034  br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1035
1036for.body.preheader.new:                           ; preds = %for.body.preheader
1037  %unroll_iter = and i64 %smax52, 9223372036854775804
1038  br label %for.body
1039
1040for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
1041  %inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
1042  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
1043  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
1044
1045for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1046  %inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1047  %epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1048  %add.epil = add nsw i64 %inc.addr.050.epil, %inc1
1049  %arrayidx.epil = getelementptr inbounds double, double* %input1, i64 %add.epil
1050  %3 = load double, double* %arrayidx.epil, align 8
1051  %arrayidx2.epil = getelementptr inbounds double, double* %input2, i64 %add.epil
1052  %4 = load double, double* %arrayidx2.epil, align 8
1053  %mul3.epil = fmul double %3, %4
1054  %arrayidx5.epil = getelementptr inbounds double, double* %output, i64 %add.epil
1055  %5 = load double, double* %arrayidx5.epil, align 8
1056  %add6.epil = fadd double %5, %mul3.epil
1057  store double %add6.epil, double* %arrayidx5.epil, align 8
1058  %add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
1059  %arrayidx8.epil = getelementptr inbounds double, double* %input1, i64 %add7.epil
1060  %6 = load double, double* %arrayidx8.epil, align 8
1061  %arrayidx10.epil = getelementptr inbounds double, double* %input2, i64 %add7.epil
1062  %7 = load double, double* %arrayidx10.epil, align 8
1063  %mul11.epil = fmul double %6, %7
1064  %arrayidx13.epil = getelementptr inbounds double, double* %output, i64 %add7.epil
1065  %8 = load double, double* %arrayidx13.epil, align 8
1066  %add14.epil = fadd double %8, %mul11.epil
1067  store double %add14.epil, double* %arrayidx13.epil, align 8
1068  %add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
1069  %arrayidx16.epil = getelementptr inbounds double, double* %input1, i64 %add15.epil
1070  %9 = load double, double* %arrayidx16.epil, align 8
1071  %arrayidx18.epil = getelementptr inbounds double, double* %input2, i64 %add15.epil
1072  %10 = load double, double* %arrayidx18.epil, align 8
1073  %mul19.epil = fmul double %9, %10
1074  %arrayidx21.epil = getelementptr inbounds double, double* %output, i64 %add15.epil
1075  %11 = load double, double* %arrayidx21.epil, align 8
1076  %add22.epil = fadd double %11, %mul19.epil
1077  store double %add22.epil, double* %arrayidx21.epil, align 8
1078  %add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
1079  %epil.iter.sub = add nsw i64 %epil.iter, -1
1080  %epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
1081  br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil
1082
1083for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
1084  ret i32 0
1085
1086for.body:                                         ; preds = %for.body, %for.body.preheader.new
1087  %inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
1088  %niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1089  %add = add nsw i64 %inc.addr.050, %inc1
1090  %arrayidx = getelementptr inbounds double, double* %input1, i64 %add
1091  %12 = load double, double* %arrayidx, align 8
1092  %arrayidx2 = getelementptr inbounds double, double* %input2, i64 %add
1093  %13 = load double, double* %arrayidx2, align 8
1094  %mul3 = fmul double %12, %13
1095  %arrayidx5 = getelementptr inbounds double, double* %output, i64 %add
1096  %14 = load double, double* %arrayidx5, align 8
1097  %add6 = fadd double %14, %mul3
1098  store double %add6, double* %arrayidx5, align 8
1099  %add7 = add nsw i64 %inc.addr.050, %inc2
1100  %arrayidx8 = getelementptr inbounds double, double* %input1, i64 %add7
1101  %15 = load double, double* %arrayidx8, align 8
1102  %arrayidx10 = getelementptr inbounds double, double* %input2, i64 %add7
1103  %16 = load double, double* %arrayidx10, align 8
1104  %mul11 = fmul double %15, %16
1105  %arrayidx13 = getelementptr inbounds double, double* %output, i64 %add7
1106  %17 = load double, double* %arrayidx13, align 8
1107  %add14 = fadd double %17, %mul11
1108  store double %add14, double* %arrayidx13, align 8
1109  %add15 = add nsw i64 %inc.addr.050, %inc3
1110  %arrayidx16 = getelementptr inbounds double, double* %input1, i64 %add15
1111  %18 = load double, double* %arrayidx16, align 8
1112  %arrayidx18 = getelementptr inbounds double, double* %input2, i64 %add15
1113  %19 = load double, double* %arrayidx18, align 8
1114  %mul19 = fmul double %18, %19
1115  %arrayidx21 = getelementptr inbounds double, double* %output, i64 %add15
1116  %20 = load double, double* %arrayidx21, align 8
1117  %add22 = fadd double %20, %mul19
1118  store double %add22, double* %arrayidx21, align 8
1119  %add23 = add nsw i64 %inc.addr.050, %inc4
1120  %add.1 = add nsw i64 %add23, %inc1
1121  %arrayidx.1 = getelementptr inbounds double, double* %input1, i64 %add.1
1122  %21 = load double, double* %arrayidx.1, align 8
1123  %arrayidx2.1 = getelementptr inbounds double, double* %input2, i64 %add.1
1124  %22 = load double, double* %arrayidx2.1, align 8
1125  %mul3.1 = fmul double %21, %22
1126  %arrayidx5.1 = getelementptr inbounds double, double* %output, i64 %add.1
1127  %23 = load double, double* %arrayidx5.1, align 8
1128  %add6.1 = fadd double %23, %mul3.1
1129  store double %add6.1, double* %arrayidx5.1, align 8
1130  %add7.1 = add nsw i64 %add23, %inc2
1131  %arrayidx8.1 = getelementptr inbounds double, double* %input1, i64 %add7.1
1132  %24 = load double, double* %arrayidx8.1, align 8
1133  %arrayidx10.1 = getelementptr inbounds double, double* %input2, i64 %add7.1
1134  %25 = load double, double* %arrayidx10.1, align 8
1135  %mul11.1 = fmul double %24, %25
1136  %arrayidx13.1 = getelementptr inbounds double, double* %output, i64 %add7.1
1137  %26 = load double, double* %arrayidx13.1, align 8
1138  %add14.1 = fadd double %26, %mul11.1
1139  store double %add14.1, double* %arrayidx13.1, align 8
1140  %add15.1 = add nsw i64 %add23, %inc3
1141  %arrayidx16.1 = getelementptr inbounds double, double* %input1, i64 %add15.1
1142  %27 = load double, double* %arrayidx16.1, align 8
1143  %arrayidx18.1 = getelementptr inbounds double, double* %input2, i64 %add15.1
1144  %28 = load double, double* %arrayidx18.1, align 8
1145  %mul19.1 = fmul double %27, %28
1146  %arrayidx21.1 = getelementptr inbounds double, double* %output, i64 %add15.1
1147  %29 = load double, double* %arrayidx21.1, align 8
1148  %add22.1 = fadd double %29, %mul19.1
1149  store double %add22.1, double* %arrayidx21.1, align 8
1150  %add23.1 = add nsw i64 %add23, %inc4
1151  %add.2 = add nsw i64 %add23.1, %inc1
1152  %arrayidx.2 = getelementptr inbounds double, double* %input1, i64 %add.2
1153  %30 = load double, double* %arrayidx.2, align 8
1154  %arrayidx2.2 = getelementptr inbounds double, double* %input2, i64 %add.2
1155  %31 = load double, double* %arrayidx2.2, align 8
1156  %mul3.2 = fmul double %30, %31
1157  %arrayidx5.2 = getelementptr inbounds double, double* %output, i64 %add.2
1158  %32 = load double, double* %arrayidx5.2, align 8
1159  %add6.2 = fadd double %32, %mul3.2
1160  store double %add6.2, double* %arrayidx5.2, align 8
1161  %add7.2 = add nsw i64 %add23.1, %inc2
1162  %arrayidx8.2 = getelementptr inbounds double, double* %input1, i64 %add7.2
1163  %33 = load double, double* %arrayidx8.2, align 8
1164  %arrayidx10.2 = getelementptr inbounds double, double* %input2, i64 %add7.2
1165  %34 = load double, double* %arrayidx10.2, align 8
1166  %mul11.2 = fmul double %33, %34
1167  %arrayidx13.2 = getelementptr inbounds double, double* %output, i64 %add7.2
1168  %35 = load double, double* %arrayidx13.2, align 8
1169  %add14.2 = fadd double %35, %mul11.2
1170  store double %add14.2, double* %arrayidx13.2, align 8
1171  %add15.2 = add nsw i64 %add23.1, %inc3
1172  %arrayidx16.2 = getelementptr inbounds double, double* %input1, i64 %add15.2
1173  %36 = load double, double* %arrayidx16.2, align 8
1174  %arrayidx18.2 = getelementptr inbounds double, double* %input2, i64 %add15.2
1175  %37 = load double, double* %arrayidx18.2, align 8
1176  %mul19.2 = fmul double %36, %37
1177  %arrayidx21.2 = getelementptr inbounds double, double* %output, i64 %add15.2
1178  %38 = load double, double* %arrayidx21.2, align 8
1179  %add22.2 = fadd double %38, %mul19.2
1180  store double %add22.2, double* %arrayidx21.2, align 8
1181  %add23.2 = add nsw i64 %add23.1, %inc4
1182  %add.3 = add nsw i64 %add23.2, %inc1
1183  %arrayidx.3 = getelementptr inbounds double, double* %input1, i64 %add.3
1184  %39 = load double, double* %arrayidx.3, align 8
1185  %arrayidx2.3 = getelementptr inbounds double, double* %input2, i64 %add.3
1186  %40 = load double, double* %arrayidx2.3, align 8
1187  %mul3.3 = fmul double %39, %40
1188  %arrayidx5.3 = getelementptr inbounds double, double* %output, i64 %add.3
1189  %41 = load double, double* %arrayidx5.3, align 8
1190  %add6.3 = fadd double %41, %mul3.3
1191  store double %add6.3, double* %arrayidx5.3, align 8
1192  %add7.3 = add nsw i64 %add23.2, %inc2
1193  %arrayidx8.3 = getelementptr inbounds double, double* %input1, i64 %add7.3
1194  %42 = load double, double* %arrayidx8.3, align 8
1195  %arrayidx10.3 = getelementptr inbounds double, double* %input2, i64 %add7.3
1196  %43 = load double, double* %arrayidx10.3, align 8
1197  %mul11.3 = fmul double %42, %43
1198  %arrayidx13.3 = getelementptr inbounds double, double* %output, i64 %add7.3
1199  %44 = load double, double* %arrayidx13.3, align 8
1200  %add14.3 = fadd double %44, %mul11.3
1201  store double %add14.3, double* %arrayidx13.3, align 8
1202  %add15.3 = add nsw i64 %add23.2, %inc3
1203  %arrayidx16.3 = getelementptr inbounds double, double* %input1, i64 %add15.3
1204  %45 = load double, double* %arrayidx16.3, align 8
1205  %arrayidx18.3 = getelementptr inbounds double, double* %input2, i64 %add15.3
1206  %46 = load double, double* %arrayidx18.3, align 8
1207  %mul19.3 = fmul double %45, %46
1208  %arrayidx21.3 = getelementptr inbounds double, double* %output, i64 %add15.3
1209  %47 = load double, double* %arrayidx21.3, align 8
1210  %add22.3 = fadd double %47, %mul19.3
1211  store double %add22.3, double* %arrayidx21.3, align 8
1212  %add23.3 = add nsw i64 %add23.2, %inc4
1213  %niter.nsub.3 = add i64 %niter, -4
1214  %niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
1215  br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1216}
1217
1218declare i64 @llvm.smax.i64(i64, i64)
1219
1220