; Test the preferred loop alignment for PowerPC targets.
; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR

; Test the loop alignment again with the option -disable-ppc-innermost-loop-align32.
; RUN: llc -verify-machineinstrs -mcpu=a2 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32


; Argument record used by the functions below that take a %struct.parm*:
;   field 0 (i32*) - base pointer of the i32 buffer the loops store into
;   field 1 (i32)  - initial value of the outer-loop counter
;   field 2 (i32)  - initial value of the inner-loop index (sign-extended to i64)
%struct.parm = type { i32*, i32, i32 }

; Test the loop alignment when the innermost hot loop has more than 8 instructions.
;
; The function reads the three fields of %arg and then runs a two-level
; do-while nest:
;   * inner loop (%do.body3): four i32 stores per iteration, index
;     %indvars.iv is decremented by one and the loop exits after the
;     iteration in which it was 0;
;   * outer loop (%do.body / %do.end): repeats the inner loop, decrementing
;     %m.0 and exiting after the iteration in which it was 0.
; After the nest a single zero is stored at the initial inner index.
;
; Alignment directive expected between `mtctr` and `bdnz`
; (.p2align N requests 2^N-byte alignment):
;   -mcpu=a2 ............................................... .p2align 4
;   -mcpu=pwr8 / -mcpu=pwr9 ................................ .p2align 5
;   any of the above + -disable-ppc-innermost-loop-align32 . .p2align 4
define void @big_loop(%struct.parm* %arg) {
entry:
  ; Load buffer pointer (field 0), outer count (field 1), inner start (field 2).
  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
  br label %do.body

do.body:                                          ; preds = %do.end, %entry
  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
  br label %do.body3

; Innermost loop: stores to buf[iv+3], buf[iv+4], buf[iv+2] and buf[iv].
do.body3:                                         ; preds = %do.body3, %do.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
  %1 = add nsw i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
  %2 = add nsw i64 %indvars.iv, 3
  %3 = trunc i64 %1 to i32
  %4 = add nsw i64 %indvars.iv, 4
  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
  %5 = trunc i64 %2 to i32
  store i32 %5, i32* %arrayidx10, align 4
  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
  %6 = trunc i64 %4 to i32
  store i32 %6, i32* %arrayidx12, align 4
  store i32 %3, i32* %arrayidx, align 4
  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
  %7 = trunc i64 %indvars.iv to i32
  %8 = add i32 %7, 1
  store i32 %8, i32* %arrayidx21, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %9 = icmp eq i64 %indvars.iv, 0
  br i1 %9, label %do.end, label %do.body3

; Outer-loop latch.
do.end:                                           ; preds = %do.body3
  %dec24 = add nsw i32 %m.0, -1
  %tobool25 = icmp eq i32 %m.0, 0
  br i1 %tobool25, label %do.end26, label %do.body

do.end26:                                         ; preds = %do.end
  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
  store i32 0, i32* %arrayidx28, align 4
  ret void


; CHECK-LABEL: @big_loop
; CHECK: mtctr
; GENERIC: .p2align  4
; PWR: .p2align  5
; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
; CHECK: bdnz
}

; Test the loop alignment when the innermost hot loop has 5-8 instructions.
;
; Returns immediately when %m is 0. Otherwise runs a two-level loop nest:
;   * outer loop (%while.body3.lr.ph / %while.cond.loopexit): computes
;     %dec = %m.addr.041 - 1 and exits once %dec is 0;
;   * inner loop (%while.body3): three i32 stores per iteration at
;     s[n+2], s[n+1] and s[n]; %n.039 is decremented by one and the loop
;     exits once the decremented value is 0.
;
; Alignment directive expected between `mtctr` and `bdnz`
; (.p2align N requests 2^N-byte alignment):
;   -mcpu=a2 ................................................ .p2align 4
;   -mcpu=pwr8 / -mcpu=pwr9 ................................. .p2align 5
;   -mcpu=a2 + -disable-ppc-innermost-loop-align32 .......... .p2align 4
;   -mcpu=pwr8/pwr9 + -disable-ppc-innermost-loop-align32 ... .p2align 5
; Unlike the other functions in this file, the pwr8/pwr9 expectation does not
; change when the option is passed.
define void @general_loop(i32* %s, i64 %m) {
entry:
  %tobool40 = icmp eq i64 %m, 0
  br i1 %tobool40, label %while.end18, label %while.body3.lr.ph

; Outer-loop exit test.
while.cond.loopexit:                              ; preds = %while.body3
  %tobool = icmp eq i64 %dec, 0
  br i1 %tobool, label %while.end18, label %while.body3.lr.ph

; Outer-loop header: values computed here are invariant in the inner loop.
while.body3.lr.ph:                                ; preds = %entry, %while.cond.loopexit
  %m.addr.041 = phi i64 [ %dec, %while.cond.loopexit ], [ %m, %entry ]
  %dec = add nsw i64 %m.addr.041, -1
  %conv = trunc i64 %m.addr.041 to i32
  %conv11 = trunc i64 %dec to i32
  br label %while.body3

; Innermost loop.
while.body3:                                      ; preds = %while.body3.lr.ph, %while.body3
  %n.039 = phi i64 [ %m.addr.041, %while.body3.lr.ph ], [ %dec16, %while.body3 ]
  %inc = add nsw i64 %n.039, 1
  %arrayidx = getelementptr inbounds i32, i32* %s, i64 %n.039
  %inc5 = add nsw i64 %n.039, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %s, i64 %inc
  %sub = sub nsw i64 %dec, %inc5
  %conv7 = trunc i64 %sub to i32
  %arrayidx9 = getelementptr inbounds i32, i32* %s, i64 %inc5
  store i32 %conv7, i32* %arrayidx9, align 4
  store i32 %conv11, i32* %arrayidx6, align 4
  store i32 %conv, i32* %arrayidx, align 4
  %dec16 = add nsw i64 %n.039, -1
  %tobool2 = icmp eq i64 %dec16, 0
  br i1 %tobool2, label %while.cond.loopexit, label %while.body3

while.end18:                                      ; preds = %while.cond.loopexit, %entry
  ret void


; CHECK-LABEL: @general_loop
; CHECK: mtctr
; GENERIC: .p2align  4
; PWR: .p2align  5
; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  5
; CHECK: bdnz
}

; Test the small loop alignment when the innermost hot loop has fewer than 4 instructions.
;
; Two-level do-while nest whose counters are decremented through inline asm
; (`subi`), with the asm output tied to its input register (constraint "=r,0"):
;   * inner loop (%do.body1): decrements %n.0 and exits once the result is 0;
;   * outer loop (%do.body / %do.end): decrements %m.addr.0 and exits once the
;     result is 0.
;
; Alignment directive expected between `mr` and `bne`
; (.p2align N requests 2^N-byte alignment):
;   -mcpu=a2 ............................................... .p2align 4
;   -mcpu=pwr8 / -mcpu=pwr9 ................................ .p2align 5
;   any of the above + -disable-ppc-innermost-loop-align32 . .p2align 4
define void @small_loop(i64 %m) {
entry:
  br label %do.body

do.body:                                          ; preds = %do.end, %entry
  %m.addr.0 = phi i64 [ %m, %entry ], [ %1, %do.end ]
  br label %do.body1

; Innermost loop.
do.body1:                                         ; preds = %do.body1, %do.body
  %n.0 = phi i64 [ %m.addr.0, %do.body ], [ %0, %do.body1 ]
  %0 = tail call i64 asm "subi     $0,$0,1", "=r,0"(i64 %n.0)
  %tobool = icmp eq i64 %0, 0
  br i1 %tobool, label %do.end, label %do.body1

; Outer-loop latch.
do.end:                                           ; preds = %do.body1
  %1 = tail call i64 asm "subi     $1,$1,1", "=r,0"(i64 %m.addr.0)
  %tobool3 = icmp eq i64 %1, 0
  br i1 %tobool3, label %do.end4, label %do.body

do.end4:                                          ; preds = %do.end
  ret void


; CHECK-LABEL: @small_loop
; CHECK: mr
; GENERIC: .p2align  4
; PWR: .p2align  5
; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
; CHECK: bne
}

; Test the loop alignment when the innermost cold loop has more than 8 instructions.
;
; Same two-level do-while nest as @big_loop: the inner loop (%do.body3) does
; four i32 stores per iteration, the outer loop (%do.body / %do.end) repeats it.
;
; What the checks require between `mtctr` and `bdnz`:
;   * runs using the PWR prefix (pwr8/pwr9 without the disable option): one
;     `.p2align 5` must appear, and no second `.p2align 5` may follow it;
;   * all other runs: only the common checks apply, so no `.p2align 5` may
;     appear at all.
;
; NOTE(review): the IR below is identical to @big_loop and carries no
; branch-weight or other profile metadata, so nothing visible in this function
; marks the inner loop as cold - confirm the intended setup.
define void @big_loop_cold_innerloop(%struct.parm* %arg) {
entry:
  ; Load buffer pointer (field 0), outer count (field 1), inner start (field 2).
  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
  br label %do.body

do.body:                                          ; preds = %do.end, %entry
  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
  br label %do.body3

; Innermost loop: stores to buf[iv+3], buf[iv+4], buf[iv+2] and buf[iv].
do.body3:                                         ; preds = %do.body3, %do.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
  %1 = add nsw i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
  %2 = add nsw i64 %indvars.iv, 3
  %3 = trunc i64 %1 to i32
  %4 = add nsw i64 %indvars.iv, 4
  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
  %5 = trunc i64 %2 to i32
  store i32 %5, i32* %arrayidx10, align 4
  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
  %6 = trunc i64 %4 to i32
  store i32 %6, i32* %arrayidx12, align 4
  store i32 %3, i32* %arrayidx, align 4
  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
  %7 = trunc i64 %indvars.iv to i32
  %8 = add i32 %7, 1
  store i32 %8, i32* %arrayidx21, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %9 = icmp eq i64 %indvars.iv, 0
  br i1 %9, label %do.end, label %do.body3

; Outer-loop latch.
do.end:                                           ; preds = %do.body3
  %dec24 = add nsw i32 %m.0, -1
  %tobool25 = icmp eq i32 %m.0, 0
  br i1 %tobool25, label %do.end26, label %do.body

do.end26:                                         ; preds = %do.end
  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
  store i32 0, i32* %arrayidx28, align 4
  ret void


; CHECK-LABEL: @big_loop_cold_innerloop
; CHECK: mtctr
; PWR: .p2align 5
; CHECK-NOT: .p2align 5
; CHECK: bdnz
}