; Test the loop alignment.
;
; Every function below is compiled for several PowerPC CPUs and triples. The
; -mcpu=a2 invocations are matched with the GENERIC prefix and the
; -mcpu=pwr8/pwr9 invocations with the PWR prefix; both groups also share the
; plain CHECK prefix.
; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR

; Test the loop alignment and the option -disable-ppc-innermost-loop-align32.
; The same CPU/triple combinations are compiled again with the option added;
; those invocations use the two *-DISABLE-PPC-INNERMOST-LOOP-ALIGN32 prefixes.
; RUN: llc -verify-machineinstrs -mcpu=a2 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32


%struct.parm = type { i32*, i32, i32 }

; Test the loop alignment when the innermost hot loop has more than 8 instructions.
; Two nested do-loops: the outer one counts %m.0 down, the inner one (do.body3)
; performs four stores per iteration. The expectations at the end of the
; function want .p2align 5 between mtctr and bdnz only for the PWR invocations
; without the option, and .p2align 4 for every other invocation.
define void @big_loop(%struct.parm* %arg) {
entry:
  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
  br label %do.body

do.body:                                          ; preds = %do.end, %entry
  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
  br label %do.body3

do.body3:                                         ; preds = %do.body3, %do.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
  %1 = add nsw i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
  %2 = add nsw i64 %indvars.iv, 3
  %3 = trunc i64 %1 to i32
  %4 = add nsw i64 %indvars.iv, 4
  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
  %5 = trunc i64 %2 to i32
  store i32 %5, i32* %arrayidx10, align 4
  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
  %6 = trunc i64 %4 to i32
  store i32 %6, i32* %arrayidx12, align 4
  store i32 %3, i32* %arrayidx, align 4
  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
  %7 = trunc i64 %indvars.iv to i32
  %8 = add i32 %7, 1
  store i32 %8, i32* %arrayidx21, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %9 = icmp eq i64 %indvars.iv, 0
  br i1 %9, label %do.end, label %do.body3

do.end:                                           ; preds = %do.body3
  %dec24 = add nsw i32 %m.0, -1
  %tobool25 = icmp eq i32 %m.0, 0
  br i1 %tobool25, label %do.end26, label %do.body

do.end26:                                         ; preds = %do.end
  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
  store i32 0, i32* %arrayidx28, align 4
  ret void


; CHECK-LABEL: @big_loop
; CHECK: mtctr
; GENERIC: .p2align 4
; PWR: .p2align 5
; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
; CHECK: bdnz
}

; Test the loop alignment when the innermost hot loop has 5-8 instructions.
; Unlike @big_loop and @small_loop, the expectations here keep .p2align 5 for
; the PWR invocations even when -disable-ppc-innermost-loop-align32 is passed;
; only the a2 invocations expect .p2align 4.
define void @general_loop(i32* %s, i64 %m) {
entry:
  %tobool40 = icmp eq i64 %m, 0
  br i1 %tobool40, label %while.end18, label %while.body3.lr.ph

while.cond.loopexit:                              ; preds = %while.body3
  %tobool = icmp eq i64 %dec, 0
  br i1 %tobool, label %while.end18, label %while.body3.lr.ph

while.body3.lr.ph:                                ; preds = %entry, %while.cond.loopexit
  %m.addr.041 = phi i64 [ %dec, %while.cond.loopexit ], [ %m, %entry ]
  %dec = add nsw i64 %m.addr.041, -1
  %conv = trunc i64 %m.addr.041 to i32
  %conv11 = trunc i64 %dec to i32
  br label %while.body3

while.body3:                                      ; preds = %while.body3.lr.ph, %while.body3
  %n.039 = phi i64 [ %m.addr.041, %while.body3.lr.ph ], [ %dec16, %while.body3 ]
  %inc = add nsw i64 %n.039, 1
  %arrayidx = getelementptr inbounds i32, i32* %s, i64 %n.039
  %inc5 = add nsw i64 %n.039, 2
  %arrayidx6 = getelementptr inbounds i32, i32* %s, i64 %inc
  %sub = sub nsw i64 %dec, %inc5
  %conv7 = trunc i64 %sub to i32
  %arrayidx9 = getelementptr inbounds i32, i32* %s, i64 %inc5
  store i32 %conv7, i32* %arrayidx9, align 4
  store i32 %conv11, i32* %arrayidx6, align 4
  store i32 %conv, i32* %arrayidx, align 4
  %dec16 = add nsw i64 %n.039, -1
  %tobool2 = icmp eq i64 %dec16, 0
  br i1 %tobool2, label %while.cond.loopexit, label %while.body3

while.end18:                                      ; preds = %while.cond.loopexit, %entry
  ret void


; CHECK-LABEL: @general_loop
; CHECK: mtctr
; GENERIC: .p2align 4
; PWR: .p2align 5
; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 5
; CHECK: bdnz
}

; Test the small loop alignment when the innermost hot loop has less than 4 instructions.
; Both loop bodies are a single inline-asm "subi" whose result is compared with
; zero, so the expectations bracket the alignment directive with mr and bne
; instead of mtctr and bdnz.
define void @small_loop(i64 %m) {
entry:
  br label %do.body

do.body:                                          ; preds = %do.end, %entry
  %m.addr.0 = phi i64 [ %m, %entry ], [ %1, %do.end ]
  br label %do.body1

do.body1:                                         ; preds = %do.body1, %do.body
  %n.0 = phi i64 [ %m.addr.0, %do.body ], [ %0, %do.body1 ]
  %0 = tail call i64 asm "subi $0,$0,1", "=r,0"(i64 %n.0)
  %tobool = icmp eq i64 %0, 0
  br i1 %tobool, label %do.end, label %do.body1

do.end:                                           ; preds = %do.body1
  %1 = tail call i64 asm "subi $1,$1,1", "=r,0"(i64 %m.addr.0)
  %tobool3 = icmp eq i64 %1, 0
  br i1 %tobool3, label %do.end4, label %do.body

do.end4:                                          ; preds = %do.end
  ret void


; CHECK-LABEL: @small_loop
; CHECK: mr
; GENERIC: .p2align 4
; PWR: .p2align 5
; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
; CHECK: bne
}

; Test the loop alignment when the innermost cold loop has more than 8 instructions.
; The expectations allow one .p2align 5 after mtctr on the PWR invocations and
; then forbid any further .p2align 5 before bdnz on every invocation.
; NOTE(review): the IR visible here carries no profile metadata and appears to
; match @big_loop instruction for instruction - confirm what is meant to make
; the inner loop cold.
define void @big_loop_cold_innerloop(%struct.parm* %arg) {
entry:
  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
  br label %do.body

do.body:                                          ; preds = %do.end, %entry
  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
  br label %do.body3

do.body3:                                         ; preds = %do.body3, %do.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
  %1 = add nsw i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
  %2 = add nsw i64 %indvars.iv, 3
  %3 = trunc i64 %1 to i32
  %4 = add nsw i64 %indvars.iv, 4
  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
  %5 = trunc i64 %2 to i32
  store i32 %5, i32* %arrayidx10, align 4
  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
  %6 = trunc i64 %4 to i32
  store i32 %6, i32* %arrayidx12, align 4
  store i32 %3, i32* %arrayidx, align 4
  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
  %7 = trunc i64 %indvars.iv to i32
  %8 = add i32 %7, 1
  store i32 %8, i32* %arrayidx21, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %9 = icmp eq i64 %indvars.iv, 0
  br i1 %9, label %do.end, label %do.body3

do.end:                                           ; preds = %do.body3
  %dec24 = add nsw i32 %m.0, -1
  %tobool25 = icmp eq i32 %m.0, 0
  br i1 %tobool25, label %do.end26, label %do.body

do.end26:                                         ; preds = %do.end
  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
  store i32 0, i32* %arrayidx28, align 4
  ret void


; CHECK-LABEL: @big_loop_cold_innerloop
; CHECK: mtctr
; PWR: .p2align 5
; CHECK-NOT: .p2align 5
; CHECK: bdnz
}