; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s

; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
;
; chains:
; 1: base: base1 + offset, offsets: (0, offset)
; 2: base: base1 + 3*offset, offsets: (0, offset)
;
; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   long long o4 = base1 + 4 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
define i64 @two_chain_same_offset_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_same_offset_succ:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    blt cr0, .LBB0_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    sldi r7, r4, 1
; CHECK-NEXT:    mtctr r6
; CHECK-NEXT:    add r8, r4, r7
; CHECK-NEXT:    add r7, r5, r4
; CHECK-NEXT:    add r5, r5, r8
; CHECK-NEXT:    add r7, r3, r7
; CHECK-NEXT:    add r5, r3, r5
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB0_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    ld r6, 0(r7)
; CHECK-NEXT:    ldx r8, r7, r4
; CHECK-NEXT:    ld r9, 0(r5)
; CHECK-NEXT:    ldx r10, r5, r4
; CHECK-NEXT:    addi r7, r7, 1
; CHECK-NEXT:    addi r5, r5, 1
; CHECK-NEXT:    mulld r6, r8, r6
; CHECK-NEXT:    mulld r6, r6, r9
; CHECK-NEXT:    maddld r3, r6, r10, r3
; CHECK-NEXT:    bdnz .LBB0_2
; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-NEXT:    blr
; CHECK-NEXT:  .LBB0_4:
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul2 = mul nsw i64 %offset, 3
  %mul4 = shl nsw i64 %offset, 2
  %cmp46 = icmp sgt i64 %n, 0
  br i1 %cmp46, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
  ret i64 %sum.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.047, %base1
  %add.ptr9.idx = add i64 %add, %offset
  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
  %0 = bitcast i8* %add.ptr9 to i64*
  %1 = load i64, i64* %0, align 8
  %add.ptr10.idx = add i64 %add, %mul
  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
  %2 = bitcast i8* %add.ptr10 to i64*
  %3 = load i64, i64* %2, align 8
  %add.ptr11.idx = add i64 %add, %mul2
  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
  %4 = bitcast i8* %add.ptr11 to i64*
  %5 = load i64, i64* %4, align 8
  %add.ptr12.idx = add i64 %add, %mul4
  %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
  %6 = bitcast i8* %add.ptr12 to i64*
  %7 = load i64, i64* %6, align 8
  %mul13 = mul i64 %3, %1
  %mul14 = mul i64 %mul13, %5
  %mul15 = mul i64 %mul14, %7
  %add16 = add i64 %mul15, %sum.048
  %inc = add nuw nsw i64 %i.047, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
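
; A hand-written sketch (not compiler output) of the commoned form that the
; CHECK lines above verify for two_chain_same_offset_succ: one running pointer
; per chain, each serving a 0-displacement load and one register-offset load,
; and bumped by 1 every iteration.  The names c1/c2 are illustrative only.
;
; char *c1 = p + base1 + offset;     // chain 1 base
; char *c2 = p + base1 + 3 * offset; // chain 2 base
; for (long long i = 0; i < n; ++i) {
;   unsigned long x1 = *(unsigned long *)c1;             // ld 0(c1)
;   unsigned long x2 = *(unsigned long *)(c1 + offset);  // ldx c1, offset
;   unsigned long x3 = *(unsigned long *)c2;             // ld 0(c2)
;   unsigned long x4 = *(unsigned long *)(c2 + offset);  // ldx c2, offset
;   sum += x1 * x2 * x3 * x4;
;   ++c1;
;   ++c2;
; }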

; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
; 5: + offset
;
; It cannot be commoned into chains because one address would be left in a chain of its own.
; It is not profitable to common chains if not all addresses are covered by chains.
;
; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   long long o4 = base1 + 4 * offset;
;   long long o5 = base1 + 5 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   char *p5 = p + o5;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     unsigned long x5 = *(unsigned long *)(p5 + i);
;     sum += x1 * x2 * x3 * x4 * x5;
;   }
;   return sum;
; }
;
define i64 @not_perfect_chain_all_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT:    blt cr0, .LBB1_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    sldi r7, r4, 1
; CHECK-NEXT:    sldi r9, r4, 2
; CHECK-NEXT:    add r5, r3, r5
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    add r8, r4, r7
; CHECK-NEXT:    mtctr r6
; CHECK-NEXT:    add r10, r4, r9
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB1_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    ldx r6, r5, r4
; CHECK-NEXT:    ldx r11, r5, r7
; CHECK-NEXT:    ldx r12, r5, r8
; CHECK-NEXT:    ldx r0, r5, r9
; CHECK-NEXT:    mulld r6, r11, r6
; CHECK-NEXT:    ldx r30, r5, r10
; CHECK-NEXT:    addi r5, r5, 1
; CHECK-NEXT:    mulld r6, r6, r12
; CHECK-NEXT:    mulld r6, r6, r0
; CHECK-NEXT:    maddld r3, r6, r30, r3
; CHECK-NEXT:    bdnz .LBB1_2
; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT:    blr
; CHECK-NEXT:  .LBB1_4:
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT:    blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul2 = mul nsw i64 %offset, 3
  %mul4 = shl nsw i64 %offset, 2
  %mul6 = mul nsw i64 %offset, 5
  %cmp58 = icmp sgt i64 %n, 0
  br i1 %cmp58, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
  ret i64 %sum.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
  %i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.059, %base1
  %add.ptr12.idx = add i64 %add, %offset
  %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
  %0 = bitcast i8* %add.ptr12 to i64*
  %1 = load i64, i64* %0, align 8
  %add.ptr13.idx = add i64 %add, %mul
  %add.ptr13 = getelementptr inbounds i8, i8* %p, i64 %add.ptr13.idx
  %2 = bitcast i8* %add.ptr13 to i64*
  %3 = load i64, i64* %2, align 8
  %add.ptr14.idx = add i64 %add, %mul2
  %add.ptr14 = getelementptr inbounds i8, i8* %p, i64 %add.ptr14.idx
  %4 = bitcast i8* %add.ptr14 to i64*
  %5 = load i64, i64* %4, align 8
  %add.ptr15.idx = add i64 %add, %mul4
  %add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
  %6 = bitcast i8* %add.ptr15 to i64*
  %7 = load i64, i64* %6, align 8
  %add.ptr16.idx = add i64 %add, %mul6
  %add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
  %8 = bitcast i8* %add.ptr16 to i64*
  %9 = load i64, i64* %8, align 8
  %mul17 = mul i64 %3, %1
  %mul18 = mul i64 %mul17, %5
  %mul19 = mul i64 %mul18, %7
  %mul20 = mul i64 %mul19, %9
  %add21 = add i64 %mul20, %sum.060
  %inc = add nuw nsw i64 %i.059, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
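
; One possible grouping of the five addresses above, shown only to illustrate
; why the pass gives up here:
;   chain A: { base1 + offset,   base1 + 2*offset }   offsets (0, offset)
;   chain B: { base1 + 3*offset, base1 + 4*offset }   offsets (0, offset)
;   left over: { base1 + 5*offset }                   a single-address chain
; The fifth address cannot join either chain, so no commoning is done and all
; five loads keep the plain base + index form checked above.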

; addresses:
; 1: base1
; 2: + 2*offset
; 3: + offset
;
; We need at least 4 addresses to common 2 chains and reuse at least 1 offset.
;
; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     sum += x1 * x2 * x3;
;   }
;   return sum;
; }
;
define i64 @no_enough_elements_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_enough_elements_fail:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    blt cr0, .LBB2_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    sldi r7, r4, 1
; CHECK-NEXT:    mtctr r6
; CHECK-NEXT:    add r5, r3, r5
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    add r4, r4, r7
; CHECK-NEXT:    .p2align 5
; CHECK-NEXT:  .LBB2_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    ld r6, 0(r5)
; CHECK-NEXT:    ldx r8, r5, r7
; CHECK-NEXT:    ldx r9, r5, r4
; CHECK-NEXT:    addi r5, r5, 1
; CHECK-NEXT:    mulld r6, r8, r6
; CHECK-NEXT:    maddld r3, r6, r9, r3
; CHECK-NEXT:    bdnz .LBB2_2
; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-NEXT:    blr
; CHECK-NEXT:  .LBB2_4:
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul1 = mul nsw i64 %offset, 3
  %cmp32 = icmp sgt i64 %n, 0
  br i1 %cmp32, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
  ret i64 %sum.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
  %i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add.ptr5.idx = add i64 %i.033, %base1
  %add.ptr5 = getelementptr inbounds i8, i8* %p, i64 %add.ptr5.idx
  %0 = bitcast i8* %add.ptr5 to i64*
  %1 = load i64, i64* %0, align 8
  %add.ptr6.idx = add i64 %add.ptr5.idx, %mul
  %add.ptr6 = getelementptr inbounds i8, i8* %p, i64 %add.ptr6.idx
  %2 = bitcast i8* %add.ptr6 to i64*
  %3 = load i64, i64* %2, align 8
  %add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
  %add.ptr7 = getelementptr inbounds i8, i8* %p, i64 %add.ptr7.idx
  %4 = bitcast i8* %add.ptr7 to i64*
  %5 = load i64, i64* %4, align 8
  %mul8 = mul i64 %3, %1
  %mul9 = mul i64 %mul8, %5
  %add10 = add i64 %mul9, %sum.034
  %inc = add nuw nsw i64 %i.033, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
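
; With only 3 addresses, the best possible split is one 2-address chain plus a
; lone address (illustration only):
;   chain A: { base1, base1 + 2*offset }   offsets (0, 2*offset)
;   left over: { base1 + 3*offset }
; Commoning needs at least two chains so that an offset register is reused;
; a single chain saves nothing, so the loop above is left untouched.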

; addresses:
; 1: base1
; 2: + 2*offset
; 3: + 2*offset
; 4: + 3*offset
;
; The diff between address 2 and address 1 is 2*offset, and this offset is not reused by the
; other candidate chain, so we cannot common any chains.
;
; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 4 * offset;
;   long long o4 = base1 + 7 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
define i64 @no_reuseable_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_reuseable_offset_fail:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    blt cr0, .LBB3_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    sldi r9, r4, 3
; CHECK-NEXT:    mtctr r6
; CHECK-NEXT:    add r5, r3, r5
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    sldi r7, r4, 1
; CHECK-NEXT:    sldi r8, r4, 2
; CHECK-NEXT:    sub r4, r9, r4
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB3_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    ld r6, 0(r5)
; CHECK-NEXT:    ldx r9, r5, r7
; CHECK-NEXT:    ldx r10, r5, r8
; CHECK-NEXT:    ldx r11, r5, r4
; CHECK-NEXT:    addi r5, r5, 1
; CHECK-NEXT:    mulld r6, r9, r6
; CHECK-NEXT:    mulld r6, r6, r10
; CHECK-NEXT:    maddld r3, r6, r11, r3
; CHECK-NEXT:    bdnz .LBB3_2
; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-NEXT:    blr
; CHECK-NEXT:  .LBB3_4:
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul1 = shl nsw i64 %offset, 2
  %mul3 = mul nsw i64 %offset, 7
  %cmp44 = icmp sgt i64 %n, 0
  br i1 %cmp44, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
  ret i64 %sum.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add.ptr8.idx = add i64 %i.045, %base1
  %add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
  %0 = bitcast i8* %add.ptr8 to i64*
  %1 = load i64, i64* %0, align 8
  %add.ptr9.idx = add i64 %add.ptr8.idx, %mul
  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
  %2 = bitcast i8* %add.ptr9 to i64*
  %3 = load i64, i64* %2, align 8
  %add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
  %4 = bitcast i8* %add.ptr10 to i64*
  %5 = load i64, i64* %4, align 8
  %add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
  %6 = bitcast i8* %add.ptr11 to i64*
  %7 = load i64, i64* %6, align 8
  %mul12 = mul i64 %3, %1
  %mul13 = mul i64 %mul12, %5
  %mul14 = mul i64 %mul13, %7
  %add15 = add i64 %mul14, %sum.046
  %inc = add nuw nsw i64 %i.045, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
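
; The candidate chains here would be (illustration only):
;   chain A: { base1,            base1 + 2*offset }   offsets (0, 2*offset)
;   chain B: { base1 + 4*offset, base1 + 7*offset }   offsets (0, 3*offset)
; The inner offsets differ (2*offset vs. 3*offset), so no offset register could
; be shared between the chains and the loop above keeps its original form.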

; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + 3*offset
; 4: + 2*offset
; 5: + 1*offset
; 6: + 2*offset
;
; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4
; and address 5, but the diff between address 3 and address 2 (3*offset) is not the same as the diff
; between address 6 and address 5 (2*offset), so we cannot common chains for these addresses.
;
; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 5 * offset;
;   long long o4 = base1 + 7 * offset;
;   long long o5 = base1 + 8 * offset;
;   long long o6 = base1 + 10 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   char *p5 = p + o5;
;   char *p6 = p + o6;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     unsigned long x5 = *(unsigned long *)(p5 + i);
;     unsigned long x6 = *(unsigned long *)(p6 + i);
;     sum += x1 * x2 * x3 * x4 * x5 * x6;
;   }
;   return sum;
; }
;
define i64 @not_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_same_offset_fail:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT:    blt cr0, .LBB4_3
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    mulli r11, r4, 10
; CHECK-NEXT:    sldi r8, r4, 2
; CHECK-NEXT:    add r5, r3, r5
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    add r8, r4, r8
; CHECK-NEXT:    sldi r9, r4, 3
; CHECK-NEXT:    mtctr r6
; CHECK-NEXT:    sldi r7, r4, 1
; CHECK-NEXT:    sub r10, r9, r4
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB4_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    ldx r6, r5, r4
; CHECK-NEXT:    ldx r12, r5, r7
; CHECK-NEXT:    ldx r0, r5, r8
; CHECK-NEXT:    ldx r30, r5, r10
; CHECK-NEXT:    mulld r6, r12, r6
; CHECK-NEXT:    ldx r29, r5, r9
; CHECK-NEXT:    ldx r28, r5, r11
; CHECK-NEXT:    addi r5, r5, 1
; CHECK-NEXT:    mulld r6, r6, r0
; CHECK-NEXT:    mulld r6, r6, r30
; CHECK-NEXT:    mulld r6, r6, r29
; CHECK-NEXT:    maddld r3, r6, r28, r3
; CHECK-NEXT:    bdnz .LBB4_2
; CHECK-NEXT:    b .LBB4_4
; CHECK-NEXT:  .LBB4_3:
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:  .LBB4_4: # %for.cond.cleanup
; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT:    blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul2 = mul nsw i64 %offset, 5
  %mul4 = mul nsw i64 %offset, 7
  %mul6 = shl nsw i64 %offset, 3
  %mul8 = mul nsw i64 %offset, 10
  %cmp70 = icmp sgt i64 %n, 0
  br i1 %cmp70, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
  ret i64 %sum.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
  %i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.071, %base1
  %add.ptr15.idx = add i64 %add, %offset
  %add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
  %0 = bitcast i8* %add.ptr15 to i64*
  %1 = load i64, i64* %0, align 8
  %add.ptr16.idx = add i64 %add, %mul
  %add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
  %2 = bitcast i8* %add.ptr16 to i64*
  %3 = load i64, i64* %2, align 8
  %add.ptr17.idx = add i64 %add, %mul2
  %add.ptr17 = getelementptr inbounds i8, i8* %p, i64 %add.ptr17.idx
  %4 = bitcast i8* %add.ptr17 to i64*
  %5 = load i64, i64* %4, align 8
  %add.ptr18.idx = add i64 %add, %mul4
  %add.ptr18 = getelementptr inbounds i8, i8* %p, i64 %add.ptr18.idx
  %6 = bitcast i8* %add.ptr18 to i64*
  %7 = load i64, i64* %6, align 8
  %add.ptr19.idx = add i64 %add, %mul6
  %add.ptr19 = getelementptr inbounds i8, i8* %p, i64 %add.ptr19.idx
  %8 = bitcast i8* %add.ptr19 to i64*
  %9 = load i64, i64* %8, align 8
  %add.ptr20.idx = add i64 %add, %mul8
  %add.ptr20 = getelementptr inbounds i8, i8* %p, i64 %add.ptr20.idx
  %10 = bitcast i8* %add.ptr20 to i64*
  %11 = load i64, i64* %10, align 8
  %mul21 = mul i64 %3, %1
  %mul22 = mul i64 %mul21, %5
  %mul23 = mul i64 %mul22, %7
  %mul24 = mul i64 %mul23, %9
  %mul25 = mul i64 %mul24, %11
  %add26 = add i64 %mul25, %sum.072
  %inc = add nuw nsw i64 %i.071, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
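
; Worked out from o1..o6 above, the successive diffs are
; (offset, 3*offset, 2*offset, offset, 2*offset).  Pairing the addresses as
; (1,2) (3,4) (5,6) gives chains with offset sets (0, offset), (0, 2*offset)
; and (0, 2*offset): the first chain does not match the other two, so not
; every address ends up in a matching chain and nothing is commoned
; (illustration only; the pass may enumerate candidates differently).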

; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base1 + 4*offset, offsets: (0, 2*offset)
;
; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 3 * offset;
;   long long o3 = base1 + 4 * offset;
;   long long o4 = base1 + 6 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
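; A hand-written sketch (not compiler output) of the commoned form checked
; below: both chains advance by 1 per iteration and share the same 2*offset
; register offset.  The names c1/c2 are illustrative only.
;
; char *c1 = p + base1 + offset;     // chain 1 base
; char *c2 = p + base1 + 4 * offset; // chain 2 base
; for (long long i = 0; i < n; ++i) {
;   unsigned long x1 = *(unsigned long *)c1;                 // ld 0(c1)
;   unsigned long x2 = *(unsigned long *)(c1 + 2 * offset);  // ldx c1, 2*offset
;   unsigned long x3 = *(unsigned long *)c2;                 // ld 0(c2)
;   unsigned long x4 = *(unsigned long *)(c2 + 2 * offset);  // ldx c2, 2*offset
;   sum += x1 * x2 * x3 * x4;
;   ++c1;
;   ++c2;
; }
;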
define i64 @two_chain_different_offsets_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_different_offsets_succ:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    blt cr0, .LBB5_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    sldi r8, r4, 2
; CHECK-NEXT:    add r7, r5, r4
; CHECK-NEXT:    mtctr r6
; CHECK-NEXT:    add r5, r5, r8
; CHECK-NEXT:    add r7, r3, r7
; CHECK-NEXT:    sldi r4, r4, 1
; CHECK-NEXT:    add r5, r3, r5
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB5_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    ld r6, 0(r7)
; CHECK-NEXT:    ldx r8, r7, r4
; CHECK-NEXT:    ld r9, 0(r5)
; CHECK-NEXT:    ldx r10, r5, r4
; CHECK-NEXT:    addi r7, r7, 1
; CHECK-NEXT:    addi r5, r5, 1
; CHECK-NEXT:    mulld r6, r8, r6
; CHECK-NEXT:    mulld r6, r6, r9
; CHECK-NEXT:    maddld r3, r6, r10, r3
; CHECK-NEXT:    bdnz .LBB5_2
; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-NEXT:    blr
; CHECK-NEXT:  .LBB5_4:
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    blr
entry:
  %mul = mul nsw i64 %offset, 3
  %mul2 = shl nsw i64 %offset, 2
  %mul4 = mul nsw i64 %offset, 6
  %cmp46 = icmp sgt i64 %n, 0
  br i1 %cmp46, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
  ret i64 %sum.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.047, %base1
  %add.ptr9.idx = add i64 %add, %offset
  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
  %0 = bitcast i8* %add.ptr9 to i64*
  %1 = load i64, i64* %0, align 8
  %add.ptr10.idx = add i64 %add, %mul
  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
  %2 = bitcast i8* %add.ptr10 to i64*
  %3 = load i64, i64* %2, align 8
  %add.ptr11.idx = add i64 %add, %mul2
  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
  %4 = bitcast i8* %add.ptr11 to i64*
  %5 = load i64, i64* %4, align 8
  %add.ptr12.idx = add i64 %add, %mul4
  %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
  %6 = bitcast i8* %add.ptr12 to i64*
  %7 = load i64, i64* %6, align 8
  %mul13 = mul i64 %3, %1
  %mul14 = mul i64 %mul13, %5
  %mul15 = mul i64 %mul14, %7
  %add16 = add i64 %mul15, %sum.048
  %inc = add nuw nsw i64 %i.047, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + base2 - base1 - 2*offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base2 + offset, offsets: (0, 2*offset)
;
; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 3 * offset;
;   long long o3 = base2 + offset;
;   long long o4 = base2 + 3 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
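; A hand-written sketch (not compiler output) of the commoned form checked
; below: one chain per base, both reusing the same 2*offset register offset.
; The names c1/c2 are illustrative only.
;
; char *c1 = p + base1 + offset; // chain 1 base
; char *c2 = p + base2 + offset; // chain 2 base
; for (long long i = 0; i < n; ++i) {
;   unsigned long x1 = *(unsigned long *)c1;                 // ld 0(c1)
;   unsigned long x2 = *(unsigned long *)(c1 + 2 * offset);  // ldx c1, 2*offset
;   unsigned long x3 = *(unsigned long *)c2;                 // ld 0(c2)
;   unsigned long x4 = *(unsigned long *)(c2 + 2 * offset);  // ldx c2, 2*offset
;   sum += x1 * x2 * x3 * x4;
;   ++c1;
;   ++c2;
; }
;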
define i64 @two_chain_two_bases_succ(i8* %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
; CHECK-LABEL: two_chain_two_bases_succ:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r7, 1
; CHECK-NEXT:    blt cr0, .LBB6_4
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    add r6, r6, r4
; CHECK-NEXT:    add r5, r5, r4
; CHECK-NEXT:    mtctr r7
; CHECK-NEXT:    sldi r4, r4, 1
; CHECK-NEXT:    add r5, r3, r5
; CHECK-NEXT:    add r6, r3, r6
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB6_2: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    ld r7, 0(r5)
; CHECK-NEXT:    ldx r8, r5, r4
; CHECK-NEXT:    ld r9, 0(r6)
; CHECK-NEXT:    ldx r10, r6, r4
; CHECK-NEXT:    addi r5, r5, 1
; CHECK-NEXT:    addi r6, r6, 1
; CHECK-NEXT:    mulld r7, r8, r7
; CHECK-NEXT:    mulld r7, r7, r9
; CHECK-NEXT:    maddld r3, r7, r10, r3
; CHECK-NEXT:    bdnz .LBB6_2
; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
; CHECK-NEXT:    blr
; CHECK-NEXT:  .LBB6_4:
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    blr
entry:
  %mul = mul nsw i64 %offset, 3
  %cmp44 = icmp sgt i64 %n, 0
  br i1 %cmp44, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
  ret i64 %sum.0.lcssa

for.body:                                         ; preds = %entry, %for.body
  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.045, %base1
  %add.ptr8.idx = add i64 %add, %offset
  %add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
  %0 = bitcast i8* %add.ptr8 to i64*
  %1 = load i64, i64* %0, align 8
  %add1 = add i64 %i.045, %mul
  %add.ptr9.idx = add i64 %add1, %base1
  %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
  %2 = bitcast i8* %add.ptr9 to i64*
  %3 = load i64, i64* %2, align 8
  %add2 = add i64 %i.045, %base2
  %add.ptr10.idx = add i64 %add2, %offset
  %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
  %4 = bitcast i8* %add.ptr10 to i64*
  %5 = load i64, i64* %4, align 8
  %add.ptr11.idx = add i64 %add2, %mul
  %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
  %6 = bitcast i8* %add.ptr11 to i64*
  %7 = load i64, i64* %6, align 8
  %mul12 = mul i64 %3, %1
  %mul13 = mul i64 %mul12, %5
  %mul14 = mul i64 %mul13, %7
  %add15 = add i64 %mul14, %sum.046
  %inc = add nuw nsw i64 %i.045, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
;
; Check that chain commoning reduces register pressure and so saves register
; spills and reloads.
;
; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
;   inc = inc4;
; #pragma unroll 4
;   for (long long i = 0; i < 4 * m; i++) {
;     output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
;     output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
;     output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
;     inc = inc + inc4;
;   }
;   return 0;
; }
;
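; A rough sketch (not compiler output) of the idea behind this test: the three
; element indices all share the running value inc, so each (array, index)
; pair can be walked with its own running pointer that is simply advanced by
; inc4 every iteration, and the unrolled body then reuses a small set of bases
; and register offsets instead of rematerializing (and spilling) every address.
; The names below are illustrative only.
;
; double *out1 = output + inc4 + inc1;
; double *in11 = input1 + inc4 + inc1;
; double *in21 = input2 + inc4 + inc1;
; // ...likewise for the inc2 and inc3 groups...
; for (long long i = 0; i < 4 * m; i++) {
;   *out1 += *in11 * *in21;
;   // ...inc2 and inc3 groups...
;   out1 += inc4; in11 += inc4; in21 += inc4;
; }
;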
define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
; CHECK-LABEL: spill_reduce_succ:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    std r14, -144(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r15, -136(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r16, -128(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r17, -120(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r18, -112(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r19, -104(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r20, -96(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r21, -88(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r22, -80(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r23, -72(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r24, -64(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r25, -56(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r26, -48(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r31, -8(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r2, -152(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r9, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT:    std r7, -168(r1) # 8-byte Folded Spill
; CHECK-NEXT:    blt cr0, .LBB7_7
; CHECK-NEXT:  # %bb.1: # %for.body.preheader
; CHECK-NEXT:    sldi r6, r6, 2
; CHECK-NEXT:    li r7, 1
; CHECK-NEXT:    mr r12, r10
; CHECK-NEXT:    cmpdi r6, 1
; CHECK-NEXT:    iselgt r7, r6, r7
; CHECK-NEXT:    addi r8, r7, -1
; CHECK-NEXT:    clrldi r6, r7, 63
; CHECK-NEXT:    cmpldi r8, 3
; CHECK-NEXT:    blt cr0, .LBB7_4
; CHECK-NEXT:  # %bb.2: # %for.body.preheader.new
; CHECK-NEXT:    rldicl r7, r7, 62, 2
; CHECK-NEXT:    sldi r10, r12, 2
; CHECK-NEXT:    ld r2, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT:    rldicl r7, r7, 2, 1
; CHECK-NEXT:    std r7, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT:    ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT:    add r8, r7, r10
; CHECK-NEXT:    mr r22, r7
; CHECK-NEXT:    mr r7, r4
; CHECK-NEXT:    mr r4, r3
; CHECK-NEXT:    ld r3, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT:    sldi r8, r8, 3
; CHECK-NEXT:    add r9, r5, r8
; CHECK-NEXT:    add r8, r3, r10
; CHECK-NEXT:    add r10, r2, r10
; CHECK-NEXT:    sldi r10, r10, 3
; CHECK-NEXT:    sldi r8, r8, 3
; CHECK-NEXT:    add r30, r5, r10
; CHECK-NEXT:    add r29, r7, r10
; CHECK-NEXT:    add r28, r4, r10
; CHECK-NEXT:    sldi r10, r12, 1
; CHECK-NEXT:    add r8, r5, r8
; CHECK-NEXT:    add r11, r12, r10
; CHECK-NEXT:    add r0, r22, r11
; CHECK-NEXT:    sldi r0, r0, 3
; CHECK-NEXT:    add r27, r5, r0
; CHECK-NEXT:    add r0, r3, r11
; CHECK-NEXT:    add r11, r2, r11
; CHECK-NEXT:    sldi r11, r11, 3
; CHECK-NEXT:    sldi r0, r0, 3
; CHECK-NEXT:    add r25, r5, r11
; CHECK-NEXT:    add r24, r7, r11
; CHECK-NEXT:    add r23, r4, r11
; CHECK-NEXT:    add r11, r22, r10
; CHECK-NEXT:    add r26, r5, r0
; CHECK-NEXT:    mr r0, r22
; CHECK-NEXT:    sldi r11, r11, 3
; CHECK-NEXT:    add r22, r5, r11
; CHECK-NEXT:    add r11, r3, r10
; CHECK-NEXT:    add r10, r2, r10
; CHECK-NEXT:    sldi r10, r10, 3
; CHECK-NEXT:    sldi r11, r11, 3
; CHECK-NEXT:    add r20, r5, r10
; CHECK-NEXT:    add r19, r7, r10
; CHECK-NEXT:    add r18, r4, r10
; CHECK-NEXT:    add r10, r12, r0
; CHECK-NEXT:    add r21, r5, r11
; CHECK-NEXT:    sldi r11, r2, 3
; CHECK-NEXT:    sldi r10, r10, 3
; CHECK-NEXT:    add r17, r5, r10
; CHECK-NEXT:    add r10, r12, r3
; CHECK-NEXT:    sldi r10, r10, 3
; CHECK-NEXT:    add r16, r5, r10
; CHECK-NEXT:    add r10, r12, r2
; CHECK-NEXT:    sldi r10, r10, 3
; CHECK-NEXT:    add r15, r5, r10
; CHECK-NEXT:    add r14, r7, r10
; CHECK-NEXT:    add r31, r4, r10
; CHECK-NEXT:    sldi r10, r3, 3
; CHECK-NEXT:    mr r3, r4
; CHECK-NEXT:    mr r4, r7
; CHECK-NEXT:    ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT:    sub r0, r10, r11
; CHECK-NEXT:    sldi r10, r7, 3
; CHECK-NEXT:    ld r7, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT:    sub r2, r10, r11
; CHECK-NEXT:    li r11, 0
; CHECK-NEXT:    mr r10, r12
; CHECK-NEXT:    addi r7, r7, -4
; CHECK-NEXT:    rldicl r7, r7, 62, 2
; CHECK-NEXT:    addi r7, r7, 1
; CHECK-NEXT:    mtctr r7
; CHECK-NEXT:    sldi r7, r12, 5
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB7_3: # %for.body
; CHECK-NEXT:    #
; CHECK-NEXT:    lfd f0, 0(r31)
; CHECK-NEXT:    lfd f1, 0(r14)
; CHECK-NEXT:    add r10, r10, r12
; CHECK-NEXT:    add r10, r10, r12
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfd f1, 0(r15)
; CHECK-NEXT:    add r10, r10, r12
; CHECK-NEXT:    add r10, r10, r12
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfd f0, 0(r15)
; CHECK-NEXT:    add r15, r15, r7
; CHECK-NEXT:    lfdx f0, r31, r0
; CHECK-NEXT:    lfdx f1, r14, r0
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r16, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r16, r11
; CHECK-NEXT:    lfdx f0, r31, r2
; CHECK-NEXT:    lfdx f1, r14, r2
; CHECK-NEXT:    add r31, r31, r7
; CHECK-NEXT:    add r14, r14, r7
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r17, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r17, r11
; CHECK-NEXT:    lfd f0, 0(r18)
; CHECK-NEXT:    lfd f1, 0(r19)
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r20, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r20, r11
; CHECK-NEXT:    lfdx f0, r18, r0
; CHECK-NEXT:    lfdx f1, r19, r0
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r21, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r21, r11
; CHECK-NEXT:    lfdx f0, r18, r2
; CHECK-NEXT:    lfdx f1, r19, r2
; CHECK-NEXT:    add r18, r18, r7
; CHECK-NEXT:    add r19, r19, r7
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r22, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r22, r11
; CHECK-NEXT:    lfd f0, 0(r23)
; CHECK-NEXT:    lfd f1, 0(r24)
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r25, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r25, r11
; CHECK-NEXT:    lfdx f0, r23, r0
; CHECK-NEXT:    lfdx f1, r24, r0
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r26, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r26, r11
; CHECK-NEXT:    lfdx f0, r23, r2
; CHECK-NEXT:    lfdx f1, r24, r2
; CHECK-NEXT:    add r23, r23, r7
; CHECK-NEXT:    add r24, r24, r7
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r27, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r27, r11
; CHECK-NEXT:    lfd f0, 0(r28)
; CHECK-NEXT:    lfd f1, 0(r29)
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r30, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r30, r11
; CHECK-NEXT:    lfdx f0, r28, r0
; CHECK-NEXT:    lfdx f1, r29, r0
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r8, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r8, r11
; CHECK-NEXT:    lfdx f0, r28, r2
; CHECK-NEXT:    lfdx f1, r29, r2
; CHECK-NEXT:    add r28, r28, r7
; CHECK-NEXT:    add r29, r29, r7
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r9, r11
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r9, r11
; CHECK-NEXT:    add r11, r11, r7
; CHECK-NEXT:    bdnz .LBB7_3
; CHECK-NEXT:  .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT:    cmpldi r6, 0
; CHECK-NEXT:    beq cr0, .LBB7_7
; CHECK-NEXT:  # %bb.5: # %for.body.epil.preheader
; CHECK-NEXT:    sldi r8, r12, 3
; CHECK-NEXT:    ld r12, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT:    add r12, r10, r12
; CHECK-NEXT:    add r7, r10, r7
; CHECK-NEXT:    sldi r0, r12, 3
; CHECK-NEXT:    sldi r11, r7, 3
; CHECK-NEXT:    add r12, r5, r0
; CHECK-NEXT:    add r30, r4, r0
; CHECK-NEXT:    add r29, r3, r0
; CHECK-NEXT:    ld r0, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT:    add r7, r5, r11
; CHECK-NEXT:    add r9, r4, r11
; CHECK-NEXT:    add r11, r3, r11
; CHECK-NEXT:    add r10, r10, r0
; CHECK-NEXT:    sldi r10, r10, 3
; CHECK-NEXT:    add r5, r5, r10
; CHECK-NEXT:    add r4, r4, r10
; CHECK-NEXT:    add r3, r3, r10
; CHECK-NEXT:    li r10, 0
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB7_6: # %for.body.epil
; CHECK-NEXT:    #
; CHECK-NEXT:    lfdx f0, r3, r10
; CHECK-NEXT:    lfdx f1, r4, r10
; CHECK-NEXT:    addi r6, r6, -1
; CHECK-NEXT:    cmpldi r6, 0
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfd f1, 0(r5)
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfd f0, 0(r5)
; CHECK-NEXT:    add r5, r5, r8
; CHECK-NEXT:    lfdx f0, r29, r10
; CHECK-NEXT:    lfdx f1, r30, r10
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r12, r10
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r12, r10
; CHECK-NEXT:    lfdx f0, r11, r10
; CHECK-NEXT:    lfdx f1, r9, r10
; CHECK-NEXT:    xsmuldp f0, f0, f1
; CHECK-NEXT:    lfdx f1, r7, r10
; CHECK-NEXT:    xsadddp f0, f1, f0
; CHECK-NEXT:    stfdx f0, r7, r10
; CHECK-NEXT:    add r10, r10, r8
; CHECK-NEXT:    bne cr0, .LBB7_6
; CHECK-NEXT:  .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT:    ld r2, -152(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r31, -8(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT:    li r3, 0
; CHECK-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r26, -48(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r25, -56(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r24, -64(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r23, -72(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r22, -80(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r21, -88(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r20, -96(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r19, -104(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r18, -112(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r17, -120(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r16, -128(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r15, -136(r1) # 8-byte Folded Reload
; CHECK-NEXT:    ld r14, -144(r1) # 8-byte Folded Reload
; CHECK-NEXT:    blr
entry:
  %cmp49 = icmp sgt i64 %m, 0
  br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = shl i64 %m, 2
  %smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
  %1 = add nsw i64 %smax52, -1
  %xtraiter = and i64 %smax52, 1
  %2 = icmp ult i64 %1, 3
  br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = and i64 %smax52, 9223372036854775804
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  %inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil

for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  %inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %add.epil = add nsw i64 %inc.addr.050.epil, %inc1
  %arrayidx.epil = getelementptr inbounds double, double* %input1, i64 %add.epil
  %3 = load double, double* %arrayidx.epil, align 8
  %arrayidx2.epil = getelementptr inbounds double, double* %input2, i64 %add.epil
  %4 = load double, double* %arrayidx2.epil, align 8
  %mul3.epil = fmul double %3, %4
  %arrayidx5.epil = getelementptr inbounds double, double* %output, i64 %add.epil
  %5 = load double, double* %arrayidx5.epil, align 8
  %add6.epil = fadd double %5, %mul3.epil
  store double %add6.epil, double* %arrayidx5.epil, align 8
  %add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
  %arrayidx8.epil = getelementptr inbounds double, double* %input1, i64 %add7.epil
  %6 = load double, double* %arrayidx8.epil, align 8
  %arrayidx10.epil = getelementptr inbounds double, double* %input2, i64 %add7.epil
  %7 = load double, double* %arrayidx10.epil, align 8
  %mul11.epil = fmul double %6, %7
  %arrayidx13.epil = getelementptr inbounds double, double* %output, i64 %add7.epil
  %8 = load double, double* %arrayidx13.epil, align 8
  %add14.epil = fadd double %8, %mul11.epil
  store double %add14.epil, double* %arrayidx13.epil, align 8
  %add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
  %arrayidx16.epil = getelementptr inbounds double, double* %input1, i64 %add15.epil
  %9 = load double, double* %arrayidx16.epil, align 8
  %arrayidx18.epil = getelementptr inbounds double, double* %input2, i64 %add15.epil
  %10 = load double, double* %arrayidx18.epil, align 8
  %mul19.epil = fmul double %9, %10
  %arrayidx21.epil = getelementptr inbounds double, double* %output, i64 %add15.epil
  %11 = load double, double* %arrayidx21.epil, align 8
  %add22.epil = fadd double %11, %mul19.epil
  store double %add22.epil, double* %arrayidx21.epil, align 8
  %add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
  %epil.iter.sub = add nsw i64 %epil.iter, -1
  %epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
  br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret i32 0

for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
  %niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %add = add nsw i64 %inc.addr.050, %inc1
  %arrayidx = getelementptr inbounds double, double* %input1, i64 %add
  %12 = load double, double* %arrayidx, align 8
  %arrayidx2 = getelementptr inbounds double, double* %input2, i64 %add
  %13 = load double, double* %arrayidx2, align 8
  %mul3 = fmul double %12, %13
  %arrayidx5 = getelementptr inbounds double, double* %output, i64 %add
  %14 = load double, double* %arrayidx5, align 8
  %add6 = fadd double %14, %mul3
  store double %add6, double* %arrayidx5, align 8
  %add7 = add nsw i64 %inc.addr.050, %inc2
  %arrayidx8 = getelementptr inbounds double, double* %input1, i64 %add7
  %15 = load double, double* %arrayidx8, align 8
  %arrayidx10 = getelementptr inbounds double, double* %input2, i64 %add7
  %16 = load double, double* %arrayidx10, align 8
  %mul11 = fmul double %15, %16
  %arrayidx13 = getelementptr inbounds double, double* %output, i64 %add7
  %17 = load double, double* %arrayidx13, align 8
  %add14 = fadd double %17, %mul11
  store double %add14, double* %arrayidx13, align 8
  %add15 = add nsw i64 %inc.addr.050, %inc3
  %arrayidx16 = getelementptr inbounds double, double* %input1, i64 %add15
  %18 = load double, double* %arrayidx16, align 8
  %arrayidx18 = getelementptr inbounds double, double* %input2, i64 %add15
  %19 = load double, double* %arrayidx18, align 8
  %mul19 = fmul double %18, %19
  %arrayidx21 = getelementptr inbounds double, double* %output, i64 %add15
  %20 = load double, double* %arrayidx21, align 8
  %add22 = fadd double %20, %mul19
  store double %add22, double* %arrayidx21, align 8
  %add23 = add nsw i64 %inc.addr.050, %inc4
  %add.1 = add nsw i64 %add23, %inc1
  %arrayidx.1 = getelementptr inbounds double, double* %input1, i64 %add.1
  %21 = load double, double* %arrayidx.1, align 8
  %arrayidx2.1 = getelementptr inbounds double, double* %input2, i64 %add.1
  %22 = load double, double* %arrayidx2.1, align 8
  %mul3.1 = fmul double %21, %22
  %arrayidx5.1 = getelementptr inbounds double, double* %output, i64 %add.1
  %23 = load double, double* %arrayidx5.1, align 8
  %add6.1 = fadd double %23, %mul3.1
  store double %add6.1, double* %arrayidx5.1, align 8
  %add7.1 = add nsw i64 %add23, %inc2
  %arrayidx8.1 = getelementptr inbounds double, double* %input1, i64 %add7.1
  %24 = load double, double* %arrayidx8.1, align 8
  %arrayidx10.1 = getelementptr inbounds double, double* %input2, i64 %add7.1
  %25 = load double, double* %arrayidx10.1, align 8
  %mul11.1 = fmul double %24, %25
  %arrayidx13.1 = getelementptr inbounds double, double* %output, i64 %add7.1
  %26 = load double, double* %arrayidx13.1, align 8
  %add14.1 = fadd double %26, %mul11.1
  store double %add14.1, double* %arrayidx13.1, align 8
  %add15.1 = add nsw i64 %add23, %inc3
  %arrayidx16.1 = getelementptr inbounds double, double* %input1, i64 %add15.1
  %27 = load double, double* %arrayidx16.1, align 8
  %arrayidx18.1 = getelementptr inbounds double, double* %input2, i64 %add15.1
  %28 = load double, double* %arrayidx18.1, align 8
  %mul19.1 = fmul double %27, %28
  %arrayidx21.1 = getelementptr inbounds double, double* %output, i64 %add15.1
  %29 = load double, double* %arrayidx21.1, align 8
  %add22.1 = fadd double %29, %mul19.1
  store double %add22.1, double* %arrayidx21.1, align 8
  %add23.1 = add nsw i64 %add23, %inc4
  %add.2 = add nsw i64 %add23.1, %inc1
  %arrayidx.2 = getelementptr inbounds double, double* %input1, i64 %add.2
  %30 = load double, double* %arrayidx.2, align 8
  %arrayidx2.2 = getelementptr inbounds double, double* %input2, i64 %add.2
  %31 = load double, double* %arrayidx2.2, align 8
  %mul3.2 = fmul double %30, %31
  %arrayidx5.2 = getelementptr inbounds double, double* %output, i64 %add.2
  %32 = load double, double* %arrayidx5.2, align 8
  %add6.2 = fadd double %32, %mul3.2
  store double %add6.2, double* %arrayidx5.2, align 8
  %add7.2 = add nsw i64 %add23.1, %inc2
  %arrayidx8.2 = getelementptr inbounds double, double* %input1, i64 %add7.2
  %33 = load double, double* %arrayidx8.2, align 8
  %arrayidx10.2 = getelementptr inbounds double, double* %input2, i64 %add7.2
  %34 = load double, double* %arrayidx10.2, align 8
  %mul11.2 = fmul double %33, %34
  %arrayidx13.2 = getelementptr inbounds double, double* %output, i64 %add7.2
  %35 = load double, double* %arrayidx13.2, align 8
  %add14.2 = fadd double %35, %mul11.2
  store double %add14.2, double* %arrayidx13.2, align 8
  %add15.2 = add nsw i64 %add23.1, %inc3
  %arrayidx16.2 = getelementptr inbounds double, double* %input1, i64 %add15.2
  %36 = load double, double* %arrayidx16.2, align 8
  %arrayidx18.2 = getelementptr inbounds double, double* %input2, i64 %add15.2
  %37 = load double, double* %arrayidx18.2, align 8
  %mul19.2 = fmul double %36, %37
  %arrayidx21.2 = getelementptr inbounds double, double* %output, i64 %add15.2
  %38 = load double, double* %arrayidx21.2, align 8
  %add22.2 = fadd double %38, %mul19.2
  store double %add22.2, double* %arrayidx21.2, align 8
  %add23.2 = add nsw i64 %add23.1, %inc4
  %add.3 = add nsw i64 %add23.2, %inc1
  %arrayidx.3 = getelementptr inbounds double, double* %input1, i64 %add.3
  %39 = load double, double* %arrayidx.3, align 8
  %arrayidx2.3 = getelementptr inbounds double, double* %input2, i64 %add.3
  %40 = load double, double* %arrayidx2.3, align 8
  %mul3.3 = fmul double %39, %40
  %arrayidx5.3 = getelementptr inbounds double, double* %output, i64 %add.3
  %41 = load double, double* %arrayidx5.3, align 8
  %add6.3 = fadd double %41, %mul3.3
  store double %add6.3, double* %arrayidx5.3, align 8
  %add7.3 = add nsw i64 %add23.2, %inc2
  %arrayidx8.3 = getelementptr inbounds double, double* %input1, i64 %add7.3
  %42 = load double, double* %arrayidx8.3, align 8
  %arrayidx10.3 = getelementptr inbounds double, double* %input2, i64 %add7.3
  %43 = load double, double* %arrayidx10.3, align 8
  %mul11.3 = fmul double %42, %43
  %arrayidx13.3 = getelementptr inbounds double, double* %output, i64 %add7.3
  %44 = load double, double* %arrayidx13.3, align 8
  %add14.3 = fadd double %44, %mul11.3
  store double %add14.3, double* %arrayidx13.3, align 8
  %add15.3 = add nsw i64 %add23.2, %inc3
  %arrayidx16.3 = getelementptr inbounds double, double* %input1, i64 %add15.3
  %45 = load double, double* %arrayidx16.3, align 8
  %arrayidx18.3 = getelementptr inbounds double, double* %input2, i64 %add15.3
  %46 = load double, double* %arrayidx18.3, align 8
  %mul19.3 = fmul double %45, %46
  %arrayidx21.3 = getelementptr inbounds double, double* %output, i64 %add15.3
  %47 = load double, double* %arrayidx21.3, align 8
  %add22.3 = fadd double %47, %mul19.3
  store double %add22.3, double* %arrayidx21.3, align 8
  %add23.3 = add nsw i64 %add23.2, %inc4
  %niter.nsub.3 = add i64 %niter, -4
  %niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
  br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

declare i64 @llvm.smax.i64(i64, i64)