; RUN: llc -O3 -mv67t -march=hexagon < %s | FileCheck %s

; Test that the inner loop in the tiny core version of bkfir has the assembler
; directive "p2align 4".

; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK-NOT: falign
; CHECK: p2align 4
; CHECK: } :endloop0

define void @bkfir(i32* nocapture readonly %in, i32* nocapture readonly %coefs, i32 %tap, i32 %length, i32* nocapture %out) local_unnamed_addr #0 {
entry:
  %0 = bitcast i32* %out to i64*
  %cmp141 = icmp sgt i32 %length, 0
  br i1 %cmp141, label %for.body.lr.ph, label %for.end52

for.body.lr.ph:
  %1 = bitcast i32* %coefs to i64*
  %cmp8127 = icmp sgt i32 %tap, 0
  br i1 %cmp8127, label %for.body.us.preheader, label %for.body.lr.ph.split

for.body.us.preheader:
  br label %for.body.us

for.body.us:
  %add.ptr.us.phi = phi i32* [ %add.ptr.us.inc, %for.cond7.for.end_crit_edge.us ], [ %in, %for.body.us.preheader ]
  %i.0143.us = phi i32 [ %add51.us, %for.cond7.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
  %optr.0142.us = phi i64* [ %incdec.ptr49.us, %for.cond7.for.end_crit_edge.us ], [ %0, %for.body.us.preheader ]
  %2 = bitcast i32* %add.ptr.us.phi to i64*
  %incdec.ptr.us = getelementptr inbounds i32, i32* %add.ptr.us.phi, i32 2
  %3 = bitcast i32* %incdec.ptr.us to i64*
  %4 = load i64, i64* %2, align 8
  %incdec.ptr1.us = getelementptr inbounds i32, i32* %add.ptr.us.phi, i32 4
  %5 = bitcast i32* %incdec.ptr1.us to i64*
  %6 = load i64, i64* %3, align 8
  %_Q6V64_internal_union.sroa.0.0.extract.trunc.us = trunc i64 %6 to i32
  %_Q6V64_internal_union2.sroa.3.0.extract.shift.us = lshr i64 %4, 32
  %_Q6V64_internal_union2.sroa.3.0.extract.trunc.us = trunc i64 %_Q6V64_internal_union2.sroa.3.0.extract.shift.us to i32
  %7 = tail call i64 @llvm.hexagon.A2.combinew(i32 %_Q6V64_internal_union.sroa.0.0.extract.trunc.us, i32 %_Q6V64_internal_union2.sroa.3.0.extract.trunc.us)
  %add.ptr.us.inc = getelementptr i32, i32* %add.ptr.us.phi, i32 4
  br label %for.body9.us

for.body9.us:
  %j.0137.us = phi i32 [ 0, %for.body.us ], [ %add.us, %for.body9.us ]
  %x0x1.0136.us = phi i64 [ %4, %for.body.us ], [ %10, %for.body9.us ]
  %x2x3.0135.us = phi i64 [ %6, %for.body.us ], [ %11, %for.body9.us ]
  %x1x2.0134.us = phi i64 [ %7, %for.body.us ], [ %13, %for.body9.us ]
  %iptrD.0133.us = phi i64* [ %5, %for.body.us ], [ %incdec.ptr13.us, %for.body9.us ]
  %iptrC.0132.us = phi i64* [ %1, %for.body.us ], [ %incdec.ptr11.us, %for.body9.us ]
  %sum0.0131.us = phi i64 [ 0, %for.body.us ], [ %18, %for.body9.us ]
  %sum1.0130.us = phi i64 [ 0, %for.body.us ], [ %19, %for.body9.us ]
  %sum2.0129.us = phi i64 [ 0, %for.body.us ], [ %20, %for.body9.us ]
  %sum3.0128.us = phi i64 [ 0, %for.body.us ], [ %21, %for.body9.us ]
  %incdec.ptr10.us = getelementptr inbounds i64, i64* %iptrC.0132.us, i32 1
  %8 = load i64, i64* %iptrC.0132.us, align 8
  %incdec.ptr11.us = getelementptr inbounds i64, i64* %iptrC.0132.us, i32 2
  %9 = load i64, i64* %incdec.ptr10.us, align 8
  %incdec.ptr12.us = getelementptr inbounds i64, i64* %iptrD.0133.us, i32 1
  %10 = load i64, i64* %iptrD.0133.us, align 8
  %incdec.ptr13.us = getelementptr inbounds i64, i64* %iptrD.0133.us, i32 2
  %11 = load i64, i64* %incdec.ptr12.us, align 8
  %_Q6V64_internal_union14.sroa.0.0.extract.trunc.us = trunc i64 %10 to i32
  %_Q6V64_internal_union14.sroa.4.0.extract.shift.us = lshr i64 %10, 32
  %_Q6V64_internal_union19.sroa.3.0.extract.shift.us = lshr i64 %x2x3.0135.us, 32
  %_Q6V64_internal_union19.sroa.3.0.extract.trunc.us = trunc i64 %_Q6V64_internal_union19.sroa.3.0.extract.shift.us to i32
  %12 = tail call i64 @llvm.hexagon.A2.combinew(i32 %_Q6V64_internal_union14.sroa.0.0.extract.trunc.us, i32 %_Q6V64_internal_union19.sroa.3.0.extract.trunc.us)
  %_Q6V64_internal_union24.sroa.0.0.extract.trunc.us = trunc i64 %11 to i32
  %_Q6V64_internal_union29.sroa.3.0.extract.trunc.us = trunc i64 %_Q6V64_internal_union14.sroa.4.0.extract.shift.us to i32
  %13 = tail call i64 @llvm.hexagon.A2.combinew(i32 %_Q6V64_internal_union24.sroa.0.0.extract.trunc.us, i32 %_Q6V64_internal_union29.sroa.3.0.extract.trunc.us)
  %14 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %sum0.0131.us, i64 %x0x1.0136.us, i64 %8)
  %15 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %sum1.0130.us, i64 %x1x2.0134.us, i64 %8)
  %16 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %sum2.0129.us, i64 %x2x3.0135.us, i64 %8)
  %17 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %sum3.0128.us, i64 %12, i64 %8)
  %18 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %14, i64 %x2x3.0135.us, i64 %9)
  %19 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %15, i64 %12, i64 %9)
  %20 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %16, i64 %10, i64 %9)
  %21 = tail call i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64 %17, i64 %13, i64 %9)
  %add.us = add nuw nsw i32 %j.0137.us, 4
  %cmp8.us = icmp slt i32 %add.us, %tap
  br i1 %cmp8.us, label %for.body9.us, label %for.cond7.for.end_crit_edge.us

for.cond7.for.end_crit_edge.us:
  %22 = ashr i64 %18, 39
  %23 = ashr i64 %19, 39
  %24 = ashr i64 %20, 39
  %25 = ashr i64 %21, 39
  %26 = tail call i32 @llvm.hexagon.A2.sat(i64 %22)
  %27 = tail call i32 @llvm.hexagon.A2.sat(i64 %23)
  %28 = tail call i32 @llvm.hexagon.A2.sat(i64 %24)
  %29 = tail call i32 @llvm.hexagon.A2.sat(i64 %25)
  %_Q6V64_internal_union34.sroa.4.0.insert.ext.us = zext i32 %27 to i64
  %_Q6V64_internal_union34.sroa.4.0.insert.shift.us = shl nuw i64 %_Q6V64_internal_union34.sroa.4.0.insert.ext.us, 32
  %_Q6V64_internal_union34.sroa.0.0.insert.ext.us = zext i32 %26 to i64
  %_Q6V64_internal_union34.sroa.0.0.insert.insert.us = or i64 %_Q6V64_internal_union34.sroa.4.0.insert.shift.us, %_Q6V64_internal_union34.sroa.0.0.insert.ext.us
  %incdec.ptr41.us = getelementptr inbounds i64, i64* %optr.0142.us, i32 1
  store i64 %_Q6V64_internal_union34.sroa.0.0.insert.insert.us, i64* %optr.0142.us, align 8
  %_Q6V64_internal_union42.sroa.4.0.insert.ext.us = zext i32 %29 to i64
  %_Q6V64_internal_union42.sroa.4.0.insert.shift.us = shl nuw i64 %_Q6V64_internal_union42.sroa.4.0.insert.ext.us, 32
  %_Q6V64_internal_union42.sroa.0.0.insert.ext.us = zext i32 %28 to i64
  %_Q6V64_internal_union42.sroa.0.0.insert.insert.us = or i64 %_Q6V64_internal_union42.sroa.4.0.insert.shift.us, %_Q6V64_internal_union42.sroa.0.0.insert.ext.us
  %incdec.ptr49.us = getelementptr inbounds i64, i64* %optr.0142.us, i32 2
  store i64 %_Q6V64_internal_union42.sroa.0.0.insert.insert.us, i64* %incdec.ptr41.us, align 8
  %add51.us = add nuw nsw i32 %i.0143.us, 4
  %cmp.us = icmp slt i32 %add51.us, %length
  br i1 %cmp.us, label %for.body.us, label %for.end52

for.body.lr.ph.split:
  %30 = tail call i32 @llvm.hexagon.A2.sat(i64 0)
  %_Q6V64_internal_union34.sroa.4.0.insert.ext = zext i32 %30 to i64
  %_Q6V64_internal_union34.sroa.4.0.insert.shift = shl nuw i64 %_Q6V64_internal_union34.sroa.4.0.insert.ext, 32
  %_Q6V64_internal_union34.sroa.0.0.insert.insert = or i64 %_Q6V64_internal_union34.sroa.4.0.insert.shift, %_Q6V64_internal_union34.sroa.4.0.insert.ext
  br label %for.body

for.body:
  %i.0143 = phi i32 [ 0, %for.body.lr.ph.split ], [ %add51, %for.body ]
  %optr.0142 = phi i64* [ %0, %for.body.lr.ph.split ], [ %incdec.ptr49, %for.body ]
  %incdec.ptr41 = getelementptr inbounds i64, i64* %optr.0142, i32 1
  store i64 %_Q6V64_internal_union34.sroa.0.0.insert.insert, i64* %optr.0142, align 8
  %incdec.ptr49 = getelementptr inbounds i64, i64* %optr.0142, i32 2
  store i64 %_Q6V64_internal_union34.sroa.0.0.insert.insert, i64* %incdec.ptr41, align 8
  %add51 = add nuw nsw i32 %i.0143, 4
  %cmp = icmp slt i32 %add51, %length
  br i1 %cmp, label %for.body, label %for.end52

for.end52:
  ret void
}

declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1
declare i64 @llvm.hexagon.M7.dcmpyrwc.acc(i64, i64, i64) #1
declare i32 @llvm.hexagon.A2.sat(i64) #1

attributes #0 = { nounwind "target-cpu"="hexagonv67t" "target-features"="+audio" }
attributes #1 = { nounwind readnone }