//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  // Collect the smallest constant result of each upper bound's affine.min.
  // Bail out if any upper bound is not an affine.min or has no constant
  // result.
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.upperBound().size());
  for (auto bound : op.upperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.map().getResults()) {
      if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  // Build a condition that is true iff every upper bound equals its constant.
  OpBuilder b(op);
  BlockAndValueMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.upperBound(), constantIndices)) {
    Value constant = b.create<ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq,
                                 std::get<0>(bound), constant);
    cond = cond ? b.create<AndOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  // Then branch: clone the loop with constant bounds (via the mapping);
  // else branch: clone the loop unchanged.
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
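///
/// Illustrative sketch only (the names %n, %lb, %c1 and the constant 64 are
/// hypothetical, and the exact assembly may differ between MLIR versions):
/// given a loop whose upper bound is an affine.min with constant result 64,
///
///   %ub = affine.min affine_map<(d0) -> (d0, 64)>(%n)
///   scf.for %i = %lb to %ub step %c1 { ... }
///
/// the rewrite produces
///
///   %c64 = constant 64 : index
///   %eq = cmpi eq, %ub, %c64 : index
///   scf.if %eq {
///     scf.for %i = %lb to %c64 step %c1 { ... } // constant trip count
///   } else {
///     scf.for %i = %lb to %ub step %c1 { ... }  // dynamic bound
///   }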
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.upperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  // Find the smallest constant result of the affine.min; bail out if there
  // is none.
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.map().getResults()) {
    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value constant = b.create<ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond =
      b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq, bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by an scf.if for the last (partial) iteration (if any).
LogicalResult mlir::scf::peelForLoop(RewriterBase &b, ForOp forOp,
                                     scf::IfOp &ifOp) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.lowerBound());
  auto ubInt = getConstantIntValue(forOp.upperBound());
  auto stepInt = getConstantIntValue(forOp.step());

  // No specialization necessary if the step already divides the iteration
  // space evenly.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // No specialization necessary if step size is 1.
  if (stepInt == static_cast<int64_t>(1))
    return failure();

  auto loc = forOp.getLoc();
  AffineExpr dim0, dim1, dim2;
  bindDims(b.getContext(), dim0, dim1, dim2);
  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(3, 0, {dim1 - ((dim1 - dim0) % dim2)});
  b.setInsertionPoint(forOp);
  Value splitBound = b.createOrFold<AffineApplyOp>(
      loc, modMap,
      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});

  // Set new upper loop bound.
  Value previousUb = forOp.upperBound();
  b.updateRootInPlace(forOp,
                      [&]() { forOp.upperBoundMutable().assign(splitBound); });
  b.setInsertionPointAfter(forOp);

  // Do we need one more (partial) iteration?
  Value hasMoreIter =
      b.create<CmpIOp>(loc, CmpIPredicate::slt, splitBound, previousUb);

  // Create IfOp for the last iteration.
  auto resultTypes = forOp.getResultTypes();
  ifOp = b.create<scf::IfOp>(loc, resultTypes, hasMoreIter,
                             /*withElseRegion=*/!resultTypes.empty());
  forOp.replaceAllUsesWith(ifOp->getResults());

  // Build then case: clone the loop body with the induction variable mapped
  // to the split bound and the iter_args mapped to the loop's results.
  BlockAndValueMapping bvm;
  bvm.map(forOp.region().getArgument(0), splitBound);
  for (auto it : llvm::zip(forOp.getRegionIterArgs(), forOp->getResults())) {
    bvm.map(std::get<0>(it), std::get<1>(it));
  }
  b.cloneRegionBefore(forOp.region(), ifOp.thenRegion(),
                      ifOp.thenRegion().begin(), bvm);
  // Build else case.
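  // If the loop has results, the else branch simply forwards them: when no
  // partial iteration is needed, the values computed by the (shortened) loop
  // are already the final results.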
  if (!resultTypes.empty())
    ifOp.getElseBodyBuilder(b.getListener())
        .create<scf::YieldOp>(loc, forOp->getResults());

  return success();
}

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  using OpRewritePattern<ForOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();

    scf::IfOp ifOp;
    if (failed(peelForLoop(rewriter, forOp, ifOp)))
      return failure();
    // Apply label, so that the same loop is not rewritten a second time.
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });

    return success();
  }
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnFunction() override {
    FuncOp funcOp = getFunction();
    MLIRContext *ctx = funcOp.getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx);
    (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));

    // Drop the marker.
    funcOp.walk([](ForOp op) { op->removeAttr(kPeeledLoopLabel); });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}