//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/SCF/AffineCanonicalizationUtils.h"
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a
/// constant into 2 loops after checking if the bounds are equal to that
/// constant. This is beneficial if the loop will almost always have the
/// constant bound and that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.upperBound().size());
  for (auto bound : op.upperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.map().getResults()) {
      if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.upperBound(), constantIndices)) {
    Value constant =
        b.create<arith::ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                        std::get<0>(bound), constant);
    cond = cond ? b.create<arith::AndIOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
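///
/// Schematically (illustrative sketch only, not the literal output), a loop
///
///   scf.for %iv = %lb to %ub step %step { ... }
///
/// whose upper bound %ub is an affine.min containing the constant C becomes
///
///   %c  = arith.constant C : index
///   %eq = arith.cmpi eq, %ub, %c : index
///   scf.if %eq {
///     scf.for %iv = %lb to %c step %step { ... }   // constant-bound copy
///   } else {
///     scf.for %iv = %lb to %ub step %step { ... }  // unchanged copy
///   }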
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.upperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.map().getResults()) {
    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value constant = b.create<arith::ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond = b.create<arith::CmpIOp>(op.getLoc(), arith::CmpIPredicate::eq,
                                       bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by another scf.for for the last (partial) iteration (if any).
///
/// This function rewrites the given scf.for loop in-place and creates a new
/// scf.for operation for the last iteration. It replaces all uses of the
/// unpeeled loop with the results of the newly generated scf.for.
///
/// The newly generated scf.for operation is returned via `partialIteration`.
/// The boundary at which the loop is split (new upper bound) is returned via
/// `splitBound`. The return value indicates whether the loop was rewritten or
/// not.
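///
/// Schematically (illustrative sketch only, not the literal output), a loop
///
///   scf.for %iv = %lb to %ub step %step { ... }
///
/// is peeled into
///
///   %split = affine.apply affine_map<()[s0, s1, s2]
///                -> (s1 - (s1 - s0) mod s2)>()[%lb, %ub, %step]
///   scf.for %iv = %lb to %split step %step { ... }   // full iterations only
///   scf.for %iv = %split to %ub step %step { ... }   // partial iteration
///
/// Loop-carried values of the first loop are forwarded as the init values of
/// the second, and uses of the original loop's results are redirected to the
/// results of the second loop.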
static LogicalResult peelForLoop(RewriterBase &b, ForOp forOp,
                                 ForOp &partialIteration, Value &splitBound) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.lowerBound());
  auto ubInt = getConstantIntValue(forOp.upperBound());
  auto stepInt = getConstantIntValue(forOp.step());

  // No specialization necessary if step already divides upper bound evenly.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // No specialization necessary if step size is 1.
  if (stepInt == static_cast<int64_t>(1))
    return failure();

  auto loc = forOp.getLoc();
  AffineExpr sym0, sym1, sym2;
  bindSymbols(b.getContext(), sym0, sym1, sym2);
  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(0, 3, {sym1 - ((sym1 - sym0) % sym2)});
  b.setInsertionPoint(forOp);
  splitBound = b.createOrFold<AffineApplyOp>(
      loc, modMap,
      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});

  // Create ForOp for partial iteration.
  b.setInsertionPointAfter(forOp);
  partialIteration = cast<ForOp>(b.clone(*forOp.getOperation()));
  partialIteration.lowerBoundMutable().assign(splitBound);
  forOp.replaceAllUsesWith(partialIteration->getResults());
  partialIteration.initArgsMutable().assign(forOp->getResults());

  // Set new upper loop bound.
  b.updateRootInPlace(forOp,
                      [&]() { forOp.upperBoundMutable().assign(splitBound); });

  return success();
}

/// Try to simplify affine.min/affine.max ops (`OpTy`) in the peeled loop and
/// in its partial iteration. After peeling, min/max expressions involving the
/// induction variable and the original upper bound (`previousUb`) can often be
/// folded: every iteration of the main loop now spans a full `step`, and the
/// partial iteration covers whatever remains. The rewrite itself is delegated
/// to scf::rewritePeeledMinMaxOp.
template <typename OpTy, bool IsMin>
static void rewriteAffineOpAfterPeeling(RewriterBase &rewriter, ForOp forOp,
                                        ForOp partialIteration,
                                        Value previousUb) {
  Value mainIv = forOp.getInductionVar();
  Value partialIv = partialIteration.getInductionVar();
  assert(forOp.step() == partialIteration.step() &&
         "expected same step in main and partial loop");
  Value step = forOp.step();

  forOp.walk([&](OpTy affineOp) {
    AffineMap map = affineOp.getAffineMap();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
                                     affineOp.operands(), IsMin, mainIv,
                                     previousUb, step,
                                     /*insideLoop=*/true);
  });
  partialIteration.walk([&](OpTy affineOp) {
    AffineMap map = affineOp.getAffineMap();
    (void)scf::rewritePeeledMinMaxOp(rewriter, affineOp, map,
                                     affineOp.operands(), IsMin, partialIv,
                                     previousUb, step, /*insideLoop=*/false);
  });
}

LogicalResult mlir::scf::peelAndCanonicalizeForLoop(RewriterBase &rewriter,
                                                    ForOp forOp,
                                                    ForOp &partialIteration) {
  Value previousUb = forOp.upperBound();
  Value splitBound;
  if (failed(peelForLoop(rewriter, forOp, partialIteration, splitBound)))
    return failure();

  // Rewrite affine.min and affine.max ops.
  rewriteAffineOpAfterPeeling<AffineMinOp, /*IsMin=*/true>(
      rewriter, forOp, partialIteration, previousUb);
  rewriteAffineOpAfterPeeling<AffineMaxOp, /*IsMin=*/false>(
      rewriter, forOp, partialIteration, previousUb);

  return success();
}

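/// Markers attached by ForLoopPeelingPattern to loops it has processed:
/// `kPeeledLoopLabel` prevents a loop from being peeled a second time by the
/// greedy driver, and `kPartialIterationLabel` identifies the partial
/// iteration so that, with `skipPartial` set, loops nested inside it are left
/// alone. Both attributes are removed again at the end of the ForLoopPeeling
/// pass.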
static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";
static constexpr char kPartialIterationLabel[] = "__partial_iteration__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  ForLoopPeelingPattern(MLIRContext *ctx, bool skipPartial)
      : OpRewritePattern<ForOp>(ctx), skipPartial(skipPartial) {}

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    // Do not peel already peeled loops.
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();
    if (skipPartial) {
      // No peeling of loops inside the partial iteration of another peeled
      // loop.
      Operation *op = forOp.getOperation();
      while ((op = op->getParentOfType<scf::ForOp>())) {
        if (op->hasAttr(kPartialIterationLabel))
          return failure();
      }
    }
    // Apply loop peeling.
    scf::ForOp partialIteration;
    if (failed(peelAndCanonicalizeForLoop(rewriter, forOp, partialIteration)))
      return failure();
    // Apply label, so that the same loop is not rewritten a second time.
    partialIteration->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });
    partialIteration->setAttr(kPartialIterationLabel, rewriter.getUnitAttr());
    return success();
  }

  /// If set to true, loops inside partial iterations of another peeled loop
  /// are not peeled. This reduces the size of the generated code. Partial
  /// iterations are not usually performance critical.
  /// Note: Takes into account the entire chain of parent operations, not just
  /// the direct parent.
  bool skipPartial;
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnFunction() override {
    FuncOp funcOp = getFunction();
    MLIRContext *ctx = funcOp.getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx, skipPartial);
    (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));

    // Drop the markers.
    funcOp.walk([](Operation *op) {
      op->removeAttr(kPeeledLoopLabel);
      op->removeAttr(kPartialIterationLabel);
    });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}