//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseMap.h"

using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;

/// Rewrite a parallel loop with bounds defined by an affine.min with a
/// constant into 2 loops after checking if the bounds are equal to that
/// constant. This is beneficial if the loop will almost always have the
/// constant bound and that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
  SmallVector<int64_t, 2> constantIndices;
  constantIndices.reserve(op.upperBound().size());
  for (auto bound : op.upperBound()) {
    auto minOp = bound.getDefiningOp<AffineMinOp>();
    if (!minOp)
      return;
    int64_t minConstant = std::numeric_limits<int64_t>::max();
    for (AffineExpr expr : minOp.map().getResults()) {
      if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
        minConstant = std::min(minConstant, constantIndex.getValue());
    }
    if (minConstant == std::numeric_limits<int64_t>::max())
      return;
    constantIndices.push_back(minConstant);
  }

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value cond;
  for (auto bound : llvm::zip(op.upperBound(), constantIndices)) {
    Value constant = b.create<ConstantIndexOp>(op.getLoc(), std::get<1>(bound));
    Value cmp = b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq,
                                 std::get<0>(bound), constant);
    cond = cond ? b.create<AndOp>(op.getLoc(), cond, cmp) : cmp;
    map.map(std::get<0>(bound), constant);
  }
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
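///
/// As a rough sketch (the affine map and the value names below are invented
/// for illustration; any map with at least one constant result qualifies),
/// IR of the form
///
///   %ub = affine.min affine_map<(d0) -> (d0, 4)>(%n)
///   scf.for %i = %lb to %ub step %c1 { ... }
///
/// is rewritten into a guard that dispatches to a copy of the loop whose
/// upper bound is the constant 4 (and is thus easy to fully unroll), with a
/// fallback copy that keeps the original dynamic bound:
///
///   %c4 = constant 4 : index
///   %cond = cmpi eq, %ub, %c4 : index
///   scf.if %cond {
///     scf.for %i = %lb to %c4 step %c1 { ... }
///   } else {
///     scf.for %i = %lb to %ub step %c1 { ... }
///   }
///
/// The parallel-loop specialization above is the N-D analogue of this
/// rewrite, with one equality comparison per upper bound, combined with
/// `and`.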
static void specializeForLoopForUnrolling(ForOp op) {
  auto bound = op.upperBound();
  auto minOp = bound.getDefiningOp<AffineMinOp>();
  if (!minOp)
    return;
  int64_t minConstant = std::numeric_limits<int64_t>::max();
  for (AffineExpr expr : minOp.map().getResults()) {
    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
      minConstant = std::min(minConstant, constantIndex.getValue());
  }
  if (minConstant == std::numeric_limits<int64_t>::max())
    return;

  OpBuilder b(op);
  BlockAndValueMapping map;
  Value constant = b.create<ConstantIndexOp>(op.getLoc(), minConstant);
  Value cond =
      b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq, bound, constant);
  map.map(bound, constant);
  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
  ifOp.getElseBodyBuilder().clone(*op.getOperation());
  op.erase();
}

/// Rewrite a for loop with bounds/step that potentially do not divide evenly
/// into a for loop where the step divides the iteration space evenly, followed
/// by an scf.if for the last (partial) iteration (if any).
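///
/// A minimal sketch, assuming a loop from %c0 to %ub with step %c4 and no
/// iter_args (all names invented for illustration):
///
///   scf.for %iv = %c0 to %ub step %c4 { ... }
///
/// becomes
///
///   %split = affine.apply affine_map<(d0, d1, d2) -> (d1 - (d1 - d0) mod d2)>
///       (%c0, %ub, %c4)
///   scf.for %iv = %c0 to %split step %c4 { ... }
///   %more = cmpi slt, %split, %ub : index
///   scf.if %more {
///     // Clone of the loop body, run once with %iv == %split; this is the
///     // final, partial iteration.
///   }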
LogicalResult mlir::scf::peelForLoop(RewriterBase &b, ForOp forOp,
                                     scf::IfOp &ifOp) {
  RewriterBase::InsertionGuard guard(b);
  auto lbInt = getConstantIntValue(forOp.lowerBound());
  auto ubInt = getConstantIntValue(forOp.upperBound());
  auto stepInt = getConstantIntValue(forOp.step());

  // No specialization necessary if the step already divides the iteration
  // space (%ub - %lb) evenly.
  if (lbInt && ubInt && stepInt && (*ubInt - *lbInt) % *stepInt == 0)
    return failure();
  // No specialization necessary if the step size is 1.
  if (stepInt == static_cast<int64_t>(1))
    return failure();

  auto loc = forOp.getLoc();
  AffineExpr dim0, dim1, dim2;
  bindDims(b.getContext(), dim0, dim1, dim2);
  // New upper bound: %ub - (%ub - %lb) mod %step
  auto modMap = AffineMap::get(3, 0, {dim1 - ((dim1 - dim0) % dim2)});
  b.setInsertionPoint(forOp);
  Value splitBound = b.createOrFold<AffineApplyOp>(
      loc, modMap,
      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});

  // Set new upper loop bound.
  Value previousUb = forOp.upperBound();
  b.updateRootInPlace(forOp,
                      [&]() { forOp.upperBoundMutable().assign(splitBound); });
  b.setInsertionPointAfter(forOp);

  // Do we need one more iteration?
  Value hasMoreIter =
      b.create<CmpIOp>(loc, CmpIPredicate::slt, splitBound, previousUb);

  // Create IfOp for the last iteration.
  auto resultTypes = llvm::to_vector<4>(
      llvm::map_range(forOp.initArgs(), [](Value v) { return v.getType(); }));
  ifOp = b.create<scf::IfOp>(loc, resultTypes, hasMoreIter,
                             /*withElseRegion=*/!resultTypes.empty());
  forOp.replaceAllUsesWith(ifOp->getResults());

  // Build the then case: the loop body runs once more, with the induction
  // variable set to the split bound and the iter_args replaced by the results
  // of the main loop.
  BlockAndValueMapping bvm;
  bvm.map(forOp.region().getArgument(0), splitBound);
  for (auto it : llvm::zip(forOp.region().getArguments().drop_front(),
                           forOp->getResults())) {
    bvm.map(std::get<0>(it), std::get<1>(it));
  }
  b.cloneRegionBefore(forOp.region(), ifOp.thenRegion(),
                      ifOp.thenRegion().begin(), bvm);
  // Build the else case: simply forward the main loop's results.
  if (!resultTypes.empty())
    ifOp.getElseBodyBuilder().create<scf::YieldOp>(loc, forOp->getResults());

  return success();
}

static constexpr char kPeeledLoopLabel[] = "__peeled_loop__";

namespace {
struct ForLoopPeelingPattern : public OpRewritePattern<ForOp> {
  using OpRewritePattern<ForOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ForOp forOp,
                                PatternRewriter &rewriter) const override {
    if (forOp->hasAttr(kPeeledLoopLabel))
      return failure();

    scf::IfOp ifOp;
    if (failed(peelForLoop(rewriter, forOp, ifOp)))
      return failure();
    // Apply the label so that the same loop is not rewritten a second time.
    rewriter.updateRootInPlace(forOp, [&]() {
      forOp->setAttr(kPeeledLoopLabel, rewriter.getUnitAttr());
    });

    return success();
  }
};
} // namespace

namespace {
struct ParallelLoopSpecialization
    : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk(
        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
  }
};

struct ForLoopSpecialization
    : public SCFForLoopSpecializationBase<ForLoopSpecialization> {
  void runOnFunction() override {
    getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
  }
};

struct ForLoopPeeling : public SCFForLoopPeelingBase<ForLoopPeeling> {
  void runOnFunction() override {
    FuncOp funcOp = getFunction();
    MLIRContext *ctx = funcOp.getContext();
    RewritePatternSet patterns(ctx);
    patterns.add<ForLoopPeelingPattern>(ctx);
    (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));

    // Drop the marker.
    funcOp.walk([](ForOp op) { op->removeAttr(kPeeledLoopLabel); });
  }
};
} // namespace

std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
  return std::make_unique<ParallelLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
  return std::make_unique<ForLoopSpecialization>();
}

std::unique_ptr<Pass> mlir::createForLoopPeelingPass() {
  return std::make_unique<ForLoopPeeling>();
}