1 //===- LoopUnrollAndJam.cpp - Code to perform loop unroll and jam ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop unroll and jam. Unroll and jam is a transformation 10 // that improves locality, in particular, register reuse, while also improving 11 // operation level parallelism. The example below shows what it does in nearly 12 // the general case. Loop unroll and jam currently works if the bounds of the 13 // loops inner to the loop being unroll-jammed do not depend on the latter. 14 // 15 // Before After unroll and jam of i by factor 2: 16 // 17 // for i, step = 2 18 // for i S1(i); 19 // S1; S2(i); 20 // S2; S1(i+1); 21 // for j S2(i+1); 22 // S3; for j 23 // S4; S3(i, j); 24 // S5; S4(i, j); 25 // S6; S3(i+1, j) 26 // S4(i+1, j) 27 // S5(i); 28 // S6(i); 29 // S5(i+1); 30 // S6(i+1); 31 // 32 // Note: 'if/else' blocks are not jammed. So, if there are loops inside if 33 // op's, bodies of those loops will not be jammed. 34 //===----------------------------------------------------------------------===// 35 #include "mlir/Analysis/LoopAnalysis.h" 36 #include "mlir/Dialect/Affine/IR/AffineOps.h" 37 #include "mlir/Dialect/Affine/Passes.h" 38 #include "mlir/IR/AffineExpr.h" 39 #include "mlir/IR/AffineMap.h" 40 #include "mlir/IR/BlockAndValueMapping.h" 41 #include "mlir/IR/Builders.h" 42 #include "mlir/Pass/Pass.h" 43 #include "mlir/Transforms/LoopUtils.h" 44 #include "llvm/ADT/DenseMap.h" 45 #include "llvm/Support/CommandLine.h" 46 47 using namespace mlir; 48 49 #define DEBUG_TYPE "affine-loop-unroll-jam" 50 51 static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); 52 53 // Loop unroll and jam factor. 54 static llvm::cl::opt<unsigned> 55 clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden, 56 llvm::cl::desc("Use this unroll jam factor for all loops" 57 " (default 4)"), 58 llvm::cl::cat(clOptionsCategory)); 59 60 namespace { 61 /// Loop unroll jam pass. Currently, this just unroll jams the first 62 /// outer loop in a Function. 63 struct LoopUnrollAndJam : public FunctionPass<LoopUnrollAndJam> { 64 Optional<unsigned> unrollJamFactor; 65 static const unsigned kDefaultUnrollJamFactor = 4; 66 67 explicit LoopUnrollAndJam(Optional<unsigned> unrollJamFactor = None) 68 : unrollJamFactor(unrollJamFactor) {} 69 70 void runOnFunction() override; 71 LogicalResult runOnAffineForOp(AffineForOp forOp); 72 }; 73 } // end anonymous namespace 74 75 std::unique_ptr<OpPassBase<FuncOp>> 76 mlir::createLoopUnrollAndJamPass(int unrollJamFactor) { 77 return std::make_unique<LoopUnrollAndJam>( 78 unrollJamFactor == -1 ? None : Optional<unsigned>(unrollJamFactor)); 79 } 80 81 void LoopUnrollAndJam::runOnFunction() { 82 // Currently, just the outermost loop from the first loop nest is 83 // unroll-and-jammed by this pass. However, runOnAffineForOp can be called on 84 // any for operation. 85 auto &entryBlock = getFunction().front(); 86 if (auto forOp = dyn_cast<AffineForOp>(entryBlock.front())) 87 runOnAffineForOp(forOp); 88 } 89 90 /// Unroll and jam a 'affine.for' op. Default unroll jam factor is 91 /// kDefaultUnrollJamFactor. Return failure if nothing was done. 92 LogicalResult LoopUnrollAndJam::runOnAffineForOp(AffineForOp forOp) { 93 // Unroll and jam by the factor that was passed if any. 94 if (unrollJamFactor.hasValue()) 95 return loopUnrollJamByFactor(forOp, unrollJamFactor.getValue()); 96 // Otherwise, unroll jam by the command-line factor if one was specified. 97 if (clUnrollJamFactor.getNumOccurrences() > 0) 98 return loopUnrollJamByFactor(forOp, clUnrollJamFactor); 99 100 // Unroll and jam by four otherwise. 101 return loopUnrollJamByFactor(forOp, kDefaultUnrollJamFactor); 102 } 103 104 LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp, 105 uint64_t unrollJamFactor) { 106 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); 107 108 if (mayBeConstantTripCount.hasValue() && 109 mayBeConstantTripCount.getValue() < unrollJamFactor) 110 return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.getValue()); 111 return loopUnrollJamByFactor(forOp, unrollJamFactor); 112 } 113 114 /// Unrolls and jams this loop by the specified factor. 115 LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp, 116 uint64_t unrollJamFactor) { 117 // Gathers all maximal sub-blocks of operations that do not themselves 118 // include a for op (a operation could have a descendant for op though 119 // in its tree). Ignore the block terminators. 120 struct JamBlockGatherer { 121 // Store iterators to the first and last op of each sub-block found. 122 std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks; 123 124 // This is a linear time walk. 125 void walk(Operation *op) { 126 for (auto ®ion : op->getRegions()) 127 for (auto &block : region) 128 walk(block); 129 } 130 void walk(Block &block) { 131 for (auto it = block.begin(), e = std::prev(block.end()); it != e;) { 132 auto subBlockStart = it; 133 while (it != e && !isa<AffineForOp>(&*it)) 134 ++it; 135 if (it != subBlockStart) 136 subBlocks.push_back({subBlockStart, std::prev(it)}); 137 // Process all for insts that appear next. 138 while (it != e && isa<AffineForOp>(&*it)) 139 walk(&*it++); 140 } 141 } 142 }; 143 144 assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1"); 145 146 if (unrollJamFactor == 1) 147 return promoteIfSingleIteration(forOp); 148 149 if (forOp.getBody()->empty() || 150 forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) 151 return failure(); 152 153 // Loops where both lower and upper bounds are multi-result maps won't be 154 // unrolled (since the trip can't be expressed as an affine function in 155 // general). 156 // TODO(mlir-team): this may not be common, but we could support the case 157 // where the lower bound is a multi-result map and the ub is a single result 158 // one. 159 if (forOp.getLowerBoundMap().getNumResults() != 1) 160 return failure(); 161 162 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); 163 // If the trip count is lower than the unroll jam factor, no unroll jam. 164 if (mayBeConstantTripCount.hasValue() && 165 mayBeConstantTripCount.getValue() < unrollJamFactor) 166 return failure(); 167 168 auto *forInst = forOp.getOperation(); 169 170 // Gather all sub-blocks to jam upon the loop being unrolled. 171 JamBlockGatherer jbg; 172 jbg.walk(forInst); 173 auto &subBlocks = jbg.subBlocks; 174 175 // Generate the cleanup loop if trip count isn't a multiple of 176 // unrollJamFactor. 177 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) { 178 // Insert the cleanup loop right after 'forOp'. 179 OpBuilder builder(forInst->getBlock(), std::next(Block::iterator(forInst))); 180 auto cleanupAffineForOp = cast<AffineForOp>(builder.clone(*forInst)); 181 // Adjust the lower bound of the cleanup loop; its upper bound is the same 182 // as the original loop's upper bound. 183 AffineMap cleanupMap; 184 SmallVector<Value, 4> cleanupOperands; 185 getCleanupLoopLowerBound(forOp, unrollJamFactor, &cleanupMap, 186 &cleanupOperands, builder); 187 cleanupAffineForOp.setLowerBound(cleanupOperands, cleanupMap); 188 189 // Promote the cleanup loop if it has turned into a single iteration loop. 190 promoteIfSingleIteration(cleanupAffineForOp); 191 192 // Adjust the upper bound of the original loop - it will be the same as the 193 // cleanup loop's lower bound. Its lower bound remains unchanged. 194 forOp.setUpperBound(cleanupOperands, cleanupMap); 195 } 196 197 // Scale the step of loop being unroll-jammed by the unroll-jam factor. 198 int64_t step = forOp.getStep(); 199 forOp.setStep(step * unrollJamFactor); 200 201 auto forOpIV = forOp.getInductionVar(); 202 // Unroll and jam (appends unrollJamFactor - 1 additional copies). 203 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { 204 // Operand map persists across all sub-blocks. 205 BlockAndValueMapping operandMapping; 206 for (auto &subBlock : subBlocks) { 207 // Builder to insert unroll-jammed bodies. Insert right at the end of 208 // sub-block. 209 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second)); 210 211 // If the induction variable is used, create a remapping to the value for 212 // this unrolled instance. 213 if (!forOpIV.use_empty()) { 214 // iv' = iv + i, i = 1 to unrollJamFactor-1. 215 auto d0 = builder.getAffineDimExpr(0); 216 auto bumpMap = AffineMap::get(1, 0, {d0 + i * step}); 217 auto ivUnroll = 218 builder.create<AffineApplyOp>(forInst->getLoc(), bumpMap, forOpIV); 219 operandMapping.map(forOpIV, ivUnroll); 220 } 221 // Clone the sub-block being unroll-jammed. 222 for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) { 223 builder.clone(*it, operandMapping); 224 } 225 } 226 } 227 228 // Promote the loop body up if this has turned into a single iteration loop. 229 promoteIfSingleIteration(forOp); 230 return success(); 231 } 232 233 static PassRegistration<LoopUnrollAndJam> pass("affine-loop-unroll-jam", 234 "Unroll and jam loops"); 235