1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop software pipelining 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "PassDetail.h" 14 #include "mlir/Dialect/SCF/SCF.h" 15 #include "mlir/Dialect/SCF/Transforms.h" 16 #include "mlir/Dialect/SCF/Utils.h" 17 #include "mlir/Dialect/StandardOps/IR/Ops.h" 18 #include "mlir/IR/BlockAndValueMapping.h" 19 #include "mlir/IR/PatternMatch.h" 20 #include "mlir/Support/MathExtras.h" 21 22 using namespace mlir; 23 using namespace mlir::scf; 24 25 namespace { 26 27 /// Helper to keep internal information during pipelining transformation. 28 struct LoopPipelinerInternal { 29 /// Coarse liverange information for ops used across stages. 30 struct LiverangeInfo { 31 unsigned lastUseStage = 0; 32 unsigned defStage = 0; 33 }; 34 35 protected: 36 ForOp forOp; 37 unsigned maxStage = 0; 38 DenseMap<Operation *, unsigned> stages; 39 std::vector<Operation *> opOrder; 40 int64_t ub; 41 int64_t lb; 42 int64_t step; 43 44 // When peeling the kernel we generate several version of each value for 45 // different stage of the prologue. This map tracks the mapping between 46 // original Values in the loop and the different versions 47 // peeled from the loop. 48 DenseMap<Value, llvm::SmallVector<Value>> valueMapping; 49 50 /// Assign a value to `valueMapping`, this means `val` represents the version 51 /// `idx` of `key` in the epilogue. 52 void setValueMapping(Value key, Value el, int64_t idx); 53 54 public: 55 /// Initalize the information for the given `op`, return true if it 56 /// satisfies the pre-condition to apply pipelining. 57 bool initializeLoopInfo(ForOp op, const PipeliningOption &options); 58 /// Emits the prologue, this creates `maxStage - 1` part which will contain 59 /// operations from stages [0; i], where i is the part index. 60 void emitPrologue(PatternRewriter &rewriter); 61 /// Gather liverange information for Values that are used in a different stage 62 /// than its definition. 63 llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues(); 64 scf::ForOp createKernelLoop( 65 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 66 PatternRewriter &rewriter, 67 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap); 68 /// Emits the pipelined kernel. This clones loop operations following user 69 /// order and remaps operands defined in a different stage as their use. 70 void createKernel( 71 scf::ForOp newForOp, 72 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 73 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 74 PatternRewriter &rewriter); 75 /// Emits the epilogue, this creates `maxStage - 1` part which will contain 76 /// operations from stages [i; maxStage], where i is the part index. 77 llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter); 78 }; 79 80 bool LoopPipelinerInternal::initializeLoopInfo( 81 ForOp op, const PipeliningOption &options) { 82 forOp = op; 83 auto upperBoundCst = forOp.upperBound().getDefiningOp<ConstantIndexOp>(); 84 auto lowerBoundCst = forOp.lowerBound().getDefiningOp<ConstantIndexOp>(); 85 auto stepCst = forOp.step().getDefiningOp<ConstantIndexOp>(); 86 if (!upperBoundCst || !lowerBoundCst || !stepCst) 87 return false; 88 ub = upperBoundCst.getValue(); 89 lb = lowerBoundCst.getValue(); 90 step = stepCst.getValue(); 91 int64_t numIteration = ceilDiv(ub - lb, step); 92 std::vector<std::pair<Operation *, unsigned>> schedule; 93 options.getScheduleFn(forOp, schedule); 94 if (schedule.empty()) 95 return false; 96 97 opOrder.reserve(schedule.size()); 98 for (auto &opSchedule : schedule) { 99 maxStage = std::max(maxStage, opSchedule.second); 100 stages[opSchedule.first] = opSchedule.second; 101 opOrder.push_back(opSchedule.first); 102 } 103 if (numIteration <= maxStage) 104 return false; 105 106 // All operations need to have a stage. 107 if (forOp 108 .walk([this](Operation *op) { 109 if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) && 110 stages.find(op) == stages.end()) 111 return WalkResult::interrupt(); 112 return WalkResult::advance(); 113 }) 114 .wasInterrupted()) 115 return false; 116 117 // Only support loop carried dependency with a distance of 1. This means the 118 // source of all the scf.yield operands needs to be defined by operations in 119 // the loop. 120 if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), 121 [this](Value operand) { 122 Operation *def = operand.getDefiningOp(); 123 return !def || stages.find(def) == stages.end(); 124 })) 125 return false; 126 return true; 127 } 128 129 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) { 130 // Initialize the iteration argument to the loop initiale values. 131 for (BlockArgument &arg : forOp.getRegionIterArgs()) { 132 OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); 133 setValueMapping(arg, operand.get(), 0); 134 } 135 auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator()); 136 for (int64_t i = 0; i < maxStage; i++) { 137 // special handling for induction variable as the increment is implicit. 138 Value iv = rewriter.create<ConstantIndexOp>(forOp.getLoc(), lb + i); 139 setValueMapping(forOp.getInductionVar(), iv, i); 140 for (Operation *op : opOrder) { 141 if (stages[op] > i) 142 continue; 143 Operation *newOp = rewriter.clone(*op); 144 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 145 auto it = valueMapping.find(op->getOperand(opIdx)); 146 if (it != valueMapping.end()) 147 newOp->setOperand(opIdx, it->second[i - stages[op]]); 148 } 149 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 150 setValueMapping(op->getResult(destId), newOp->getResult(destId), 151 i - stages[op]); 152 // If the value is a loop carried dependency update the loop argument 153 // mapping. 154 for (OpOperand &operand : yield->getOpOperands()) { 155 if (operand.get() != op->getResult(destId)) 156 continue; 157 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 158 newOp->getResult(destId), i - stages[op] + 1); 159 } 160 } 161 } 162 } 163 } 164 165 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 166 LoopPipelinerInternal::analyzeCrossStageValues() { 167 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues; 168 for (Operation *op : opOrder) { 169 unsigned stage = stages[op]; 170 for (OpOperand &operand : op->getOpOperands()) { 171 Operation *def = operand.get().getDefiningOp(); 172 if (!def) 173 continue; 174 auto defStage = stages.find(def); 175 if (defStage == stages.end() || defStage->second == stage) 176 continue; 177 assert(stage > defStage->second); 178 LiverangeInfo &info = crossStageValues[operand.get()]; 179 info.defStage = defStage->second; 180 info.lastUseStage = std::max(info.lastUseStage, stage); 181 } 182 } 183 return crossStageValues; 184 } 185 186 scf::ForOp LoopPipelinerInternal::createKernelLoop( 187 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 188 &crossStageValues, 189 PatternRewriter &rewriter, 190 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) { 191 // Creates the list of initial values associated to values used across 192 // stages. The initial values come from the prologue created above. 193 // Keep track of the kernel argument associated to each version of the 194 // values passed to the kernel. 195 llvm::SmallVector<Value> newLoopArg; 196 // For existing loop argument initialize them with the right version from the 197 // prologue. 198 for (auto retVal : 199 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 200 Operation *def = retVal.value().getDefiningOp(); 201 assert(def && "Only support loop carried dependencies of distance 1"); 202 unsigned defStage = stages[def]; 203 Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]] 204 [maxStage - defStage]; 205 assert(valueVersion); 206 newLoopArg.push_back(valueVersion); 207 } 208 for (auto escape : crossStageValues) { 209 LiverangeInfo &info = escape.second; 210 Value value = escape.first; 211 for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage; 212 stageIdx++) { 213 Value valueVersion = 214 valueMapping[value][maxStage - info.lastUseStage + stageIdx]; 215 assert(valueVersion); 216 newLoopArg.push_back(valueVersion); 217 loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage - 218 stageIdx)] = newLoopArg.size() - 1; 219 } 220 } 221 222 // Create the new kernel loop. Since we need to peel `numStages - 1` 223 // iteration we change the upper bound to remove those iterations. 224 Value newUb = 225 rewriter.create<ConstantIndexOp>(forOp.getLoc(), ub - maxStage * step); 226 auto newForOp = rewriter.create<scf::ForOp>( 227 forOp.getLoc(), forOp.lowerBound(), newUb, forOp.step(), newLoopArg); 228 return newForOp; 229 } 230 231 void LoopPipelinerInternal::createKernel( 232 scf::ForOp newForOp, 233 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 234 &crossStageValues, 235 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 236 PatternRewriter &rewriter) { 237 valueMapping.clear(); 238 239 // Create the kernel, we clone instruction based on the order given by 240 // user and remap operands coming from a previous stages. 241 rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); 242 BlockAndValueMapping mapping; 243 mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); 244 for (auto arg : llvm::enumerate(forOp.getRegionIterArgs())) { 245 mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); 246 } 247 for (Operation *op : opOrder) { 248 int64_t useStage = stages[op]; 249 auto *newOp = rewriter.clone(*op, mapping); 250 for (OpOperand &operand : op->getOpOperands()) { 251 // Special case for the induction variable uses. We replace it with a 252 // version incremented based on the stage where it is used. 253 if (operand.get() == forOp.getInductionVar()) { 254 rewriter.setInsertionPoint(newOp); 255 Value offset = rewriter.create<ConstantIndexOp>( 256 forOp.getLoc(), (maxStage - stages[op]) * step); 257 Value iv = rewriter.create<AddIOp>(forOp.getLoc(), 258 newForOp.getInductionVar(), offset); 259 newOp->setOperand(operand.getOperandNumber(), iv); 260 rewriter.setInsertionPointAfter(newOp); 261 continue; 262 } 263 auto arg = operand.get().dyn_cast<BlockArgument>(); 264 if (arg && arg.getOwner() == forOp.getBody()) { 265 // If the value is a loop carried value coming from stage N + 1 remap, 266 // it will become a direct use. 267 Value ret = forOp.getBody()->getTerminator()->getOperand( 268 arg.getArgNumber() - 1); 269 Operation *dep = ret.getDefiningOp(); 270 if (!dep) 271 continue; 272 auto stageDep = stages.find(dep); 273 if (stageDep == stages.end() || stageDep->second == useStage) 274 continue; 275 assert(stageDep->second == useStage + 1); 276 newOp->setOperand(operand.getOperandNumber(), 277 mapping.lookupOrDefault(ret)); 278 continue; 279 } 280 // For operands defined in a previous stage we need to remap it to use 281 // the correct region argument. We look for the right version of the 282 // Value based on the stage where it is used. 283 Operation *def = operand.get().getDefiningOp(); 284 if (!def) 285 continue; 286 auto stageDef = stages.find(def); 287 if (stageDef == stages.end() || stageDef->second == useStage) 288 continue; 289 auto remap = loopArgMap.find( 290 std::make_pair(operand.get(), useStage - stageDef->second)); 291 assert(remap != loopArgMap.end()); 292 newOp->setOperand(operand.getOperandNumber(), 293 newForOp.getRegionIterArgs()[remap->second]); 294 } 295 } 296 297 // Collect the Values that need to be returned by the forOp. For each 298 // value we need to have `LastUseStage - DefStage` number of versions 299 // returned. 300 // We create a mapping between original values and the associated loop 301 // returned values that will be needed by the epilogue. 302 llvm::SmallVector<Value> yieldOperands; 303 for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) { 304 yieldOperands.push_back(mapping.lookupOrDefault(retVal)); 305 } 306 for (auto &it : crossStageValues) { 307 int64_t version = maxStage - it.second.lastUseStage + 1; 308 unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage; 309 // add the original verstion to yield ops. 310 // If there is a liverange spanning across more than 2 stages we need to add 311 // extra arg. 312 for (unsigned i = 1; i < numVersionReturned; i++) { 313 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 314 version++); 315 yieldOperands.push_back( 316 newForOp.getBody()->getArguments()[yieldOperands.size() + 1 + 317 newForOp.getNumInductionVars()]); 318 } 319 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 320 version++); 321 yieldOperands.push_back(mapping.lookupOrDefault(it.first)); 322 } 323 // Map the yield operand to the forOp returned value. 324 for (auto retVal : 325 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 326 Operation *def = retVal.value().getDefiningOp(); 327 assert(def && "Only support loop carried dependencies of distance 1"); 328 unsigned defStage = stages[def]; 329 setValueMapping(forOp.getRegionIterArgs()[retVal.index()], 330 newForOp->getResult(retVal.index()), 331 maxStage - defStage + 1); 332 } 333 rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands); 334 } 335 336 llvm::SmallVector<Value> 337 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) { 338 llvm::SmallVector<Value> returnValues(forOp->getNumResults()); 339 // Emit different versions of the induction variable. They will be 340 // removed by dead code if not used. 341 for (int64_t i = 0; i < maxStage; i++) { 342 Value newlastIter = rewriter.create<ConstantIndexOp>( 343 forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i)); 344 setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); 345 } 346 // Emit `maxStage - 1` epilogue part that includes operations fro stages 347 // [i; maxStage]. 348 for (int64_t i = 1; i <= maxStage; i++) { 349 for (Operation *op : opOrder) { 350 if (stages[op] < i) 351 continue; 352 Operation *newOp = rewriter.clone(*op); 353 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 354 auto it = valueMapping.find(op->getOperand(opIdx)); 355 if (it != valueMapping.end()) { 356 Value v = it->second[maxStage - stages[op] + i]; 357 assert(v); 358 newOp->setOperand(opIdx, v); 359 } 360 } 361 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 362 setValueMapping(op->getResult(destId), newOp->getResult(destId), 363 maxStage - stages[op] + i); 364 // If the value is a loop carried dependency update the loop argument 365 // mapping and keep track of the last version to replace the original 366 // forOp uses. 367 for (OpOperand &operand : 368 forOp.getBody()->getTerminator()->getOpOperands()) { 369 if (operand.get() != op->getResult(destId)) 370 continue; 371 unsigned version = maxStage - stages[op] + i + 1; 372 // If the version is greater than maxStage it means it maps to the 373 // original forOp returned value. 374 if (version > maxStage) { 375 returnValues[operand.getOperandNumber()] = newOp->getResult(destId); 376 continue; 377 } 378 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 379 newOp->getResult(destId), version); 380 } 381 } 382 } 383 } 384 return returnValues; 385 } 386 387 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) { 388 auto it = valueMapping.find(key); 389 // If the value is not in the map yet add a vector big enough to store all 390 // versions. 391 if (it == valueMapping.end()) 392 it = 393 valueMapping 394 .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1))) 395 .first; 396 it->second[idx] = el; 397 } 398 399 /// Generate a pipelined version of the scf.for loop based on the schedule given 400 /// as option. This applies the mechanical transformation of changing the loop 401 /// and generating the prologue/epilogue for the pipelining and doesn't make any 402 /// decision regarding the schedule. 403 /// Based on the option the loop is split into several stages. 404 /// The transformation assumes that the scheduling given by user is valid. 405 /// For example if we break a loop into 3 stages named S0, S1, S2 we would 406 /// generate the following code with the number in parenthesis the iteration 407 /// index: 408 /// S0(0) // Prologue 409 /// S0(1) S1(0) // Prologue 410 /// scf.for %I = %C0 to %N - 2 { 411 /// S0(I+2) S1(I+1) S2(I) // Pipelined kernel 412 /// } 413 /// S1(N) S2(N-1) // Epilogue 414 /// S2(N) // Epilogue 415 struct ForLoopPipelining : public OpRewritePattern<ForOp> { 416 ForLoopPipelining(const PipeliningOption &options, MLIRContext *context) 417 : OpRewritePattern<ForOp>(context), options(options) {} 418 LogicalResult matchAndRewrite(ForOp forOp, 419 PatternRewriter &rewriter) const override { 420 421 LoopPipelinerInternal pipeliner; 422 if (!pipeliner.initializeLoopInfo(forOp, options)) 423 return failure(); 424 425 // 1. Emit prologue. 426 pipeliner.emitPrologue(rewriter); 427 428 // 2. Track values used across stages. When a value cross stages it will 429 // need to be passed as loop iteration arguments. 430 // We first collect the values that are used in a different stage than where 431 // they are defined. 432 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 433 crossStageValues = pipeliner.analyzeCrossStageValues(); 434 435 // Mapping between original loop values used cross stage and the block 436 // arguments associated after pipelining. A Value may map to several 437 // arguments if its liverange spans across more than 2 stages. 438 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap; 439 // 3. Create the new kernel loop and return the block arguments mapping. 440 ForOp newForOp = 441 pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap); 442 // Create the kernel block, order ops based on user choice and remap 443 // operands. 444 pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter); 445 446 // 4. Emit the epilogue after the new forOp. 447 rewriter.setInsertionPointAfter(newForOp); 448 llvm::SmallVector<Value> returnValues = pipeliner.emitEpilogue(rewriter); 449 450 // 5. Erase the original loop and replace the uses with the epilogue output. 451 if (forOp->getNumResults() > 0) 452 rewriter.replaceOp(forOp, returnValues); 453 else 454 rewriter.eraseOp(forOp); 455 456 return success(); 457 } 458 459 protected: 460 PipeliningOption options; 461 }; 462 463 } // namespace 464 465 void mlir::scf::populateSCFLoopPipeliningPatterns( 466 RewritePatternSet &patterns, const PipeliningOption &options) { 467 patterns.add<ForLoopPipelining>(options, patterns.getContext()); 468 } 469