1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop software pipelining 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "PassDetail.h" 14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 15 #include "mlir/Dialect/SCF/SCF.h" 16 #include "mlir/Dialect/SCF/Transforms.h" 17 #include "mlir/Dialect/SCF/Utils/Utils.h" 18 #include "mlir/Dialect/StandardOps/IR/Ops.h" 19 #include "mlir/IR/BlockAndValueMapping.h" 20 #include "mlir/IR/PatternMatch.h" 21 #include "mlir/Support/MathExtras.h" 22 23 using namespace mlir; 24 using namespace mlir::scf; 25 26 namespace { 27 28 /// Helper to keep internal information during pipelining transformation. 29 struct LoopPipelinerInternal { 30 /// Coarse liverange information for ops used across stages. 31 struct LiverangeInfo { 32 unsigned lastUseStage = 0; 33 unsigned defStage = 0; 34 }; 35 36 protected: 37 ForOp forOp; 38 unsigned maxStage = 0; 39 DenseMap<Operation *, unsigned> stages; 40 std::vector<Operation *> opOrder; 41 int64_t ub; 42 int64_t lb; 43 int64_t step; 44 PipeliningOption::AnnotationlFnType annotateFn = nullptr; 45 46 // When peeling the kernel we generate several version of each value for 47 // different stage of the prologue. This map tracks the mapping between 48 // original Values in the loop and the different versions 49 // peeled from the loop. 50 DenseMap<Value, llvm::SmallVector<Value>> valueMapping; 51 52 /// Assign a value to `valueMapping`, this means `val` represents the version 53 /// `idx` of `key` in the epilogue. 54 void setValueMapping(Value key, Value el, int64_t idx); 55 56 public: 57 /// Initalize the information for the given `op`, return true if it 58 /// satisfies the pre-condition to apply pipelining. 59 bool initializeLoopInfo(ForOp op, const PipeliningOption &options); 60 /// Emits the prologue, this creates `maxStage - 1` part which will contain 61 /// operations from stages [0; i], where i is the part index. 62 void emitPrologue(PatternRewriter &rewriter); 63 /// Gather liverange information for Values that are used in a different stage 64 /// than its definition. 65 llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues(); 66 scf::ForOp createKernelLoop( 67 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 68 PatternRewriter &rewriter, 69 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap); 70 /// Emits the pipelined kernel. This clones loop operations following user 71 /// order and remaps operands defined in a different stage as their use. 72 void createKernel( 73 scf::ForOp newForOp, 74 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 75 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 76 PatternRewriter &rewriter); 77 /// Emits the epilogue, this creates `maxStage - 1` part which will contain 78 /// operations from stages [i; maxStage], where i is the part index. 79 llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter); 80 }; 81 82 bool LoopPipelinerInternal::initializeLoopInfo( 83 ForOp op, const PipeliningOption &options) { 84 forOp = op; 85 auto upperBoundCst = 86 forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 87 auto lowerBoundCst = 88 forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 89 auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 90 if (!upperBoundCst || !lowerBoundCst || !stepCst) 91 return false; 92 ub = upperBoundCst.value(); 93 lb = lowerBoundCst.value(); 94 step = stepCst.value(); 95 int64_t numIteration = ceilDiv(ub - lb, step); 96 std::vector<std::pair<Operation *, unsigned>> schedule; 97 options.getScheduleFn(forOp, schedule); 98 if (schedule.empty()) 99 return false; 100 101 opOrder.reserve(schedule.size()); 102 for (auto &opSchedule : schedule) { 103 maxStage = std::max(maxStage, opSchedule.second); 104 stages[opSchedule.first] = opSchedule.second; 105 opOrder.push_back(opSchedule.first); 106 } 107 if (numIteration <= maxStage) 108 return false; 109 110 // All operations need to have a stage. 111 if (forOp 112 .walk([this](Operation *op) { 113 if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) && 114 stages.find(op) == stages.end()) 115 return WalkResult::interrupt(); 116 return WalkResult::advance(); 117 }) 118 .wasInterrupted()) 119 return false; 120 121 // Only support loop carried dependency with a distance of 1. This means the 122 // source of all the scf.yield operands needs to be defined by operations in 123 // the loop. 124 if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), 125 [this](Value operand) { 126 Operation *def = operand.getDefiningOp(); 127 return !def || stages.find(def) == stages.end(); 128 })) 129 return false; 130 annotateFn = options.annotateFn; 131 return true; 132 } 133 134 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) { 135 // Initialize the iteration argument to the loop initiale values. 136 for (BlockArgument &arg : forOp.getRegionIterArgs()) { 137 OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); 138 setValueMapping(arg, operand.get(), 0); 139 } 140 auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator()); 141 for (int64_t i = 0; i < maxStage; i++) { 142 // special handling for induction variable as the increment is implicit. 143 Value iv = 144 rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step); 145 setValueMapping(forOp.getInductionVar(), iv, i); 146 for (Operation *op : opOrder) { 147 if (stages[op] > i) 148 continue; 149 Operation *newOp = rewriter.clone(*op); 150 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 151 auto it = valueMapping.find(op->getOperand(opIdx)); 152 if (it != valueMapping.end()) 153 newOp->setOperand(opIdx, it->second[i - stages[op]]); 154 } 155 if (annotateFn) 156 annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); 157 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 158 setValueMapping(op->getResult(destId), newOp->getResult(destId), 159 i - stages[op]); 160 // If the value is a loop carried dependency update the loop argument 161 // mapping. 162 for (OpOperand &operand : yield->getOpOperands()) { 163 if (operand.get() != op->getResult(destId)) 164 continue; 165 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 166 newOp->getResult(destId), i - stages[op] + 1); 167 } 168 } 169 } 170 } 171 } 172 173 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 174 LoopPipelinerInternal::analyzeCrossStageValues() { 175 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues; 176 for (Operation *op : opOrder) { 177 unsigned stage = stages[op]; 178 for (OpOperand &operand : op->getOpOperands()) { 179 Operation *def = operand.get().getDefiningOp(); 180 if (!def) 181 continue; 182 auto defStage = stages.find(def); 183 if (defStage == stages.end() || defStage->second == stage) 184 continue; 185 assert(stage > defStage->second); 186 LiverangeInfo &info = crossStageValues[operand.get()]; 187 info.defStage = defStage->second; 188 info.lastUseStage = std::max(info.lastUseStage, stage); 189 } 190 } 191 return crossStageValues; 192 } 193 194 scf::ForOp LoopPipelinerInternal::createKernelLoop( 195 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 196 &crossStageValues, 197 PatternRewriter &rewriter, 198 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) { 199 // Creates the list of initial values associated to values used across 200 // stages. The initial values come from the prologue created above. 201 // Keep track of the kernel argument associated to each version of the 202 // values passed to the kernel. 203 llvm::SmallVector<Value> newLoopArg; 204 // For existing loop argument initialize them with the right version from the 205 // prologue. 206 for (const auto &retVal : 207 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 208 Operation *def = retVal.value().getDefiningOp(); 209 assert(def && "Only support loop carried dependencies of distance 1"); 210 unsigned defStage = stages[def]; 211 Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]] 212 [maxStage - defStage]; 213 assert(valueVersion); 214 newLoopArg.push_back(valueVersion); 215 } 216 for (auto escape : crossStageValues) { 217 LiverangeInfo &info = escape.second; 218 Value value = escape.first; 219 for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage; 220 stageIdx++) { 221 Value valueVersion = 222 valueMapping[value][maxStage - info.lastUseStage + stageIdx]; 223 assert(valueVersion); 224 newLoopArg.push_back(valueVersion); 225 loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage - 226 stageIdx)] = newLoopArg.size() - 1; 227 } 228 } 229 230 // Create the new kernel loop. Since we need to peel `numStages - 1` 231 // iteration we change the upper bound to remove those iterations. 232 Value newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), 233 ub - maxStage * step); 234 auto newForOp = 235 rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb, 236 forOp.getStep(), newLoopArg); 237 return newForOp; 238 } 239 240 void LoopPipelinerInternal::createKernel( 241 scf::ForOp newForOp, 242 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 243 &crossStageValues, 244 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 245 PatternRewriter &rewriter) { 246 valueMapping.clear(); 247 248 // Create the kernel, we clone instruction based on the order given by 249 // user and remap operands coming from a previous stages. 250 rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); 251 BlockAndValueMapping mapping; 252 mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); 253 for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) { 254 mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); 255 } 256 for (Operation *op : opOrder) { 257 int64_t useStage = stages[op]; 258 auto *newOp = rewriter.clone(*op, mapping); 259 for (OpOperand &operand : op->getOpOperands()) { 260 // Special case for the induction variable uses. We replace it with a 261 // version incremented based on the stage where it is used. 262 if (operand.get() == forOp.getInductionVar()) { 263 rewriter.setInsertionPoint(newOp); 264 Value offset = rewriter.create<arith::ConstantIndexOp>( 265 forOp.getLoc(), (maxStage - stages[op]) * step); 266 Value iv = rewriter.create<arith::AddIOp>( 267 forOp.getLoc(), newForOp.getInductionVar(), offset); 268 newOp->setOperand(operand.getOperandNumber(), iv); 269 rewriter.setInsertionPointAfter(newOp); 270 continue; 271 } 272 auto arg = operand.get().dyn_cast<BlockArgument>(); 273 if (arg && arg.getOwner() == forOp.getBody()) { 274 // If the value is a loop carried value coming from stage N + 1 remap, 275 // it will become a direct use. 276 Value ret = forOp.getBody()->getTerminator()->getOperand( 277 arg.getArgNumber() - 1); 278 Operation *dep = ret.getDefiningOp(); 279 if (!dep) 280 continue; 281 auto stageDep = stages.find(dep); 282 if (stageDep == stages.end() || stageDep->second == useStage) 283 continue; 284 assert(stageDep->second == useStage + 1); 285 newOp->setOperand(operand.getOperandNumber(), 286 mapping.lookupOrDefault(ret)); 287 continue; 288 } 289 // For operands defined in a previous stage we need to remap it to use 290 // the correct region argument. We look for the right version of the 291 // Value based on the stage where it is used. 292 Operation *def = operand.get().getDefiningOp(); 293 if (!def) 294 continue; 295 auto stageDef = stages.find(def); 296 if (stageDef == stages.end() || stageDef->second == useStage) 297 continue; 298 auto remap = loopArgMap.find( 299 std::make_pair(operand.get(), useStage - stageDef->second)); 300 assert(remap != loopArgMap.end()); 301 newOp->setOperand(operand.getOperandNumber(), 302 newForOp.getRegionIterArgs()[remap->second]); 303 } 304 if (annotateFn) 305 annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0); 306 } 307 308 // Collect the Values that need to be returned by the forOp. For each 309 // value we need to have `LastUseStage - DefStage` number of versions 310 // returned. 311 // We create a mapping between original values and the associated loop 312 // returned values that will be needed by the epilogue. 313 llvm::SmallVector<Value> yieldOperands; 314 for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) { 315 yieldOperands.push_back(mapping.lookupOrDefault(retVal)); 316 } 317 for (auto &it : crossStageValues) { 318 int64_t version = maxStage - it.second.lastUseStage + 1; 319 unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage; 320 // add the original verstion to yield ops. 321 // If there is a liverange spanning across more than 2 stages we need to add 322 // extra arg. 323 for (unsigned i = 1; i < numVersionReturned; i++) { 324 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 325 version++); 326 yieldOperands.push_back( 327 newForOp.getBody()->getArguments()[yieldOperands.size() + 1 + 328 newForOp.getNumInductionVars()]); 329 } 330 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 331 version++); 332 yieldOperands.push_back(mapping.lookupOrDefault(it.first)); 333 } 334 // Map the yield operand to the forOp returned value. 335 for (const auto &retVal : 336 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 337 Operation *def = retVal.value().getDefiningOp(); 338 assert(def && "Only support loop carried dependencies of distance 1"); 339 unsigned defStage = stages[def]; 340 setValueMapping(forOp.getRegionIterArgs()[retVal.index()], 341 newForOp->getResult(retVal.index()), 342 maxStage - defStage + 1); 343 } 344 rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands); 345 } 346 347 llvm::SmallVector<Value> 348 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) { 349 llvm::SmallVector<Value> returnValues(forOp->getNumResults()); 350 // Emit different versions of the induction variable. They will be 351 // removed by dead code if not used. 352 for (int64_t i = 0; i < maxStage; i++) { 353 Value newlastIter = rewriter.create<arith::ConstantIndexOp>( 354 forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i)); 355 setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); 356 } 357 // Emit `maxStage - 1` epilogue part that includes operations fro stages 358 // [i; maxStage]. 359 for (int64_t i = 1; i <= maxStage; i++) { 360 for (Operation *op : opOrder) { 361 if (stages[op] < i) 362 continue; 363 Operation *newOp = rewriter.clone(*op); 364 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 365 auto it = valueMapping.find(op->getOperand(opIdx)); 366 if (it != valueMapping.end()) { 367 Value v = it->second[maxStage - stages[op] + i]; 368 assert(v); 369 newOp->setOperand(opIdx, v); 370 } 371 } 372 if (annotateFn) 373 annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1); 374 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 375 setValueMapping(op->getResult(destId), newOp->getResult(destId), 376 maxStage - stages[op] + i); 377 // If the value is a loop carried dependency update the loop argument 378 // mapping and keep track of the last version to replace the original 379 // forOp uses. 380 for (OpOperand &operand : 381 forOp.getBody()->getTerminator()->getOpOperands()) { 382 if (operand.get() != op->getResult(destId)) 383 continue; 384 unsigned version = maxStage - stages[op] + i + 1; 385 // If the version is greater than maxStage it means it maps to the 386 // original forOp returned value. 387 if (version > maxStage) { 388 returnValues[operand.getOperandNumber()] = newOp->getResult(destId); 389 continue; 390 } 391 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 392 newOp->getResult(destId), version); 393 } 394 } 395 } 396 } 397 return returnValues; 398 } 399 400 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) { 401 auto it = valueMapping.find(key); 402 // If the value is not in the map yet add a vector big enough to store all 403 // versions. 404 if (it == valueMapping.end()) 405 it = 406 valueMapping 407 .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1))) 408 .first; 409 it->second[idx] = el; 410 } 411 412 /// Generate a pipelined version of the scf.for loop based on the schedule given 413 /// as option. This applies the mechanical transformation of changing the loop 414 /// and generating the prologue/epilogue for the pipelining and doesn't make any 415 /// decision regarding the schedule. 416 /// Based on the option the loop is split into several stages. 417 /// The transformation assumes that the scheduling given by user is valid. 418 /// For example if we break a loop into 3 stages named S0, S1, S2 we would 419 /// generate the following code with the number in parenthesis the iteration 420 /// index: 421 /// S0(0) // Prologue 422 /// S0(1) S1(0) // Prologue 423 /// scf.for %I = %C0 to %N - 2 { 424 /// S0(I+2) S1(I+1) S2(I) // Pipelined kernel 425 /// } 426 /// S1(N) S2(N-1) // Epilogue 427 /// S2(N) // Epilogue 428 struct ForLoopPipelining : public OpRewritePattern<ForOp> { 429 ForLoopPipelining(const PipeliningOption &options, MLIRContext *context) 430 : OpRewritePattern<ForOp>(context), options(options) {} 431 LogicalResult matchAndRewrite(ForOp forOp, 432 PatternRewriter &rewriter) const override { 433 434 LoopPipelinerInternal pipeliner; 435 if (!pipeliner.initializeLoopInfo(forOp, options)) 436 return failure(); 437 438 // 1. Emit prologue. 439 pipeliner.emitPrologue(rewriter); 440 441 // 2. Track values used across stages. When a value cross stages it will 442 // need to be passed as loop iteration arguments. 443 // We first collect the values that are used in a different stage than where 444 // they are defined. 445 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 446 crossStageValues = pipeliner.analyzeCrossStageValues(); 447 448 // Mapping between original loop values used cross stage and the block 449 // arguments associated after pipelining. A Value may map to several 450 // arguments if its liverange spans across more than 2 stages. 451 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap; 452 // 3. Create the new kernel loop and return the block arguments mapping. 453 ForOp newForOp = 454 pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap); 455 // Create the kernel block, order ops based on user choice and remap 456 // operands. 457 pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter); 458 459 // 4. Emit the epilogue after the new forOp. 460 rewriter.setInsertionPointAfter(newForOp); 461 llvm::SmallVector<Value> returnValues = pipeliner.emitEpilogue(rewriter); 462 463 // 5. Erase the original loop and replace the uses with the epilogue output. 464 if (forOp->getNumResults() > 0) 465 rewriter.replaceOp(forOp, returnValues); 466 else 467 rewriter.eraseOp(forOp); 468 469 return success(); 470 } 471 472 protected: 473 PipeliningOption options; 474 }; 475 476 } // namespace 477 478 void mlir::scf::populateSCFLoopPipeliningPatterns( 479 RewritePatternSet &patterns, const PipeliningOption &options) { 480 patterns.add<ForLoopPipelining>(options, patterns.getContext()); 481 } 482