1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop software pipelining 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "PassDetail.h" 14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 15 #include "mlir/Dialect/SCF/SCF.h" 16 #include "mlir/Dialect/SCF/Transforms.h" 17 #include "mlir/Dialect/SCF/Utils/Utils.h" 18 #include "mlir/IR/BlockAndValueMapping.h" 19 #include "mlir/IR/PatternMatch.h" 20 #include "mlir/Support/MathExtras.h" 21 22 using namespace mlir; 23 using namespace mlir::scf; 24 25 namespace { 26 27 /// Helper to keep internal information during pipelining transformation. 28 struct LoopPipelinerInternal { 29 /// Coarse liverange information for ops used across stages. 30 struct LiverangeInfo { 31 unsigned lastUseStage = 0; 32 unsigned defStage = 0; 33 }; 34 35 protected: 36 ForOp forOp; 37 unsigned maxStage = 0; 38 DenseMap<Operation *, unsigned> stages; 39 std::vector<Operation *> opOrder; 40 int64_t ub; 41 int64_t lb; 42 int64_t step; 43 PipeliningOption::AnnotationlFnType annotateFn = nullptr; 44 bool peelEpilogue; 45 PipeliningOption::PredicateOpFn predicateFn = nullptr; 46 47 // When peeling the kernel we generate several version of each value for 48 // different stage of the prologue. This map tracks the mapping between 49 // original Values in the loop and the different versions 50 // peeled from the loop. 51 DenseMap<Value, llvm::SmallVector<Value>> valueMapping; 52 53 /// Assign a value to `valueMapping`, this means `val` represents the version 54 /// `idx` of `key` in the epilogue. 55 void setValueMapping(Value key, Value el, int64_t idx); 56 57 public: 58 /// Initalize the information for the given `op`, return true if it 59 /// satisfies the pre-condition to apply pipelining. 60 bool initializeLoopInfo(ForOp op, const PipeliningOption &options); 61 /// Emits the prologue, this creates `maxStage - 1` part which will contain 62 /// operations from stages [0; i], where i is the part index. 63 void emitPrologue(PatternRewriter &rewriter); 64 /// Gather liverange information for Values that are used in a different stage 65 /// than its definition. 66 llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues(); 67 scf::ForOp createKernelLoop( 68 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 69 PatternRewriter &rewriter, 70 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap); 71 /// Emits the pipelined kernel. This clones loop operations following user 72 /// order and remaps operands defined in a different stage as their use. 73 void createKernel( 74 scf::ForOp newForOp, 75 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 76 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 77 PatternRewriter &rewriter); 78 /// Emits the epilogue, this creates `maxStage - 1` part which will contain 79 /// operations from stages [i; maxStage], where i is the part index. 80 llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter); 81 }; 82 83 bool LoopPipelinerInternal::initializeLoopInfo( 84 ForOp op, const PipeliningOption &options) { 85 forOp = op; 86 auto upperBoundCst = 87 forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 88 auto lowerBoundCst = 89 forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 90 auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 91 if (!upperBoundCst || !lowerBoundCst || !stepCst) 92 return false; 93 ub = upperBoundCst.value(); 94 lb = lowerBoundCst.value(); 95 step = stepCst.value(); 96 peelEpilogue = options.peelEpilogue; 97 predicateFn = options.predicateFn; 98 if (!peelEpilogue && predicateFn == nullptr) 99 return false; 100 int64_t numIteration = ceilDiv(ub - lb, step); 101 std::vector<std::pair<Operation *, unsigned>> schedule; 102 options.getScheduleFn(forOp, schedule); 103 if (schedule.empty()) 104 return false; 105 106 opOrder.reserve(schedule.size()); 107 for (auto &opSchedule : schedule) { 108 maxStage = std::max(maxStage, opSchedule.second); 109 stages[opSchedule.first] = opSchedule.second; 110 opOrder.push_back(opSchedule.first); 111 } 112 if (numIteration <= maxStage) 113 return false; 114 115 // All operations need to have a stage. 116 if (forOp 117 .walk([this](Operation *op) { 118 if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) && 119 stages.find(op) == stages.end()) 120 return WalkResult::interrupt(); 121 return WalkResult::advance(); 122 }) 123 .wasInterrupted()) 124 return false; 125 126 // Only support loop carried dependency with a distance of 1. This means the 127 // source of all the scf.yield operands needs to be defined by operations in 128 // the loop. 129 if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), 130 [this](Value operand) { 131 Operation *def = operand.getDefiningOp(); 132 return !def || stages.find(def) == stages.end(); 133 })) 134 return false; 135 annotateFn = options.annotateFn; 136 return true; 137 } 138 139 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) { 140 // Initialize the iteration argument to the loop initiale values. 141 for (BlockArgument &arg : forOp.getRegionIterArgs()) { 142 OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); 143 setValueMapping(arg, operand.get(), 0); 144 } 145 auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator()); 146 for (int64_t i = 0; i < maxStage; i++) { 147 // special handling for induction variable as the increment is implicit. 148 Value iv = 149 rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step); 150 setValueMapping(forOp.getInductionVar(), iv, i); 151 for (Operation *op : opOrder) { 152 if (stages[op] > i) 153 continue; 154 Operation *newOp = rewriter.clone(*op); 155 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 156 auto it = valueMapping.find(op->getOperand(opIdx)); 157 if (it != valueMapping.end()) 158 newOp->setOperand(opIdx, it->second[i - stages[op]]); 159 } 160 if (annotateFn) 161 annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); 162 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 163 setValueMapping(op->getResult(destId), newOp->getResult(destId), 164 i - stages[op]); 165 // If the value is a loop carried dependency update the loop argument 166 // mapping. 167 for (OpOperand &operand : yield->getOpOperands()) { 168 if (operand.get() != op->getResult(destId)) 169 continue; 170 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 171 newOp->getResult(destId), i - stages[op] + 1); 172 } 173 } 174 } 175 } 176 } 177 178 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 179 LoopPipelinerInternal::analyzeCrossStageValues() { 180 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues; 181 for (Operation *op : opOrder) { 182 unsigned stage = stages[op]; 183 for (OpOperand &operand : op->getOpOperands()) { 184 Operation *def = operand.get().getDefiningOp(); 185 if (!def) 186 continue; 187 auto defStage = stages.find(def); 188 if (defStage == stages.end() || defStage->second == stage) 189 continue; 190 assert(stage > defStage->second); 191 LiverangeInfo &info = crossStageValues[operand.get()]; 192 info.defStage = defStage->second; 193 info.lastUseStage = std::max(info.lastUseStage, stage); 194 } 195 } 196 return crossStageValues; 197 } 198 199 scf::ForOp LoopPipelinerInternal::createKernelLoop( 200 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 201 &crossStageValues, 202 PatternRewriter &rewriter, 203 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) { 204 // Creates the list of initial values associated to values used across 205 // stages. The initial values come from the prologue created above. 206 // Keep track of the kernel argument associated to each version of the 207 // values passed to the kernel. 208 llvm::SmallVector<Value> newLoopArg; 209 // For existing loop argument initialize them with the right version from the 210 // prologue. 211 for (const auto &retVal : 212 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 213 Operation *def = retVal.value().getDefiningOp(); 214 assert(def && "Only support loop carried dependencies of distance 1"); 215 unsigned defStage = stages[def]; 216 Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]] 217 [maxStage - defStage]; 218 assert(valueVersion); 219 newLoopArg.push_back(valueVersion); 220 } 221 for (auto escape : crossStageValues) { 222 LiverangeInfo &info = escape.second; 223 Value value = escape.first; 224 for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage; 225 stageIdx++) { 226 Value valueVersion = 227 valueMapping[value][maxStage - info.lastUseStage + stageIdx]; 228 assert(valueVersion); 229 newLoopArg.push_back(valueVersion); 230 loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage - 231 stageIdx)] = newLoopArg.size() - 1; 232 } 233 } 234 235 // Create the new kernel loop. When we peel the epilgue we need to peel 236 // `numStages - 1` iterations. Then we adjust the upper bound to remove those 237 // iterations. 238 Value newUb = forOp.getUpperBound(); 239 if (peelEpilogue) 240 newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), 241 ub - maxStage * step); 242 auto newForOp = 243 rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb, 244 forOp.getStep(), newLoopArg); 245 return newForOp; 246 } 247 248 void LoopPipelinerInternal::createKernel( 249 scf::ForOp newForOp, 250 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 251 &crossStageValues, 252 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 253 PatternRewriter &rewriter) { 254 valueMapping.clear(); 255 256 // Create the kernel, we clone instruction based on the order given by 257 // user and remap operands coming from a previous stages. 258 rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); 259 BlockAndValueMapping mapping; 260 mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); 261 for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) { 262 mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); 263 } 264 SmallVector<Value> predicates(maxStage + 1, nullptr); 265 if (!peelEpilogue) { 266 // Create a predicate for each stage except the last stage. 267 for (unsigned i = 0; i < maxStage; i++) { 268 Value c = rewriter.create<arith::ConstantIndexOp>( 269 newForOp.getLoc(), ub - (maxStage - i) * step); 270 Value pred = rewriter.create<arith::CmpIOp>( 271 newForOp.getLoc(), arith::CmpIPredicate::slt, 272 newForOp.getInductionVar(), c); 273 predicates[i] = pred; 274 } 275 } 276 for (Operation *op : opOrder) { 277 int64_t useStage = stages[op]; 278 auto *newOp = rewriter.clone(*op, mapping); 279 for (OpOperand &operand : op->getOpOperands()) { 280 // Special case for the induction variable uses. We replace it with a 281 // version incremented based on the stage where it is used. 282 if (operand.get() == forOp.getInductionVar()) { 283 rewriter.setInsertionPoint(newOp); 284 Value offset = rewriter.create<arith::ConstantIndexOp>( 285 forOp.getLoc(), (maxStage - stages[op]) * step); 286 Value iv = rewriter.create<arith::AddIOp>( 287 forOp.getLoc(), newForOp.getInductionVar(), offset); 288 newOp->setOperand(operand.getOperandNumber(), iv); 289 rewriter.setInsertionPointAfter(newOp); 290 continue; 291 } 292 auto arg = operand.get().dyn_cast<BlockArgument>(); 293 if (arg && arg.getOwner() == forOp.getBody()) { 294 // If the value is a loop carried value coming from stage N + 1 remap, 295 // it will become a direct use. 296 Value ret = forOp.getBody()->getTerminator()->getOperand( 297 arg.getArgNumber() - 1); 298 Operation *dep = ret.getDefiningOp(); 299 if (!dep) 300 continue; 301 auto stageDep = stages.find(dep); 302 if (stageDep == stages.end() || stageDep->second == useStage) 303 continue; 304 assert(stageDep->second == useStage + 1); 305 newOp->setOperand(operand.getOperandNumber(), 306 mapping.lookupOrDefault(ret)); 307 continue; 308 } 309 // For operands defined in a previous stage we need to remap it to use 310 // the correct region argument. We look for the right version of the 311 // Value based on the stage where it is used. 312 Operation *def = operand.get().getDefiningOp(); 313 if (!def) 314 continue; 315 auto stageDef = stages.find(def); 316 if (stageDef == stages.end() || stageDef->second == useStage) 317 continue; 318 auto remap = loopArgMap.find( 319 std::make_pair(operand.get(), useStage - stageDef->second)); 320 assert(remap != loopArgMap.end()); 321 newOp->setOperand(operand.getOperandNumber(), 322 newForOp.getRegionIterArgs()[remap->second]); 323 } 324 if (predicates[useStage]) { 325 newOp = predicateFn(newOp, predicates[useStage], rewriter); 326 // Remap the results to the new predicated one. 327 for (auto values : llvm::zip(op->getResults(), newOp->getResults())) 328 mapping.map(std::get<0>(values), std::get<1>(values)); 329 } 330 rewriter.setInsertionPointAfter(newOp); 331 if (annotateFn) 332 annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0); 333 } 334 335 // Collect the Values that need to be returned by the forOp. For each 336 // value we need to have `LastUseStage - DefStage` number of versions 337 // returned. 338 // We create a mapping between original values and the associated loop 339 // returned values that will be needed by the epilogue. 340 llvm::SmallVector<Value> yieldOperands; 341 for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) { 342 yieldOperands.push_back(mapping.lookupOrDefault(retVal)); 343 } 344 for (auto &it : crossStageValues) { 345 int64_t version = maxStage - it.second.lastUseStage + 1; 346 unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage; 347 // add the original verstion to yield ops. 348 // If there is a liverange spanning across more than 2 stages we need to add 349 // extra arg. 350 for (unsigned i = 1; i < numVersionReturned; i++) { 351 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 352 version++); 353 yieldOperands.push_back( 354 newForOp.getBody()->getArguments()[yieldOperands.size() + 1 + 355 newForOp.getNumInductionVars()]); 356 } 357 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 358 version++); 359 yieldOperands.push_back(mapping.lookupOrDefault(it.first)); 360 } 361 // Map the yield operand to the forOp returned value. 362 for (const auto &retVal : 363 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 364 Operation *def = retVal.value().getDefiningOp(); 365 assert(def && "Only support loop carried dependencies of distance 1"); 366 unsigned defStage = stages[def]; 367 setValueMapping(forOp.getRegionIterArgs()[retVal.index()], 368 newForOp->getResult(retVal.index()), 369 maxStage - defStage + 1); 370 } 371 rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands); 372 } 373 374 llvm::SmallVector<Value> 375 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) { 376 llvm::SmallVector<Value> returnValues(forOp->getNumResults()); 377 // Emit different versions of the induction variable. They will be 378 // removed by dead code if not used. 379 for (int64_t i = 0; i < maxStage; i++) { 380 Value newlastIter = rewriter.create<arith::ConstantIndexOp>( 381 forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i)); 382 setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); 383 } 384 // Emit `maxStage - 1` epilogue part that includes operations fro stages 385 // [i; maxStage]. 386 for (int64_t i = 1; i <= maxStage; i++) { 387 for (Operation *op : opOrder) { 388 if (stages[op] < i) 389 continue; 390 Operation *newOp = rewriter.clone(*op); 391 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 392 auto it = valueMapping.find(op->getOperand(opIdx)); 393 if (it != valueMapping.end()) { 394 Value v = it->second[maxStage - stages[op] + i]; 395 assert(v); 396 newOp->setOperand(opIdx, v); 397 } 398 } 399 if (annotateFn) 400 annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1); 401 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 402 setValueMapping(op->getResult(destId), newOp->getResult(destId), 403 maxStage - stages[op] + i); 404 // If the value is a loop carried dependency update the loop argument 405 // mapping and keep track of the last version to replace the original 406 // forOp uses. 407 for (OpOperand &operand : 408 forOp.getBody()->getTerminator()->getOpOperands()) { 409 if (operand.get() != op->getResult(destId)) 410 continue; 411 unsigned version = maxStage - stages[op] + i + 1; 412 // If the version is greater than maxStage it means it maps to the 413 // original forOp returned value. 414 if (version > maxStage) { 415 returnValues[operand.getOperandNumber()] = newOp->getResult(destId); 416 continue; 417 } 418 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 419 newOp->getResult(destId), version); 420 } 421 } 422 } 423 } 424 return returnValues; 425 } 426 427 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) { 428 auto it = valueMapping.find(key); 429 // If the value is not in the map yet add a vector big enough to store all 430 // versions. 431 if (it == valueMapping.end()) 432 it = 433 valueMapping 434 .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1))) 435 .first; 436 it->second[idx] = el; 437 } 438 439 /// Generate a pipelined version of the scf.for loop based on the schedule given 440 /// as option. This applies the mechanical transformation of changing the loop 441 /// and generating the prologue/epilogue for the pipelining and doesn't make any 442 /// decision regarding the schedule. 443 /// Based on the option the loop is split into several stages. 444 /// The transformation assumes that the scheduling given by user is valid. 445 /// For example if we break a loop into 3 stages named S0, S1, S2 we would 446 /// generate the following code with the number in parenthesis the iteration 447 /// index: 448 /// S0(0) // Prologue 449 /// S0(1) S1(0) // Prologue 450 /// scf.for %I = %C0 to %N - 2 { 451 /// S0(I+2) S1(I+1) S2(I) // Pipelined kernel 452 /// } 453 /// S1(N) S2(N-1) // Epilogue 454 /// S2(N) // Epilogue 455 struct ForLoopPipelining : public OpRewritePattern<ForOp> { 456 ForLoopPipelining(const PipeliningOption &options, MLIRContext *context) 457 : OpRewritePattern<ForOp>(context), options(options) {} 458 LogicalResult matchAndRewrite(ForOp forOp, 459 PatternRewriter &rewriter) const override { 460 461 LoopPipelinerInternal pipeliner; 462 if (!pipeliner.initializeLoopInfo(forOp, options)) 463 return failure(); 464 465 // 1. Emit prologue. 466 pipeliner.emitPrologue(rewriter); 467 468 // 2. Track values used across stages. When a value cross stages it will 469 // need to be passed as loop iteration arguments. 470 // We first collect the values that are used in a different stage than where 471 // they are defined. 472 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 473 crossStageValues = pipeliner.analyzeCrossStageValues(); 474 475 // Mapping between original loop values used cross stage and the block 476 // arguments associated after pipelining. A Value may map to several 477 // arguments if its liverange spans across more than 2 stages. 478 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap; 479 // 3. Create the new kernel loop and return the block arguments mapping. 480 ForOp newForOp = 481 pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap); 482 // Create the kernel block, order ops based on user choice and remap 483 // operands. 484 pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter); 485 486 llvm::SmallVector<Value> returnValues = 487 newForOp.getResults().take_front(forOp->getNumResults()); 488 if (options.peelEpilogue) { 489 // 4. Emit the epilogue after the new forOp. 490 rewriter.setInsertionPointAfter(newForOp); 491 returnValues = pipeliner.emitEpilogue(rewriter); 492 } 493 // 5. Erase the original loop and replace the uses with the epilogue output. 494 if (forOp->getNumResults() > 0) 495 rewriter.replaceOp(forOp, returnValues); 496 else 497 rewriter.eraseOp(forOp); 498 499 return success(); 500 } 501 502 protected: 503 PipeliningOption options; 504 }; 505 506 } // namespace 507 508 void mlir::scf::populateSCFLoopPipeliningPatterns( 509 RewritePatternSet &patterns, const PipeliningOption &options) { 510 patterns.add<ForLoopPipelining>(options, patterns.getContext()); 511 } 512