1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop software pipelining 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "PassDetail.h" 14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 15 #include "mlir/Dialect/SCF/SCF.h" 16 #include "mlir/Dialect/SCF/Transforms.h" 17 #include "mlir/Dialect/SCF/Utils/Utils.h" 18 #include "mlir/IR/BlockAndValueMapping.h" 19 #include "mlir/IR/PatternMatch.h" 20 #include "mlir/Support/MathExtras.h" 21 22 using namespace mlir; 23 using namespace mlir::scf; 24 25 namespace { 26 27 /// Helper to keep internal information during pipelining transformation. 28 struct LoopPipelinerInternal { 29 /// Coarse liverange information for ops used across stages. 30 struct LiverangeInfo { 31 unsigned lastUseStage = 0; 32 unsigned defStage = 0; 33 }; 34 35 protected: 36 ForOp forOp; 37 unsigned maxStage = 0; 38 DenseMap<Operation *, unsigned> stages; 39 std::vector<Operation *> opOrder; 40 int64_t ub; 41 int64_t lb; 42 int64_t step; 43 PipeliningOption::AnnotationlFnType annotateFn = nullptr; 44 45 // When peeling the kernel we generate several version of each value for 46 // different stage of the prologue. This map tracks the mapping between 47 // original Values in the loop and the different versions 48 // peeled from the loop. 49 DenseMap<Value, llvm::SmallVector<Value>> valueMapping; 50 51 /// Assign a value to `valueMapping`, this means `val` represents the version 52 /// `idx` of `key` in the epilogue. 53 void setValueMapping(Value key, Value el, int64_t idx); 54 55 public: 56 /// Initalize the information for the given `op`, return true if it 57 /// satisfies the pre-condition to apply pipelining. 58 bool initializeLoopInfo(ForOp op, const PipeliningOption &options); 59 /// Emits the prologue, this creates `maxStage - 1` part which will contain 60 /// operations from stages [0; i], where i is the part index. 61 void emitPrologue(PatternRewriter &rewriter); 62 /// Gather liverange information for Values that are used in a different stage 63 /// than its definition. 64 llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues(); 65 scf::ForOp createKernelLoop( 66 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 67 PatternRewriter &rewriter, 68 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap); 69 /// Emits the pipelined kernel. This clones loop operations following user 70 /// order and remaps operands defined in a different stage as their use. 71 void createKernel( 72 scf::ForOp newForOp, 73 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 74 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 75 PatternRewriter &rewriter); 76 /// Emits the epilogue, this creates `maxStage - 1` part which will contain 77 /// operations from stages [i; maxStage], where i is the part index. 78 llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter); 79 }; 80 81 bool LoopPipelinerInternal::initializeLoopInfo( 82 ForOp op, const PipeliningOption &options) { 83 forOp = op; 84 auto upperBoundCst = 85 forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 86 auto lowerBoundCst = 87 forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 88 auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 89 if (!upperBoundCst || !lowerBoundCst || !stepCst) 90 return false; 91 ub = upperBoundCst.value(); 92 lb = lowerBoundCst.value(); 93 step = stepCst.value(); 94 int64_t numIteration = ceilDiv(ub - lb, step); 95 std::vector<std::pair<Operation *, unsigned>> schedule; 96 options.getScheduleFn(forOp, schedule); 97 if (schedule.empty()) 98 return false; 99 100 opOrder.reserve(schedule.size()); 101 for (auto &opSchedule : schedule) { 102 maxStage = std::max(maxStage, opSchedule.second); 103 stages[opSchedule.first] = opSchedule.second; 104 opOrder.push_back(opSchedule.first); 105 } 106 if (numIteration <= maxStage) 107 return false; 108 109 // All operations need to have a stage. 110 if (forOp 111 .walk([this](Operation *op) { 112 if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) && 113 stages.find(op) == stages.end()) 114 return WalkResult::interrupt(); 115 return WalkResult::advance(); 116 }) 117 .wasInterrupted()) 118 return false; 119 120 // Only support loop carried dependency with a distance of 1. This means the 121 // source of all the scf.yield operands needs to be defined by operations in 122 // the loop. 123 if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), 124 [this](Value operand) { 125 Operation *def = operand.getDefiningOp(); 126 return !def || stages.find(def) == stages.end(); 127 })) 128 return false; 129 annotateFn = options.annotateFn; 130 return true; 131 } 132 133 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) { 134 // Initialize the iteration argument to the loop initiale values. 135 for (BlockArgument &arg : forOp.getRegionIterArgs()) { 136 OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); 137 setValueMapping(arg, operand.get(), 0); 138 } 139 auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator()); 140 for (int64_t i = 0; i < maxStage; i++) { 141 // special handling for induction variable as the increment is implicit. 142 Value iv = 143 rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step); 144 setValueMapping(forOp.getInductionVar(), iv, i); 145 for (Operation *op : opOrder) { 146 if (stages[op] > i) 147 continue; 148 Operation *newOp = rewriter.clone(*op); 149 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 150 auto it = valueMapping.find(op->getOperand(opIdx)); 151 if (it != valueMapping.end()) 152 newOp->setOperand(opIdx, it->second[i - stages[op]]); 153 } 154 if (annotateFn) 155 annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); 156 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 157 setValueMapping(op->getResult(destId), newOp->getResult(destId), 158 i - stages[op]); 159 // If the value is a loop carried dependency update the loop argument 160 // mapping. 161 for (OpOperand &operand : yield->getOpOperands()) { 162 if (operand.get() != op->getResult(destId)) 163 continue; 164 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 165 newOp->getResult(destId), i - stages[op] + 1); 166 } 167 } 168 } 169 } 170 } 171 172 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 173 LoopPipelinerInternal::analyzeCrossStageValues() { 174 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues; 175 for (Operation *op : opOrder) { 176 unsigned stage = stages[op]; 177 for (OpOperand &operand : op->getOpOperands()) { 178 Operation *def = operand.get().getDefiningOp(); 179 if (!def) 180 continue; 181 auto defStage = stages.find(def); 182 if (defStage == stages.end() || defStage->second == stage) 183 continue; 184 assert(stage > defStage->second); 185 LiverangeInfo &info = crossStageValues[operand.get()]; 186 info.defStage = defStage->second; 187 info.lastUseStage = std::max(info.lastUseStage, stage); 188 } 189 } 190 return crossStageValues; 191 } 192 193 scf::ForOp LoopPipelinerInternal::createKernelLoop( 194 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 195 &crossStageValues, 196 PatternRewriter &rewriter, 197 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) { 198 // Creates the list of initial values associated to values used across 199 // stages. The initial values come from the prologue created above. 200 // Keep track of the kernel argument associated to each version of the 201 // values passed to the kernel. 202 llvm::SmallVector<Value> newLoopArg; 203 // For existing loop argument initialize them with the right version from the 204 // prologue. 205 for (const auto &retVal : 206 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 207 Operation *def = retVal.value().getDefiningOp(); 208 assert(def && "Only support loop carried dependencies of distance 1"); 209 unsigned defStage = stages[def]; 210 Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]] 211 [maxStage - defStage]; 212 assert(valueVersion); 213 newLoopArg.push_back(valueVersion); 214 } 215 for (auto escape : crossStageValues) { 216 LiverangeInfo &info = escape.second; 217 Value value = escape.first; 218 for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage; 219 stageIdx++) { 220 Value valueVersion = 221 valueMapping[value][maxStage - info.lastUseStage + stageIdx]; 222 assert(valueVersion); 223 newLoopArg.push_back(valueVersion); 224 loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage - 225 stageIdx)] = newLoopArg.size() - 1; 226 } 227 } 228 229 // Create the new kernel loop. Since we need to peel `numStages - 1` 230 // iteration we change the upper bound to remove those iterations. 231 Value newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), 232 ub - maxStage * step); 233 auto newForOp = 234 rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb, 235 forOp.getStep(), newLoopArg); 236 return newForOp; 237 } 238 239 void LoopPipelinerInternal::createKernel( 240 scf::ForOp newForOp, 241 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 242 &crossStageValues, 243 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 244 PatternRewriter &rewriter) { 245 valueMapping.clear(); 246 247 // Create the kernel, we clone instruction based on the order given by 248 // user and remap operands coming from a previous stages. 249 rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); 250 BlockAndValueMapping mapping; 251 mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); 252 for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) { 253 mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); 254 } 255 for (Operation *op : opOrder) { 256 int64_t useStage = stages[op]; 257 auto *newOp = rewriter.clone(*op, mapping); 258 for (OpOperand &operand : op->getOpOperands()) { 259 // Special case for the induction variable uses. We replace it with a 260 // version incremented based on the stage where it is used. 261 if (operand.get() == forOp.getInductionVar()) { 262 rewriter.setInsertionPoint(newOp); 263 Value offset = rewriter.create<arith::ConstantIndexOp>( 264 forOp.getLoc(), (maxStage - stages[op]) * step); 265 Value iv = rewriter.create<arith::AddIOp>( 266 forOp.getLoc(), newForOp.getInductionVar(), offset); 267 newOp->setOperand(operand.getOperandNumber(), iv); 268 rewriter.setInsertionPointAfter(newOp); 269 continue; 270 } 271 auto arg = operand.get().dyn_cast<BlockArgument>(); 272 if (arg && arg.getOwner() == forOp.getBody()) { 273 // If the value is a loop carried value coming from stage N + 1 remap, 274 // it will become a direct use. 275 Value ret = forOp.getBody()->getTerminator()->getOperand( 276 arg.getArgNumber() - 1); 277 Operation *dep = ret.getDefiningOp(); 278 if (!dep) 279 continue; 280 auto stageDep = stages.find(dep); 281 if (stageDep == stages.end() || stageDep->second == useStage) 282 continue; 283 assert(stageDep->second == useStage + 1); 284 newOp->setOperand(operand.getOperandNumber(), 285 mapping.lookupOrDefault(ret)); 286 continue; 287 } 288 // For operands defined in a previous stage we need to remap it to use 289 // the correct region argument. We look for the right version of the 290 // Value based on the stage where it is used. 291 Operation *def = operand.get().getDefiningOp(); 292 if (!def) 293 continue; 294 auto stageDef = stages.find(def); 295 if (stageDef == stages.end() || stageDef->second == useStage) 296 continue; 297 auto remap = loopArgMap.find( 298 std::make_pair(operand.get(), useStage - stageDef->second)); 299 assert(remap != loopArgMap.end()); 300 newOp->setOperand(operand.getOperandNumber(), 301 newForOp.getRegionIterArgs()[remap->second]); 302 } 303 if (annotateFn) 304 annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0); 305 } 306 307 // Collect the Values that need to be returned by the forOp. For each 308 // value we need to have `LastUseStage - DefStage` number of versions 309 // returned. 310 // We create a mapping between original values and the associated loop 311 // returned values that will be needed by the epilogue. 312 llvm::SmallVector<Value> yieldOperands; 313 for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) { 314 yieldOperands.push_back(mapping.lookupOrDefault(retVal)); 315 } 316 for (auto &it : crossStageValues) { 317 int64_t version = maxStage - it.second.lastUseStage + 1; 318 unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage; 319 // add the original verstion to yield ops. 320 // If there is a liverange spanning across more than 2 stages we need to add 321 // extra arg. 322 for (unsigned i = 1; i < numVersionReturned; i++) { 323 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 324 version++); 325 yieldOperands.push_back( 326 newForOp.getBody()->getArguments()[yieldOperands.size() + 1 + 327 newForOp.getNumInductionVars()]); 328 } 329 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 330 version++); 331 yieldOperands.push_back(mapping.lookupOrDefault(it.first)); 332 } 333 // Map the yield operand to the forOp returned value. 334 for (const auto &retVal : 335 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 336 Operation *def = retVal.value().getDefiningOp(); 337 assert(def && "Only support loop carried dependencies of distance 1"); 338 unsigned defStage = stages[def]; 339 setValueMapping(forOp.getRegionIterArgs()[retVal.index()], 340 newForOp->getResult(retVal.index()), 341 maxStage - defStage + 1); 342 } 343 rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands); 344 } 345 346 llvm::SmallVector<Value> 347 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) { 348 llvm::SmallVector<Value> returnValues(forOp->getNumResults()); 349 // Emit different versions of the induction variable. They will be 350 // removed by dead code if not used. 351 for (int64_t i = 0; i < maxStage; i++) { 352 Value newlastIter = rewriter.create<arith::ConstantIndexOp>( 353 forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i)); 354 setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); 355 } 356 // Emit `maxStage - 1` epilogue part that includes operations fro stages 357 // [i; maxStage]. 358 for (int64_t i = 1; i <= maxStage; i++) { 359 for (Operation *op : opOrder) { 360 if (stages[op] < i) 361 continue; 362 Operation *newOp = rewriter.clone(*op); 363 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 364 auto it = valueMapping.find(op->getOperand(opIdx)); 365 if (it != valueMapping.end()) { 366 Value v = it->second[maxStage - stages[op] + i]; 367 assert(v); 368 newOp->setOperand(opIdx, v); 369 } 370 } 371 if (annotateFn) 372 annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1); 373 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 374 setValueMapping(op->getResult(destId), newOp->getResult(destId), 375 maxStage - stages[op] + i); 376 // If the value is a loop carried dependency update the loop argument 377 // mapping and keep track of the last version to replace the original 378 // forOp uses. 379 for (OpOperand &operand : 380 forOp.getBody()->getTerminator()->getOpOperands()) { 381 if (operand.get() != op->getResult(destId)) 382 continue; 383 unsigned version = maxStage - stages[op] + i + 1; 384 // If the version is greater than maxStage it means it maps to the 385 // original forOp returned value. 386 if (version > maxStage) { 387 returnValues[operand.getOperandNumber()] = newOp->getResult(destId); 388 continue; 389 } 390 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 391 newOp->getResult(destId), version); 392 } 393 } 394 } 395 } 396 return returnValues; 397 } 398 399 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) { 400 auto it = valueMapping.find(key); 401 // If the value is not in the map yet add a vector big enough to store all 402 // versions. 403 if (it == valueMapping.end()) 404 it = 405 valueMapping 406 .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1))) 407 .first; 408 it->second[idx] = el; 409 } 410 411 /// Generate a pipelined version of the scf.for loop based on the schedule given 412 /// as option. This applies the mechanical transformation of changing the loop 413 /// and generating the prologue/epilogue for the pipelining and doesn't make any 414 /// decision regarding the schedule. 415 /// Based on the option the loop is split into several stages. 416 /// The transformation assumes that the scheduling given by user is valid. 417 /// For example if we break a loop into 3 stages named S0, S1, S2 we would 418 /// generate the following code with the number in parenthesis the iteration 419 /// index: 420 /// S0(0) // Prologue 421 /// S0(1) S1(0) // Prologue 422 /// scf.for %I = %C0 to %N - 2 { 423 /// S0(I+2) S1(I+1) S2(I) // Pipelined kernel 424 /// } 425 /// S1(N) S2(N-1) // Epilogue 426 /// S2(N) // Epilogue 427 struct ForLoopPipelining : public OpRewritePattern<ForOp> { 428 ForLoopPipelining(const PipeliningOption &options, MLIRContext *context) 429 : OpRewritePattern<ForOp>(context), options(options) {} 430 LogicalResult matchAndRewrite(ForOp forOp, 431 PatternRewriter &rewriter) const override { 432 433 LoopPipelinerInternal pipeliner; 434 if (!pipeliner.initializeLoopInfo(forOp, options)) 435 return failure(); 436 437 // 1. Emit prologue. 438 pipeliner.emitPrologue(rewriter); 439 440 // 2. Track values used across stages. When a value cross stages it will 441 // need to be passed as loop iteration arguments. 442 // We first collect the values that are used in a different stage than where 443 // they are defined. 444 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 445 crossStageValues = pipeliner.analyzeCrossStageValues(); 446 447 // Mapping between original loop values used cross stage and the block 448 // arguments associated after pipelining. A Value may map to several 449 // arguments if its liverange spans across more than 2 stages. 450 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap; 451 // 3. Create the new kernel loop and return the block arguments mapping. 452 ForOp newForOp = 453 pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap); 454 // Create the kernel block, order ops based on user choice and remap 455 // operands. 456 pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter); 457 458 // 4. Emit the epilogue after the new forOp. 459 rewriter.setInsertionPointAfter(newForOp); 460 llvm::SmallVector<Value> returnValues = pipeliner.emitEpilogue(rewriter); 461 462 // 5. Erase the original loop and replace the uses with the epilogue output. 463 if (forOp->getNumResults() > 0) 464 rewriter.replaceOp(forOp, returnValues); 465 else 466 rewriter.eraseOp(forOp); 467 468 return success(); 469 } 470 471 protected: 472 PipeliningOption options; 473 }; 474 475 } // namespace 476 477 void mlir::scf::populateSCFLoopPipeliningPatterns( 478 RewritePatternSet &patterns, const PipeliningOption &options) { 479 patterns.add<ForLoopPipelining>(options, patterns.getContext()); 480 } 481