1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements loop software pipelining 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "PassDetail.h" 14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 15 #include "mlir/Dialect/SCF/IR/SCF.h" 16 #include "mlir/Dialect/SCF/Transforms/Patterns.h" 17 #include "mlir/Dialect/SCF/Transforms/Transforms.h" 18 #include "mlir/Dialect/SCF/Utils/Utils.h" 19 #include "mlir/IR/BlockAndValueMapping.h" 20 #include "mlir/IR/PatternMatch.h" 21 #include "mlir/Support/MathExtras.h" 22 23 using namespace mlir; 24 using namespace mlir::scf; 25 26 namespace { 27 28 /// Helper to keep internal information during pipelining transformation. 29 struct LoopPipelinerInternal { 30 /// Coarse liverange information for ops used across stages. 31 struct LiverangeInfo { 32 unsigned lastUseStage = 0; 33 unsigned defStage = 0; 34 }; 35 36 protected: 37 ForOp forOp; 38 unsigned maxStage = 0; 39 DenseMap<Operation *, unsigned> stages; 40 std::vector<Operation *> opOrder; 41 int64_t ub; 42 int64_t lb; 43 int64_t step; 44 PipeliningOption::AnnotationlFnType annotateFn = nullptr; 45 bool peelEpilogue; 46 PipeliningOption::PredicateOpFn predicateFn = nullptr; 47 48 // When peeling the kernel we generate several version of each value for 49 // different stage of the prologue. This map tracks the mapping between 50 // original Values in the loop and the different versions 51 // peeled from the loop. 52 DenseMap<Value, llvm::SmallVector<Value>> valueMapping; 53 54 /// Assign a value to `valueMapping`, this means `val` represents the version 55 /// `idx` of `key` in the epilogue. 56 void setValueMapping(Value key, Value el, int64_t idx); 57 58 public: 59 /// Initalize the information for the given `op`, return true if it 60 /// satisfies the pre-condition to apply pipelining. 61 bool initializeLoopInfo(ForOp op, const PipeliningOption &options); 62 /// Emits the prologue, this creates `maxStage - 1` part which will contain 63 /// operations from stages [0; i], where i is the part index. 64 void emitPrologue(PatternRewriter &rewriter); 65 /// Gather liverange information for Values that are used in a different stage 66 /// than its definition. 67 llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues(); 68 scf::ForOp createKernelLoop( 69 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 70 PatternRewriter &rewriter, 71 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap); 72 /// Emits the pipelined kernel. This clones loop operations following user 73 /// order and remaps operands defined in a different stage as their use. 74 void createKernel( 75 scf::ForOp newForOp, 76 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues, 77 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 78 PatternRewriter &rewriter); 79 /// Emits the epilogue, this creates `maxStage - 1` part which will contain 80 /// operations from stages [i; maxStage], where i is the part index. 81 llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter); 82 }; 83 84 bool LoopPipelinerInternal::initializeLoopInfo( 85 ForOp op, const PipeliningOption &options) { 86 forOp = op; 87 auto upperBoundCst = 88 forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 89 auto lowerBoundCst = 90 forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 91 auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 92 if (!upperBoundCst || !lowerBoundCst || !stepCst) 93 return false; 94 ub = upperBoundCst.value(); 95 lb = lowerBoundCst.value(); 96 step = stepCst.value(); 97 peelEpilogue = options.peelEpilogue; 98 predicateFn = options.predicateFn; 99 if (!peelEpilogue && predicateFn == nullptr) 100 return false; 101 int64_t numIteration = ceilDiv(ub - lb, step); 102 std::vector<std::pair<Operation *, unsigned>> schedule; 103 options.getScheduleFn(forOp, schedule); 104 if (schedule.empty()) 105 return false; 106 107 opOrder.reserve(schedule.size()); 108 for (auto &opSchedule : schedule) { 109 maxStage = std::max(maxStage, opSchedule.second); 110 stages[opSchedule.first] = opSchedule.second; 111 opOrder.push_back(opSchedule.first); 112 } 113 if (numIteration <= maxStage) 114 return false; 115 116 // All operations need to have a stage. 117 if (forOp 118 .walk([this](Operation *op) { 119 if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) && 120 stages.find(op) == stages.end()) 121 return WalkResult::interrupt(); 122 return WalkResult::advance(); 123 }) 124 .wasInterrupted()) 125 return false; 126 127 // Only support loop carried dependency with a distance of 1. This means the 128 // source of all the scf.yield operands needs to be defined by operations in 129 // the loop. 130 if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), 131 [this](Value operand) { 132 Operation *def = operand.getDefiningOp(); 133 return !def || stages.find(def) == stages.end(); 134 })) 135 return false; 136 annotateFn = options.annotateFn; 137 return true; 138 } 139 140 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) { 141 // Initialize the iteration argument to the loop initiale values. 142 for (BlockArgument &arg : forOp.getRegionIterArgs()) { 143 OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); 144 setValueMapping(arg, operand.get(), 0); 145 } 146 auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator()); 147 for (int64_t i = 0; i < maxStage; i++) { 148 // special handling for induction variable as the increment is implicit. 149 Value iv = 150 rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step); 151 setValueMapping(forOp.getInductionVar(), iv, i); 152 for (Operation *op : opOrder) { 153 if (stages[op] > i) 154 continue; 155 Operation *newOp = rewriter.clone(*op); 156 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 157 auto it = valueMapping.find(op->getOperand(opIdx)); 158 if (it != valueMapping.end()) 159 newOp->setOperand(opIdx, it->second[i - stages[op]]); 160 } 161 if (annotateFn) 162 annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i); 163 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 164 setValueMapping(op->getResult(destId), newOp->getResult(destId), 165 i - stages[op]); 166 // If the value is a loop carried dependency update the loop argument 167 // mapping. 168 for (OpOperand &operand : yield->getOpOperands()) { 169 if (operand.get() != op->getResult(destId)) 170 continue; 171 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 172 newOp->getResult(destId), i - stages[op] + 1); 173 } 174 } 175 } 176 } 177 } 178 179 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 180 LoopPipelinerInternal::analyzeCrossStageValues() { 181 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues; 182 for (Operation *op : opOrder) { 183 unsigned stage = stages[op]; 184 for (OpOperand &operand : op->getOpOperands()) { 185 Operation *def = operand.get().getDefiningOp(); 186 if (!def) 187 continue; 188 auto defStage = stages.find(def); 189 if (defStage == stages.end() || defStage->second == stage) 190 continue; 191 assert(stage > defStage->second); 192 LiverangeInfo &info = crossStageValues[operand.get()]; 193 info.defStage = defStage->second; 194 info.lastUseStage = std::max(info.lastUseStage, stage); 195 } 196 } 197 return crossStageValues; 198 } 199 200 scf::ForOp LoopPipelinerInternal::createKernelLoop( 201 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 202 &crossStageValues, 203 PatternRewriter &rewriter, 204 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) { 205 // Creates the list of initial values associated to values used across 206 // stages. The initial values come from the prologue created above. 207 // Keep track of the kernel argument associated to each version of the 208 // values passed to the kernel. 209 llvm::SmallVector<Value> newLoopArg; 210 // For existing loop argument initialize them with the right version from the 211 // prologue. 212 for (const auto &retVal : 213 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 214 Operation *def = retVal.value().getDefiningOp(); 215 assert(def && "Only support loop carried dependencies of distance 1"); 216 unsigned defStage = stages[def]; 217 Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]] 218 [maxStage - defStage]; 219 assert(valueVersion); 220 newLoopArg.push_back(valueVersion); 221 } 222 for (auto escape : crossStageValues) { 223 LiverangeInfo &info = escape.second; 224 Value value = escape.first; 225 for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage; 226 stageIdx++) { 227 Value valueVersion = 228 valueMapping[value][maxStage - info.lastUseStage + stageIdx]; 229 assert(valueVersion); 230 newLoopArg.push_back(valueVersion); 231 loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage - 232 stageIdx)] = newLoopArg.size() - 1; 233 } 234 } 235 236 // Create the new kernel loop. When we peel the epilgue we need to peel 237 // `numStages - 1` iterations. Then we adjust the upper bound to remove those 238 // iterations. 239 Value newUb = forOp.getUpperBound(); 240 if (peelEpilogue) 241 newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), 242 ub - maxStage * step); 243 auto newForOp = 244 rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb, 245 forOp.getStep(), newLoopArg); 246 return newForOp; 247 } 248 249 void LoopPipelinerInternal::createKernel( 250 scf::ForOp newForOp, 251 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 252 &crossStageValues, 253 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap, 254 PatternRewriter &rewriter) { 255 valueMapping.clear(); 256 257 // Create the kernel, we clone instruction based on the order given by 258 // user and remap operands coming from a previous stages. 259 rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); 260 BlockAndValueMapping mapping; 261 mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); 262 for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) { 263 mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); 264 } 265 SmallVector<Value> predicates(maxStage + 1, nullptr); 266 if (!peelEpilogue) { 267 // Create a predicate for each stage except the last stage. 268 for (unsigned i = 0; i < maxStage; i++) { 269 Value c = rewriter.create<arith::ConstantIndexOp>( 270 newForOp.getLoc(), ub - (maxStage - i) * step); 271 Value pred = rewriter.create<arith::CmpIOp>( 272 newForOp.getLoc(), arith::CmpIPredicate::slt, 273 newForOp.getInductionVar(), c); 274 predicates[i] = pred; 275 } 276 } 277 for (Operation *op : opOrder) { 278 int64_t useStage = stages[op]; 279 auto *newOp = rewriter.clone(*op, mapping); 280 for (OpOperand &operand : op->getOpOperands()) { 281 // Special case for the induction variable uses. We replace it with a 282 // version incremented based on the stage where it is used. 283 if (operand.get() == forOp.getInductionVar()) { 284 rewriter.setInsertionPoint(newOp); 285 Value offset = rewriter.create<arith::ConstantIndexOp>( 286 forOp.getLoc(), (maxStage - stages[op]) * step); 287 Value iv = rewriter.create<arith::AddIOp>( 288 forOp.getLoc(), newForOp.getInductionVar(), offset); 289 newOp->setOperand(operand.getOperandNumber(), iv); 290 rewriter.setInsertionPointAfter(newOp); 291 continue; 292 } 293 auto arg = operand.get().dyn_cast<BlockArgument>(); 294 if (arg && arg.getOwner() == forOp.getBody()) { 295 // If the value is a loop carried value coming from stage N + 1 remap, 296 // it will become a direct use. 297 Value ret = forOp.getBody()->getTerminator()->getOperand( 298 arg.getArgNumber() - 1); 299 Operation *dep = ret.getDefiningOp(); 300 if (!dep) 301 continue; 302 auto stageDep = stages.find(dep); 303 if (stageDep == stages.end() || stageDep->second == useStage) 304 continue; 305 assert(stageDep->second == useStage + 1); 306 newOp->setOperand(operand.getOperandNumber(), 307 mapping.lookupOrDefault(ret)); 308 continue; 309 } 310 // For operands defined in a previous stage we need to remap it to use 311 // the correct region argument. We look for the right version of the 312 // Value based on the stage where it is used. 313 Operation *def = operand.get().getDefiningOp(); 314 if (!def) 315 continue; 316 auto stageDef = stages.find(def); 317 if (stageDef == stages.end() || stageDef->second == useStage) 318 continue; 319 auto remap = loopArgMap.find( 320 std::make_pair(operand.get(), useStage - stageDef->second)); 321 assert(remap != loopArgMap.end()); 322 newOp->setOperand(operand.getOperandNumber(), 323 newForOp.getRegionIterArgs()[remap->second]); 324 } 325 if (predicates[useStage]) { 326 newOp = predicateFn(newOp, predicates[useStage], rewriter); 327 // Remap the results to the new predicated one. 328 for (auto values : llvm::zip(op->getResults(), newOp->getResults())) 329 mapping.map(std::get<0>(values), std::get<1>(values)); 330 } 331 rewriter.setInsertionPointAfter(newOp); 332 if (annotateFn) 333 annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0); 334 } 335 336 // Collect the Values that need to be returned by the forOp. For each 337 // value we need to have `LastUseStage - DefStage` number of versions 338 // returned. 339 // We create a mapping between original values and the associated loop 340 // returned values that will be needed by the epilogue. 341 llvm::SmallVector<Value> yieldOperands; 342 for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) { 343 yieldOperands.push_back(mapping.lookupOrDefault(retVal)); 344 } 345 for (auto &it : crossStageValues) { 346 int64_t version = maxStage - it.second.lastUseStage + 1; 347 unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage; 348 // add the original verstion to yield ops. 349 // If there is a liverange spanning across more than 2 stages we need to add 350 // extra arg. 351 for (unsigned i = 1; i < numVersionReturned; i++) { 352 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 353 version++); 354 yieldOperands.push_back( 355 newForOp.getBody()->getArguments()[yieldOperands.size() + 1 + 356 newForOp.getNumInductionVars()]); 357 } 358 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()), 359 version++); 360 yieldOperands.push_back(mapping.lookupOrDefault(it.first)); 361 } 362 // Map the yield operand to the forOp returned value. 363 for (const auto &retVal : 364 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) { 365 Operation *def = retVal.value().getDefiningOp(); 366 assert(def && "Only support loop carried dependencies of distance 1"); 367 unsigned defStage = stages[def]; 368 setValueMapping(forOp.getRegionIterArgs()[retVal.index()], 369 newForOp->getResult(retVal.index()), 370 maxStage - defStage + 1); 371 } 372 rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands); 373 } 374 375 llvm::SmallVector<Value> 376 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) { 377 llvm::SmallVector<Value> returnValues(forOp->getNumResults()); 378 // Emit different versions of the induction variable. They will be 379 // removed by dead code if not used. 380 for (int64_t i = 0; i < maxStage; i++) { 381 Value newlastIter = rewriter.create<arith::ConstantIndexOp>( 382 forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i)); 383 setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); 384 } 385 // Emit `maxStage - 1` epilogue part that includes operations fro stages 386 // [i; maxStage]. 387 for (int64_t i = 1; i <= maxStage; i++) { 388 for (Operation *op : opOrder) { 389 if (stages[op] < i) 390 continue; 391 Operation *newOp = rewriter.clone(*op); 392 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) { 393 auto it = valueMapping.find(op->getOperand(opIdx)); 394 if (it != valueMapping.end()) { 395 Value v = it->second[maxStage - stages[op] + i]; 396 assert(v); 397 newOp->setOperand(opIdx, v); 398 } 399 } 400 if (annotateFn) 401 annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1); 402 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) { 403 setValueMapping(op->getResult(destId), newOp->getResult(destId), 404 maxStage - stages[op] + i); 405 // If the value is a loop carried dependency update the loop argument 406 // mapping and keep track of the last version to replace the original 407 // forOp uses. 408 for (OpOperand &operand : 409 forOp.getBody()->getTerminator()->getOpOperands()) { 410 if (operand.get() != op->getResult(destId)) 411 continue; 412 unsigned version = maxStage - stages[op] + i + 1; 413 // If the version is greater than maxStage it means it maps to the 414 // original forOp returned value. 415 if (version > maxStage) { 416 returnValues[operand.getOperandNumber()] = newOp->getResult(destId); 417 continue; 418 } 419 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()], 420 newOp->getResult(destId), version); 421 } 422 } 423 } 424 } 425 return returnValues; 426 } 427 428 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) { 429 auto it = valueMapping.find(key); 430 // If the value is not in the map yet add a vector big enough to store all 431 // versions. 432 if (it == valueMapping.end()) 433 it = 434 valueMapping 435 .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1))) 436 .first; 437 it->second[idx] = el; 438 } 439 440 } // namespace 441 442 FailureOr<ForOp> ForLoopPipeliningPattern::returningMatchAndRewrite( 443 ForOp forOp, PatternRewriter &rewriter) const { 444 445 LoopPipelinerInternal pipeliner; 446 if (!pipeliner.initializeLoopInfo(forOp, options)) 447 return failure(); 448 449 // 1. Emit prologue. 450 pipeliner.emitPrologue(rewriter); 451 452 // 2. Track values used across stages. When a value cross stages it will 453 // need to be passed as loop iteration arguments. 454 // We first collect the values that are used in a different stage than where 455 // they are defined. 456 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> 457 crossStageValues = pipeliner.analyzeCrossStageValues(); 458 459 // Mapping between original loop values used cross stage and the block 460 // arguments associated after pipelining. A Value may map to several 461 // arguments if its liverange spans across more than 2 stages. 462 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap; 463 // 3. Create the new kernel loop and return the block arguments mapping. 464 ForOp newForOp = 465 pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap); 466 // Create the kernel block, order ops based on user choice and remap 467 // operands. 468 pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter); 469 470 llvm::SmallVector<Value> returnValues = 471 newForOp.getResults().take_front(forOp->getNumResults()); 472 if (options.peelEpilogue) { 473 // 4. Emit the epilogue after the new forOp. 474 rewriter.setInsertionPointAfter(newForOp); 475 returnValues = pipeliner.emitEpilogue(rewriter); 476 } 477 // 5. Erase the original loop and replace the uses with the epilogue output. 478 if (forOp->getNumResults() > 0) 479 rewriter.replaceOp(forOp, returnValues); 480 else 481 rewriter.eraseOp(forOp); 482 483 return newForOp; 484 } 485 486 void mlir::scf::populateSCFLoopPipeliningPatterns( 487 RewritePatternSet &patterns, const PipeliningOption &options) { 488 patterns.add<ForLoopPipeliningPattern>(options, patterns.getContext()); 489 } 490