1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements miscellaneous loop transformation routines. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "mlir/Dialect/SCF/Utils/Utils.h" 14 #include "mlir/Analysis/SliceAnalysis.h" 15 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" 16 #include "mlir/Dialect/Func/IR/FuncOps.h" 17 #include "mlir/Dialect/SCF/SCF.h" 18 #include "mlir/IR/BlockAndValueMapping.h" 19 #include "mlir/IR/BuiltinOps.h" 20 #include "mlir/IR/PatternMatch.h" 21 #include "mlir/Support/MathExtras.h" 22 #include "mlir/Transforms/RegionUtils.h" 23 #include "llvm/ADT/STLExtras.h" 24 #include "llvm/ADT/SetVector.h" 25 #include "llvm/ADT/SmallPtrSet.h" 26 27 using namespace mlir; 28 29 namespace { 30 // This structure is to pass and return sets of loop parameters without 31 // confusing the order. 32 struct LoopParams { 33 Value lowerBound; 34 Value upperBound; 35 Value step; 36 }; 37 } // namespace 38 39 scf::ForOp mlir::cloneWithNewYields(OpBuilder &b, scf::ForOp loop, 40 ValueRange newIterOperands, 41 ValueRange newYieldedValues, 42 bool replaceLoopResults) { 43 assert(newIterOperands.size() == newYieldedValues.size() && 44 "newIterOperands must be of the same size as newYieldedValues"); 45 46 // Create a new loop before the existing one, with the extra operands. 
47 OpBuilder::InsertionGuard g(b); 48 b.setInsertionPoint(loop); 49 auto operands = llvm::to_vector<4>(loop.getIterOperands()); 50 operands.append(newIterOperands.begin(), newIterOperands.end()); 51 scf::ForOp newLoop = 52 b.create<scf::ForOp>(loop.getLoc(), loop.getLowerBound(), 53 loop.getUpperBound(), loop.getStep(), operands); 54 55 auto &loopBody = *loop.getBody(); 56 auto &newLoopBody = *newLoop.getBody(); 57 // Clone / erase the yield inside the original loop to both: 58 // 1. augment its operands with the newYieldedValues. 59 // 2. automatically apply the BlockAndValueMapping on its operand 60 auto yield = cast<scf::YieldOp>(loopBody.getTerminator()); 61 b.setInsertionPoint(yield); 62 auto yieldOperands = llvm::to_vector<4>(yield.getOperands()); 63 yieldOperands.append(newYieldedValues.begin(), newYieldedValues.end()); 64 auto newYield = b.create<scf::YieldOp>(yield.getLoc(), yieldOperands); 65 66 // Clone the loop body with remaps. 67 BlockAndValueMapping bvm; 68 // a. remap the induction variable. 69 bvm.map(loop.getInductionVar(), newLoop.getInductionVar()); 70 // b. remap the BB args. 71 bvm.map(loopBody.getArguments(), 72 newLoopBody.getArguments().take_front(loopBody.getNumArguments())); 73 // c. remap the iter args. 74 bvm.map(newIterOperands, 75 newLoop.getRegionIterArgs().take_back(newIterOperands.size())); 76 b.setInsertionPointToStart(&newLoopBody); 77 // Skip the original yield terminator which does not have enough operands. 78 for (auto &o : loopBody.without_terminator()) 79 b.clone(o, bvm); 80 81 // Replace `loop`'s results if requested. 82 if (replaceLoopResults) { 83 for (auto it : llvm::zip(loop.getResults(), newLoop.getResults().take_front( 84 loop.getNumResults()))) 85 std::get<0>(it).replaceAllUsesWith(std::get<1>(it)); 86 } 87 88 // TODO: this is unsafe in the context of a PatternRewrite. 89 newYield.erase(); 90 91 return newLoop; 92 } 93 94 /// Outline a region with a single block into a new FuncOp. 
95 /// Assumes the FuncOp result types is the type of the yielded operands of the 96 /// single block. This constraint makes it easy to determine the result. 97 /// This method also clones the `arith::ConstantIndexOp` at the start of 98 /// `outlinedFuncBody` to alloc simple canonicalizations. 99 // TODO: support more than single-block regions. 100 // TODO: more flexible constant handling. 101 FailureOr<FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter, 102 Location loc, Region ®ion, 103 StringRef funcName) { 104 assert(!funcName.empty() && "funcName cannot be empty"); 105 if (!region.hasOneBlock()) 106 return failure(); 107 108 Block *originalBlock = ®ion.front(); 109 Operation *originalTerminator = originalBlock->getTerminator(); 110 111 // Outline before current function. 112 OpBuilder::InsertionGuard g(rewriter); 113 rewriter.setInsertionPoint(region.getParentOfType<FuncOp>()); 114 115 SetVector<Value> captures; 116 getUsedValuesDefinedAbove(region, captures); 117 118 ValueRange outlinedValues(captures.getArrayRef()); 119 SmallVector<Type> outlinedFuncArgTypes; 120 SmallVector<Location> outlinedFuncArgLocs; 121 // Region's arguments are exactly the first block's arguments as per 122 // Region::getArguments(). 123 // Func's arguments are cat(regions's arguments, captures arguments). 124 for (BlockArgument arg : region.getArguments()) { 125 outlinedFuncArgTypes.push_back(arg.getType()); 126 outlinedFuncArgLocs.push_back(arg.getLoc()); 127 } 128 for (Value value : outlinedValues) { 129 outlinedFuncArgTypes.push_back(value.getType()); 130 outlinedFuncArgLocs.push_back(value.getLoc()); 131 } 132 FunctionType outlinedFuncType = 133 FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes, 134 originalTerminator->getOperandTypes()); 135 auto outlinedFunc = rewriter.create<FuncOp>(loc, funcName, outlinedFuncType); 136 Block *outlinedFuncBody = outlinedFunc.addEntryBlock(); 137 138 // Merge blocks while replacing the original block operands. 
139 // Warning: `mergeBlocks` erases the original block, reconstruct it later. 140 int64_t numOriginalBlockArguments = originalBlock->getNumArguments(); 141 auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments(); 142 { 143 OpBuilder::InsertionGuard g(rewriter); 144 rewriter.setInsertionPointToEnd(outlinedFuncBody); 145 rewriter.mergeBlocks( 146 originalBlock, outlinedFuncBody, 147 outlinedFuncBlockArgs.take_front(numOriginalBlockArguments)); 148 // Explicitly set up a new ReturnOp terminator. 149 rewriter.setInsertionPointToEnd(outlinedFuncBody); 150 rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(), 151 originalTerminator->getOperands()); 152 } 153 154 // Reconstruct the block that was deleted and add a 155 // terminator(call_results). 156 Block *newBlock = rewriter.createBlock( 157 ®ion, region.begin(), 158 TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments), 159 ArrayRef<Location>(outlinedFuncArgLocs) 160 .take_front(numOriginalBlockArguments)); 161 { 162 OpBuilder::InsertionGuard g(rewriter); 163 rewriter.setInsertionPointToEnd(newBlock); 164 SmallVector<Value> callValues; 165 llvm::append_range(callValues, newBlock->getArguments()); 166 llvm::append_range(callValues, outlinedValues); 167 Operation *call = 168 rewriter.create<func::CallOp>(loc, outlinedFunc, callValues); 169 170 // `originalTerminator` was moved to `outlinedFuncBody` and is still valid. 171 // Clone `originalTerminator` to take the callOp results then erase it from 172 // `outlinedFuncBody`. 173 BlockAndValueMapping bvm; 174 bvm.map(originalTerminator->getOperands(), call->getResults()); 175 rewriter.clone(*originalTerminator, bvm); 176 rewriter.eraseOp(originalTerminator); 177 } 178 179 // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`. 180 // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`. 
181 for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back( 182 outlinedValues.size()))) { 183 Value orig = std::get<0>(it); 184 Value repl = std::get<1>(it); 185 { 186 OpBuilder::InsertionGuard g(rewriter); 187 rewriter.setInsertionPointToStart(outlinedFuncBody); 188 if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) { 189 BlockAndValueMapping bvm; 190 repl = rewriter.clone(*cst, bvm)->getResult(0); 191 } 192 } 193 orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) { 194 return outlinedFunc->isProperAncestor(opOperand.getOwner()); 195 }); 196 } 197 198 return outlinedFunc; 199 } 200 201 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp, FuncOp *thenFn, 202 StringRef thenFnName, FuncOp *elseFn, 203 StringRef elseFnName) { 204 IRRewriter rewriter(b); 205 Location loc = ifOp.getLoc(); 206 FailureOr<FuncOp> outlinedFuncOpOrFailure; 207 if (thenFn && !ifOp.getThenRegion().empty()) { 208 outlinedFuncOpOrFailure = outlineSingleBlockRegion( 209 rewriter, loc, ifOp.getThenRegion(), thenFnName); 210 if (failed(outlinedFuncOpOrFailure)) 211 return failure(); 212 *thenFn = *outlinedFuncOpOrFailure; 213 } 214 if (elseFn && !ifOp.getElseRegion().empty()) { 215 outlinedFuncOpOrFailure = outlineSingleBlockRegion( 216 rewriter, loc, ifOp.getElseRegion(), elseFnName); 217 if (failed(outlinedFuncOpOrFailure)) 218 return failure(); 219 *elseFn = *outlinedFuncOpOrFailure; 220 } 221 return success(); 222 } 223 224 bool mlir::getInnermostParallelLoops(Operation *rootOp, 225 SmallVectorImpl<scf::ParallelOp> &result) { 226 assert(rootOp != nullptr && "Root operation must not be a nullptr."); 227 bool rootEnclosesPloops = false; 228 for (Region ®ion : rootOp->getRegions()) { 229 for (Block &block : region.getBlocks()) { 230 for (Operation &op : block) { 231 bool enclosesPloops = getInnermostParallelLoops(&op, result); 232 rootEnclosesPloops |= enclosesPloops; 233 if (auto ploop = dyn_cast<scf::ParallelOp>(op)) { 234 rootEnclosesPloops = 
true; 235 236 // Collect parallel loop if it is an innermost one. 237 if (!enclosesPloops) 238 result.push_back(ploop); 239 } 240 } 241 } 242 } 243 return rootEnclosesPloops; 244 } 245 246 // Build the IR that performs ceil division of a positive value by a constant: 247 // ceildiv(a, B) = divis(a + (B-1), B) 248 // where divis is rounding-to-zero division. 249 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, 250 int64_t divisor) { 251 assert(divisor > 0 && "expected positive divisor"); 252 assert(dividend.getType().isIndex() && "expected index-typed value"); 253 254 Value divisorMinusOneCst = 255 builder.create<arith::ConstantIndexOp>(loc, divisor - 1); 256 Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor); 257 Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst); 258 return builder.create<arith::DivSIOp>(loc, sum, divisorCst); 259 } 260 261 // Build the IR that performs ceil division of a positive value by another 262 // positive value: 263 // ceildiv(a, b) = divis(a + (b - 1), b) 264 // where divis is rounding-to-zero division. 265 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, 266 Value divisor) { 267 assert(dividend.getType().isIndex() && "expected index-typed value"); 268 269 Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1); 270 Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne); 271 Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne); 272 return builder.create<arith::DivSIOp>(loc, sum, divisor); 273 } 274 275 /// Helper to replace uses of loop carried values (iter_args) and loop 276 /// yield values while promoting single iteration scf.for ops. 277 static void replaceIterArgsAndYieldResults(scf::ForOp forOp) { 278 // Replace uses of iter arguments with iter operands (initial values). 
279 auto iterOperands = forOp.getIterOperands(); 280 auto iterArgs = forOp.getRegionIterArgs(); 281 for (auto e : llvm::zip(iterOperands, iterArgs)) 282 std::get<1>(e).replaceAllUsesWith(std::get<0>(e)); 283 284 // Replace uses of loop results with the values yielded by the loop. 285 auto outerResults = forOp.getResults(); 286 auto innerResults = forOp.getBody()->getTerminator()->getOperands(); 287 for (auto e : llvm::zip(outerResults, innerResults)) 288 std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); 289 } 290 291 /// Promotes the loop body of a forOp to its containing block if the forOp 292 /// it can be determined that the loop has a single iteration. 293 LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) { 294 auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 295 auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 296 auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 297 if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 || 298 ubCstOp.value() < 0 || stepCstOp.value() < 0) 299 return failure(); 300 int64_t tripCount = 301 mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value()); 302 if (tripCount != 1) 303 return failure(); 304 auto iv = forOp.getInductionVar(); 305 iv.replaceAllUsesWith(lbCstOp); 306 307 replaceIterArgsAndYieldResults(forOp); 308 309 // Move the loop body operations, except for its terminator, to the loop's 310 // containing block. 311 auto *parentBlock = forOp->getBlock(); 312 forOp.getBody()->getTerminator()->erase(); 313 parentBlock->getOperations().splice(Block::iterator(forOp), 314 forOp.getBody()->getOperations()); 315 forOp.erase(); 316 return success(); 317 } 318 319 /// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with 320 /// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 321 /// 'forOpIV' for each unrolled body. 
If specified, annotates the Ops in each 322 /// unrolled iteration using annotateFn. 323 static void generateUnrolledLoop( 324 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, 325 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn, 326 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn, 327 ValueRange iterArgs, ValueRange yieldedValues) { 328 // Builder to insert unrolled bodies just before the terminator of the body of 329 // 'forOp'. 330 auto builder = OpBuilder::atBlockTerminator(loopBodyBlock); 331 332 if (!annotateFn) 333 annotateFn = [](unsigned, Operation *, OpBuilder) {}; 334 335 // Keep a pointer to the last non-terminator operation in the original block 336 // so that we know what to clone (since we are doing this in-place). 337 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2); 338 339 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). 340 SmallVector<Value, 4> lastYielded(yieldedValues); 341 342 for (unsigned i = 1; i < unrollFactor; i++) { 343 BlockAndValueMapping operandMap; 344 345 // Prepare operand map. 346 operandMap.map(iterArgs, lastYielded); 347 348 // If the induction variable is used, create a remapping to the value for 349 // this unrolled instance. 350 if (!forOpIV.use_empty()) { 351 Value ivUnroll = ivRemapFn(i, forOpIV, builder); 352 operandMap.map(forOpIV, ivUnroll); 353 } 354 355 // Clone the original body of 'forOp'. 356 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) { 357 Operation *clonedOp = builder.clone(*it, operandMap); 358 annotateFn(i, clonedOp, builder); 359 } 360 361 // Update yielded values. 362 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) 363 lastYielded[i] = operandMap.lookup(yieldedValues[i]); 364 } 365 366 // Make sure we annotate the Ops in the original body. We do this last so that 367 // any annotations are not copied into the cloned Ops above. 
368 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) 369 annotateFn(0, &*it, builder); 370 371 // Update operands of the yield statement. 372 loopBodyBlock->getTerminator()->setOperands(lastYielded); 373 } 374 375 /// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled. 376 LogicalResult mlir::loopUnrollByFactor( 377 scf::ForOp forOp, uint64_t unrollFactor, 378 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) { 379 assert(unrollFactor > 0 && "expected positive unroll factor"); 380 381 // Return if the loop body is empty. 382 if (llvm::hasSingleElement(forOp.getBody()->getOperations())) 383 return success(); 384 385 // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate 386 // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases. 387 OpBuilder boundsBuilder(forOp); 388 auto loc = forOp.getLoc(); 389 auto step = forOp.getStep(); 390 Value upperBoundUnrolled; 391 Value stepUnrolled; 392 bool generateEpilogueLoop = true; 393 394 auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 395 auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 396 auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 397 if (lbCstOp && ubCstOp && stepCstOp) { 398 // Constant loop bounds computation. 
399 int64_t lbCst = lbCstOp.value(); 400 int64_t ubCst = ubCstOp.value(); 401 int64_t stepCst = stepCstOp.value(); 402 assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 && 403 "expected positive loop bounds and step"); 404 int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst); 405 406 if (unrollFactor == 1) { 407 if (tripCount == 1 && failed(promoteIfSingleIteration(forOp))) 408 return failure(); 409 return success(); 410 } 411 412 int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor); 413 int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst; 414 assert(upperBoundUnrolledCst <= ubCst); 415 int64_t stepUnrolledCst = stepCst * unrollFactor; 416 417 // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. 418 generateEpilogueLoop = upperBoundUnrolledCst < ubCst; 419 if (generateEpilogueLoop) 420 upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>( 421 loc, upperBoundUnrolledCst); 422 else 423 upperBoundUnrolled = ubCstOp; 424 425 // Create constant for 'stepUnrolled'. 426 stepUnrolled = stepCst == stepUnrolledCst 427 ? step 428 : boundsBuilder.create<arith::ConstantIndexOp>( 429 loc, stepUnrolledCst); 430 } else { 431 // Dynamic loop bounds computation. 432 // TODO: Add dynamic asserts for negative lb/ub/step, or 433 // consider using ceilDiv from AffineApplyExpander. 
434 auto lowerBound = forOp.getLowerBound(); 435 auto upperBound = forOp.getUpperBound(); 436 Value diff = 437 boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound); 438 Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); 439 Value unrollFactorCst = 440 boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor); 441 Value tripCountRem = 442 boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst); 443 // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) 444 Value tripCountEvenMultiple = 445 boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem); 446 // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step 447 upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>( 448 loc, lowerBound, 449 boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step)); 450 // Scale 'step' by 'unrollFactor'. 451 stepUnrolled = 452 boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst); 453 } 454 455 // Create epilogue clean up loop starting at 'upperBoundUnrolled'. 456 if (generateEpilogueLoop) { 457 OpBuilder epilogueBuilder(forOp->getContext()); 458 epilogueBuilder.setInsertionPoint(forOp->getBlock(), 459 std::next(Block::iterator(forOp))); 460 auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp)); 461 epilogueForOp.setLowerBound(upperBoundUnrolled); 462 463 // Update uses of loop results. 464 auto results = forOp.getResults(); 465 auto epilogueResults = epilogueForOp.getResults(); 466 auto epilogueIterOperands = epilogueForOp.getIterOperands(); 467 468 for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) { 469 std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); 470 epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e)); 471 } 472 (void)promoteIfSingleIteration(epilogueForOp); 473 } 474 475 // Create unrolled loop. 
476 forOp.setUpperBound(upperBoundUnrolled); 477 forOp.setStep(stepUnrolled); 478 479 auto iterArgs = ValueRange(forOp.getRegionIterArgs()); 480 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands(); 481 482 generateUnrolledLoop( 483 forOp.getBody(), forOp.getInductionVar(), unrollFactor, 484 [&](unsigned i, Value iv, OpBuilder b) { 485 // iv' = iv + step * i; 486 auto stride = b.create<arith::MulIOp>( 487 loc, step, b.create<arith::ConstantIndexOp>(loc, i)); 488 return b.create<arith::AddIOp>(loc, iv, stride); 489 }, 490 annotateFn, iterArgs, yieldedValues); 491 // Promote the loop body up if this has turned into a single iteration loop. 492 (void)promoteIfSingleIteration(forOp); 493 return success(); 494 } 495 496 /// Return the new lower bound, upper bound, and step in that order. Insert any 497 /// additional bounds calculations before the given builder and any additional 498 /// conversion back to the original loop induction value inside the given Block. 499 static LoopParams normalizeLoop(OpBuilder &boundsBuilder, 500 OpBuilder &insideLoopBuilder, Location loc, 501 Value lowerBound, Value upperBound, Value step, 502 Value inductionVar) { 503 // Check if the loop is already known to have a constant zero lower bound or 504 // a constant one step. 505 bool isZeroBased = false; 506 if (auto ubCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>()) 507 isZeroBased = ubCst.value() == 0; 508 509 bool isStepOne = false; 510 if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>()) 511 isStepOne = stepCst.value() == 1; 512 513 // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) 514 // assuming the step is strictly positive. Update the bounds and the step 515 // of the loop to go from 0 to the number of iterations, if necessary. 516 // TODO: introduce support for negative steps or emit dynamic asserts 517 // on step positivity, whatever gets implemented first. 
518 if (isZeroBased && isStepOne) 519 return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound, 520 /*step=*/step}; 521 522 Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound); 523 Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step); 524 525 Value newLowerBound = 526 isZeroBased ? lowerBound 527 : boundsBuilder.create<arith::ConstantIndexOp>(loc, 0); 528 Value newStep = 529 isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1); 530 531 // Insert code computing the value of the original loop induction variable 532 // from the "normalized" one. 533 Value scaled = 534 isStepOne 535 ? inductionVar 536 : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step); 537 Value shifted = 538 isZeroBased 539 ? scaled 540 : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound); 541 542 SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(), 543 shifted.getDefiningOp()}; 544 inductionVar.replaceAllUsesExcept(shifted, preserve); 545 return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound, 546 /*step=*/newStep}; 547 } 548 549 /// Transform a loop with a strictly positive step 550 /// for %i = %lb to %ub step %s 551 /// into a 0-based loop with step 1 552 /// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { 553 /// %i = %ii * %s + %lb 554 /// Insert the induction variable remapping in the body of `inner`, which is 555 /// expected to be either `loop` or another loop perfectly nested under `loop`. 556 /// Insert the definition of new bounds immediate before `outer`, which is 557 /// expected to be either `loop` or its parent in the loop nest. 
static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
  // New bound computations are inserted before `outer`; the IV un-normalization
  // is inserted at the top of `inner`'s body.
  OpBuilder builder(outer);
  OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
  auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
                                  loop.getLowerBound(), loop.getUpperBound(),
                                  loop.getStep(), loop.getInductionVar());

  loop.setLowerBound(loopPieces.lowerBound);
  loop.setUpperBound(loopPieces.upperBound);
  loop.setStep(loopPieces.step);
}

/// Replace a perfect nest of `loops` (outermost first) by a single loop: each
/// loop is first normalized to iterate from 0 with step 1, the outermost
/// loop's upper bound becomes the product of all upper bounds, the original
/// induction variables are recomputed from the linearized one via div/rem,
/// and the innermost body is spliced into the outermost loop.
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
  if (loops.size() < 2)
    return;

  scf::ForOp innermost = loops.back();
  scf::ForOp outermost = loops.front();

  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
  // allows the following code to assume upperBound is the number of iterations.
  for (auto loop : loops)
    normalizeLoop(loop, outermost, innermost);

  // 2. Emit code computing the upper bound of the coalesced loop as product
  // of the number of iterations of all loops.
  OpBuilder builder(outermost);
  Location loc = outermost.getLoc();
  Value upperBound = outermost.getUpperBound();
  for (auto loop : loops.drop_front())
    upperBound =
        builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
  outermost.setUpperBound(upperBound);

  builder.setInsertionPointToStart(outermost.getBody());

  // 3. Remap induction variables. For each original loop, the value of the
  // induction variable can be obtained by dividing the induction variable of
  // the linearized loop by the total number of iterations of the loops nested
  // in it modulo the number of iterations in this loop (remove the values
  // related to the outer loops):
  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  // Compute these iteratively from the innermost loop by creating a "running
  // quotient" of division by the range.
  Value previous = outermost.getInductionVar();
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    // Process loops innermost-first: idx counts down from loops.size() - 1.
    unsigned idx = loops.size() - i - 1;
    if (i != 0)
      previous = builder.create<arith::DivSIOp>(loc, previous,
                                                loops[idx + 1].getUpperBound());

    // The outermost loop (last iteration) needs no `rem`: the quotient is
    // already its induction value.
    Value iv = (i == e - 1) ? previous
                            : builder.create<arith::RemSIOp>(
                                  loc, previous, loops[idx].getUpperBound());
    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
                               loops.back().getRegion());
  }

  // 4. Move the operations from the innermost just above the second-outermost
  // loop, delete the extra terminator and the second-outermost loop.
  scf::ForOp second = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(second.getOperation()),
      innermost.getBody()->getOperations());
  second.erase();
}

/// Collapse the dimensions of a single scf.parallel `loops` according to
/// `combinedDimensions`: each entry lists the original dimensions that are
/// merged into one dimension of the new scf.parallel. Each original dimension
/// is first normalized to [0, ub) step 1; the new upper bound of a combined
/// dimension is the product of the normalized bounds, and the original
/// induction values are recovered inside the new body via div/rem.
void mlir::collapseParallelLoops(
    scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
  OpBuilder outsideBuilder(loops);
  Location loc = loops.getLoc();

  // Presort combined dimensions.
  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
  for (auto &dims : sortedDimensions)
    std::sort(dims.begin(), dims.end());

  // Normalize ParallelOp's iteration pattern.
  SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
      normalizedUpperBounds;
  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
    OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
    auto resultBounds =
        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
                      loops.getLowerBound()[i], loops.getUpperBound()[i],
                      loops.getStep()[i], loops.getBody()->getArgument(i));

    normalizedLowerBounds.push_back(resultBounds.lowerBound);
    normalizedUpperBounds.push_back(resultBounds.upperBound);
    normalizedSteps.push_back(resultBounds.step);
  }

  // Combine iteration spaces.
  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
  auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
  auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
  for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
    // Product of the normalized upper bounds of the combined dimensions.
    Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
    for (auto idx : sortedDimensions[i]) {
      newUpperBound = outsideBuilder.create<arith::MulIOp>(
          loc, newUpperBound, normalizedUpperBounds[idx]);
    }
    lowerBounds.push_back(cst0);
    steps.push_back(cst1);
    upperBounds.push_back(newUpperBound);
  }

  // Create new ParallelLoop with conversions to the original induction values.
  // The loop below uses divisions to get the relevant range of values in the
  // new induction value that represent each range of the original induction
  // value. The remainders then determine based on that range, which iteration
  // of the original induction value this represents. This is a normalized value
  // that is un-normalized already by the previous logic.
  auto newPloop = outsideBuilder.create<scf::ParallelOp>(
      loc, lowerBounds, upperBounds, steps,
      [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
        for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
          Value previous = ploopIVs[i];
          unsigned numberCombinedDimensions = combinedDimensions[i].size();
          // Iterate over all except the last induction value.
          for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
            unsigned idx = combinedDimensions[i][j];

            // Determine the current induction value's current loop iteration
            Value iv = insideBuilder.create<arith::RemSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
            replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
                                       loops.getRegion());

            // Remove the effect of the current induction value to prepare for
            // the next value.
            previous = insideBuilder.create<arith::DivSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
          }

          // The final induction value is just the remaining value.
          unsigned idx = combinedDimensions[i][0];
          replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
                                     previous, loops.getRegion());
        }
      });

  // Replace the old loop with the new loop.
  loops.getBody()->back().erase();
  newPloop.getBody()->getOperations().splice(
      Block::iterator(newPloop.getBody()->back()),
      loops.getBody()->getOperations());
  loops.erase();
}

// Hoist the ops within `outer` that appear before `inner`.
// Such ops include the ops that have been introduced by parametric tiling.
// Ops that come from triangular loops (i.e. that belong to the program slice
// rooted at `outer`) and ops that have side effects cannot be hoisted.
// Return failure when any op fails to hoist.
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
  SetVector<Operation *> forwardSlice;
  // Slice of ops transitively depending on `outer`'s IV, stopping at `inner`.
  getForwardSlice(
      outer.getInductionVar(), &forwardSlice,
      [&inner](Operation *op) { return op != inner.getOperation(); });
  LogicalResult status = success();
  SmallVector<Operation *, 8> toHoist;
  for (auto &op : outer.getBody()->without_terminator()) {
    // Stop when encountering the inner loop.
    if (&op == inner.getOperation())
      break;
    // Skip over non-hoistable ops.
    if (forwardSlice.count(&op) > 0) {
      status = failure();
      continue;
    }
    // Skip intermediate scf::ForOp, these are not considered a failure.
    if (isa<scf::ForOp>(op))
      continue;
    // Skip other ops with regions.
    if (op.getNumRegions() > 0) {
      status = failure();
      continue;
    }
    // Skip if op has side effects.
    // TODO: loads to immutable memory regions are ok.
    if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
      status = failure();
      continue;
    }
    toHoist.push_back(&op);
  }
  // Ops are moved only after scanning, so the iteration above is not
  // invalidated.
  auto *outerForOp = outer.getOperation();
  for (auto *op : toHoist)
    op->moveBefore(outerForOp);
  return status;
}

// Traverse the interTile and intraTile loops and try to hoist ops such that
// bands of perfectly nested loops are isolated.
// Return failure if either perfect interTile or perfect intraTile bands cannot
// be formed.
static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
  LogicalResult status = success();
  const Loops &interTile = tileLoops.first;
  const Loops &intraTile = tileLoops.second;
  auto size = interTile.size();
  assert(size == intraTile.size());
  if (size <= 1)
    return success();
  // A single failed hoist makes the whole result a failure, but later hoists
  // are still attempted.
  for (unsigned s = 1; s < size; ++s)
    status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
                               : failure();
  for (unsigned s = 1; s < size; ++s)
    status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
                               : failure();
  return status;
}

/// Collect perfectly nested loops starting from `rootForOps`. Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop. Collect at most `maxLoops` loops and append them to
/// `forOps`.
777 template <typename T> 778 static void getPerfectlyNestedLoopsImpl( 779 SmallVectorImpl<T> &forOps, T rootForOp, 780 unsigned maxLoops = std::numeric_limits<unsigned>::max()) { 781 for (unsigned i = 0; i < maxLoops; ++i) { 782 forOps.push_back(rootForOp); 783 Block &body = rootForOp.getRegion().front(); 784 if (body.begin() != std::prev(body.end(), 2)) 785 return; 786 787 rootForOp = dyn_cast<T>(&body.front()); 788 if (!rootForOp) 789 return; 790 } 791 } 792 793 static Loops stripmineSink(scf::ForOp forOp, Value factor, 794 ArrayRef<scf::ForOp> targets) { 795 auto originalStep = forOp.getStep(); 796 auto iv = forOp.getInductionVar(); 797 798 OpBuilder b(forOp); 799 forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor)); 800 801 Loops innerLoops; 802 for (auto t : targets) { 803 // Save information for splicing ops out of t when done 804 auto begin = t.getBody()->begin(); 805 auto nOps = t.getBody()->getOperations().size(); 806 807 // Insert newForOp before the terminator of `t`. 808 auto b = OpBuilder::atBlockTerminator((t.getBody())); 809 Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep()); 810 Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt, 811 forOp.getUpperBound(), stepped); 812 Value ub = b.create<arith::SelectOp>(t.getLoc(), less, 813 forOp.getUpperBound(), stepped); 814 815 // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses. 816 auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep); 817 newForOp.getBody()->getOperations().splice( 818 newForOp.getBody()->getOperations().begin(), 819 t.getBody()->getOperations(), begin, std::next(begin, nOps - 1)); 820 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(), 821 newForOp.getRegion()); 822 823 innerLoops.push_back(newForOp); 824 } 825 826 return innerLoops; 827 } 828 829 // Stripmines a `forOp` by `factor` and sinks it under a single `target`. 
830 // Returns the new for operation, nested immediately under `target`. 831 template <typename SizeType> 832 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor, 833 scf::ForOp target) { 834 // TODO: Use cheap structural assertions that targets are nested under 835 // forOp and that targets are not nested under each other when DominanceInfo 836 // exposes the capability. It seems overkill to construct a whole function 837 // dominance tree at this point. 838 auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target)); 839 assert(res.size() == 1 && "Expected 1 inner forOp"); 840 return res[0]; 841 } 842 843 SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps, 844 ArrayRef<Value> sizes, 845 ArrayRef<scf::ForOp> targets) { 846 SmallVector<SmallVector<scf::ForOp, 8>, 8> res; 847 SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end()); 848 for (auto it : llvm::zip(forOps, sizes)) { 849 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets); 850 res.push_back(step); 851 currentTargets = step; 852 } 853 return res; 854 } 855 856 Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes, 857 scf::ForOp target) { 858 SmallVector<scf::ForOp, 8> res; 859 for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) { 860 assert(loops.size() == 1); 861 res.push_back(loops[0]); 862 } 863 return res; 864 } 865 866 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) { 867 // Collect perfectly nested loops. If more size values provided than nested 868 // loops available, truncate `sizes`. 
869 SmallVector<scf::ForOp, 4> forOps; 870 forOps.reserve(sizes.size()); 871 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); 872 if (forOps.size() < sizes.size()) 873 sizes = sizes.take_front(forOps.size()); 874 875 return ::tile(forOps, sizes, forOps.back()); 876 } 877 878 void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops, 879 scf::ForOp root) { 880 getPerfectlyNestedLoopsImpl(nestedLoops, root); 881 } 882 883 TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp, 884 ArrayRef<int64_t> sizes) { 885 // Collect perfectly nested loops. If more size values provided than nested 886 // loops available, truncate `sizes`. 887 SmallVector<scf::ForOp, 4> forOps; 888 forOps.reserve(sizes.size()); 889 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); 890 if (forOps.size() < sizes.size()) 891 sizes = sizes.take_front(forOps.size()); 892 893 // Compute the tile sizes such that i-th outer loop executes size[i] 894 // iterations. Given that the loop current executes 895 // numIterations = ceildiv((upperBound - lowerBound), step) 896 // iterations, we need to tile with size ceildiv(numIterations, size[i]). 897 SmallVector<Value, 4> tileSizes; 898 tileSizes.reserve(sizes.size()); 899 for (unsigned i = 0, e = sizes.size(); i < e; ++i) { 900 assert(sizes[i] > 0 && "expected strictly positive size for strip-mining"); 901 902 auto forOp = forOps[i]; 903 OpBuilder builder(forOp); 904 auto loc = forOp.getLoc(); 905 Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(), 906 forOp.getLowerBound()); 907 Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep()); 908 Value iterationsPerBlock = 909 ceilDivPositive(builder, loc, numIterations, sizes[i]); 910 tileSizes.push_back(iterationsPerBlock); 911 } 912 913 // Call parametric tiling with the given sizes. 
914 auto intraTile = tile(forOps, tileSizes, forOps.back()); 915 TileLoops tileLoops = std::make_pair(forOps, intraTile); 916 917 // TODO: for now we just ignore the result of band isolation. 918 // In the future, mapping decisions may be impacted by the ability to 919 // isolate perfectly nested bands. 920 (void)tryIsolateBands(tileLoops); 921 922 return tileLoops; 923 } 924