//===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements miscellaneous loop transformation routines.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/MathExtras.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

namespace {
// This structure is to pass and return sets of loop parameters without
// confusing the order.
struct LoopParams {
  Value lowerBound;
  Value upperBound;
  Value step;
};
} // namespace

/// Create a clone of `loop` with `newIterOperands` added as additional
/// iteration operands (and matching block arguments / results), using
/// `newYieldValuesFn` to produce the values yielded for them. The body of
/// `loop` is moved into the new loop; `loop` itself is reduced to a dead
/// no-op loop that the caller is expected to erase. Returns the new loop.
scf::ForOp
mlir::replaceLoopWithNewYields(OpBuilder &builder, scf::ForOp loop,
                               ValueRange newIterOperands,
                               const NewYieldValueFn &newYieldValuesFn) {
  // Create a new loop before the existing one, with the extra operands.
  OpBuilder::InsertionGuard g(builder);
  builder.setInsertionPoint(loop);
  auto operands = llvm::to_vector(loop.getIterOperands());
  operands.append(newIterOperands.begin(), newIterOperands.end());
  // The empty body-builder callback suppresses the automatic creation of an
  // scf.yield terminator; the original body (with its terminator) is spliced
  // in below instead.
  scf::ForOp newLoop = builder.create<scf::ForOp>(
      loop.getLoc(), loop.getLowerBound(), loop.getUpperBound(), loop.getStep(),
      operands, [](OpBuilder &, Location, Value, ValueRange) {});

  Block *loopBody = loop.getBody();
  Block *newLoopBody = newLoop.getBody();

  // Move the body of the original loop to the new loop.
  newLoopBody->getOperations().splice(newLoopBody->end(),
                                      loopBody->getOperations());

  // Generate the new yield values to use by using the callback and append the
  // yield values to the scf.yield operation.
  auto yield = cast<scf::YieldOp>(newLoopBody->getTerminator());
  ArrayRef<BlockArgument> newBBArgs =
      newLoopBody->getArguments().take_back(newIterOperands.size());
  {
    OpBuilder::InsertionGuard g(builder);
    builder.setInsertionPoint(yield);
    SmallVector<Value> newYieldedValues =
        newYieldValuesFn(builder, loop.getLoc(), newBBArgs);
    assert(newIterOperands.size() == newYieldedValues.size() &&
           "expected as many new yield values as new iter operands");
    yield.getResultsMutable().append(newYieldedValues);
  }

  // Remap the BlockArguments from the original loop to the new loop
  // BlockArguments.
  ArrayRef<BlockArgument> bbArgs = loopBody->getArguments();
  for (auto it :
       llvm::zip(bbArgs, newLoopBody->getArguments().take_front(bbArgs.size())))
    std::get<0>(it).replaceAllUsesWith(std::get<1>(it));

  // Replace all uses of `newIterOperands` with the corresponding basic block
  // arguments, but only for uses nested inside the new loop (uses outside must
  // keep referring to the original values).
  for (auto it : llvm::zip(newIterOperands, newBBArgs)) {
    std::get<0>(it).replaceUsesWithIf(std::get<1>(it), [&](OpOperand &use) {
      Operation *user = use.getOwner();
      return newLoop->isProperAncestor(user);
    });
  }

  // Replace all uses of the original loop with corresponding values from the
  // new loop.
  loop.replaceAllUsesWith(
      newLoop.getResults().take_front(loop.getNumResults()));

  // Add a fake yield to the original loop body that just returns the
  // BlockArguments corresponding to the iter_args. This makes it a no-op loop.
  // The loop is dead. The caller is expected to erase it.
  builder.setInsertionPointToEnd(loopBody);
  builder.create<scf::YieldOp>(loop->getLoc(), loop.getRegionIterArgs());

  return newLoop;
}

/// Apply `replaceLoopWithNewYields` to every loop of a perfectly nested loop
/// nest (outermost first in `loopNest`), threading `newIterOperands` through
/// the whole nest. The innermost loop yields the values produced by
/// `newYieldValueFn`; each enclosing loop simply yields the corresponding
/// results of its (already rewritten) child. Returns the rewritten loops in
/// the same outermost-first order. The original loops are left as dead
/// no-op loops (see `replaceLoopWithNewYields`).
SmallVector<scf::ForOp> mlir::replaceLoopNestWithNewYields(
    OpBuilder &builder, ArrayRef<scf::ForOp> loopNest,
    ValueRange newIterOperands, NewYieldValueFn newYieldValueFn) {
  if (loopNest.empty())
    return {};
  SmallVector<scf::ForOp> newLoopNest(loopNest.size());

  // Rewrite the innermost loop first; it is the only one that uses the
  // caller-provided yield callback.
  newLoopNest.back() = replaceLoopWithNewYields(
      builder, loopNest.back(), newIterOperands, newYieldValueFn);

  // Walk outwards: each outer loop forwards the extra results of the loop
  // nested immediately inside it.
  for (unsigned loopDepth :
       llvm::reverse(llvm::seq<unsigned>(0, loopNest.size() - 1))) {
    NewYieldValueFn fn = [&](OpBuilder &innerBuilder, Location loc,
                             ArrayRef<BlockArgument> innerNewBBArgs) {
      SmallVector<Value> newYields(
          newLoopNest[loopDepth + 1]->getResults().take_back(
              newIterOperands.size()));
      return newYields;
    };
    newLoopNest[loopDepth] = replaceLoopWithNewYields(
        builder, loopNest[loopDepth], newIterOperands, fn);
  }
  return newLoopNest;
}

/// Outline a region with a single block into a new FuncOp.
/// Assumes the FuncOp result types is the type of the yielded operands of the
/// single block. This constraint makes it easy to determine the result.
133 /// This method also clones the `arith::ConstantIndexOp` at the start of 134 /// `outlinedFuncBody` to alloc simple canonicalizations. If `callOp` is 135 /// provided, it will be set to point to the operation that calls the outlined 136 /// function. 137 // TODO: support more than single-block regions. 138 // TODO: more flexible constant handling. 139 FailureOr<func::FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter, 140 Location loc, 141 Region ®ion, 142 StringRef funcName, 143 func::CallOp *callOp) { 144 assert(!funcName.empty() && "funcName cannot be empty"); 145 if (!region.hasOneBlock()) 146 return failure(); 147 148 Block *originalBlock = ®ion.front(); 149 Operation *originalTerminator = originalBlock->getTerminator(); 150 151 // Outline before current function. 152 OpBuilder::InsertionGuard g(rewriter); 153 rewriter.setInsertionPoint(region.getParentOfType<func::FuncOp>()); 154 155 SetVector<Value> captures; 156 getUsedValuesDefinedAbove(region, captures); 157 158 ValueRange outlinedValues(captures.getArrayRef()); 159 SmallVector<Type> outlinedFuncArgTypes; 160 SmallVector<Location> outlinedFuncArgLocs; 161 // Region's arguments are exactly the first block's arguments as per 162 // Region::getArguments(). 163 // Func's arguments are cat(regions's arguments, captures arguments). 
164 for (BlockArgument arg : region.getArguments()) { 165 outlinedFuncArgTypes.push_back(arg.getType()); 166 outlinedFuncArgLocs.push_back(arg.getLoc()); 167 } 168 for (Value value : outlinedValues) { 169 outlinedFuncArgTypes.push_back(value.getType()); 170 outlinedFuncArgLocs.push_back(value.getLoc()); 171 } 172 FunctionType outlinedFuncType = 173 FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes, 174 originalTerminator->getOperandTypes()); 175 auto outlinedFunc = 176 rewriter.create<func::FuncOp>(loc, funcName, outlinedFuncType); 177 Block *outlinedFuncBody = outlinedFunc.addEntryBlock(); 178 179 // Merge blocks while replacing the original block operands. 180 // Warning: `mergeBlocks` erases the original block, reconstruct it later. 181 int64_t numOriginalBlockArguments = originalBlock->getNumArguments(); 182 auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments(); 183 { 184 OpBuilder::InsertionGuard g(rewriter); 185 rewriter.setInsertionPointToEnd(outlinedFuncBody); 186 rewriter.mergeBlocks( 187 originalBlock, outlinedFuncBody, 188 outlinedFuncBlockArgs.take_front(numOriginalBlockArguments)); 189 // Explicitly set up a new ReturnOp terminator. 190 rewriter.setInsertionPointToEnd(outlinedFuncBody); 191 rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(), 192 originalTerminator->getOperands()); 193 } 194 195 // Reconstruct the block that was deleted and add a 196 // terminator(call_results). 
197 Block *newBlock = rewriter.createBlock( 198 ®ion, region.begin(), 199 TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments), 200 ArrayRef<Location>(outlinedFuncArgLocs) 201 .take_front(numOriginalBlockArguments)); 202 { 203 OpBuilder::InsertionGuard g(rewriter); 204 rewriter.setInsertionPointToEnd(newBlock); 205 SmallVector<Value> callValues; 206 llvm::append_range(callValues, newBlock->getArguments()); 207 llvm::append_range(callValues, outlinedValues); 208 auto call = rewriter.create<func::CallOp>(loc, outlinedFunc, callValues); 209 if (callOp) 210 *callOp = call; 211 212 // `originalTerminator` was moved to `outlinedFuncBody` and is still valid. 213 // Clone `originalTerminator` to take the callOp results then erase it from 214 // `outlinedFuncBody`. 215 BlockAndValueMapping bvm; 216 bvm.map(originalTerminator->getOperands(), call->getResults()); 217 rewriter.clone(*originalTerminator, bvm); 218 rewriter.eraseOp(originalTerminator); 219 } 220 221 // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`. 222 // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`. 
223 for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back( 224 outlinedValues.size()))) { 225 Value orig = std::get<0>(it); 226 Value repl = std::get<1>(it); 227 { 228 OpBuilder::InsertionGuard g(rewriter); 229 rewriter.setInsertionPointToStart(outlinedFuncBody); 230 if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) { 231 BlockAndValueMapping bvm; 232 repl = rewriter.clone(*cst, bvm)->getResult(0); 233 } 234 } 235 orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) { 236 return outlinedFunc->isProperAncestor(opOperand.getOwner()); 237 }); 238 } 239 240 return outlinedFunc; 241 } 242 243 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp, 244 func::FuncOp *thenFn, StringRef thenFnName, 245 func::FuncOp *elseFn, StringRef elseFnName) { 246 IRRewriter rewriter(b); 247 Location loc = ifOp.getLoc(); 248 FailureOr<func::FuncOp> outlinedFuncOpOrFailure; 249 if (thenFn && !ifOp.getThenRegion().empty()) { 250 outlinedFuncOpOrFailure = outlineSingleBlockRegion( 251 rewriter, loc, ifOp.getThenRegion(), thenFnName); 252 if (failed(outlinedFuncOpOrFailure)) 253 return failure(); 254 *thenFn = *outlinedFuncOpOrFailure; 255 } 256 if (elseFn && !ifOp.getElseRegion().empty()) { 257 outlinedFuncOpOrFailure = outlineSingleBlockRegion( 258 rewriter, loc, ifOp.getElseRegion(), elseFnName); 259 if (failed(outlinedFuncOpOrFailure)) 260 return failure(); 261 *elseFn = *outlinedFuncOpOrFailure; 262 } 263 return success(); 264 } 265 266 bool mlir::getInnermostParallelLoops(Operation *rootOp, 267 SmallVectorImpl<scf::ParallelOp> &result) { 268 assert(rootOp != nullptr && "Root operation must not be a nullptr."); 269 bool rootEnclosesPloops = false; 270 for (Region ®ion : rootOp->getRegions()) { 271 for (Block &block : region.getBlocks()) { 272 for (Operation &op : block) { 273 bool enclosesPloops = getInnermostParallelLoops(&op, result); 274 rootEnclosesPloops |= enclosesPloops; 275 if (auto ploop = dyn_cast<scf::ParallelOp>(op)) { 276 
rootEnclosesPloops = true; 277 278 // Collect parallel loop if it is an innermost one. 279 if (!enclosesPloops) 280 result.push_back(ploop); 281 } 282 } 283 } 284 } 285 return rootEnclosesPloops; 286 } 287 288 // Build the IR that performs ceil division of a positive value by a constant: 289 // ceildiv(a, B) = divis(a + (B-1), B) 290 // where divis is rounding-to-zero division. 291 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, 292 int64_t divisor) { 293 assert(divisor > 0 && "expected positive divisor"); 294 assert(dividend.getType().isIndex() && "expected index-typed value"); 295 296 Value divisorMinusOneCst = 297 builder.create<arith::ConstantIndexOp>(loc, divisor - 1); 298 Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor); 299 Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst); 300 return builder.create<arith::DivSIOp>(loc, sum, divisorCst); 301 } 302 303 // Build the IR that performs ceil division of a positive value by another 304 // positive value: 305 // ceildiv(a, b) = divis(a + (b - 1), b) 306 // where divis is rounding-to-zero division. 307 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, 308 Value divisor) { 309 assert(dividend.getType().isIndex() && "expected index-typed value"); 310 311 Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1); 312 Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne); 313 Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne); 314 return builder.create<arith::DivSIOp>(loc, sum, divisor); 315 } 316 317 /// Helper to replace uses of loop carried values (iter_args) and loop 318 /// yield values while promoting single iteration scf.for ops. 319 static void replaceIterArgsAndYieldResults(scf::ForOp forOp) { 320 // Replace uses of iter arguments with iter operands (initial values). 
321 auto iterOperands = forOp.getIterOperands(); 322 auto iterArgs = forOp.getRegionIterArgs(); 323 for (auto e : llvm::zip(iterOperands, iterArgs)) 324 std::get<1>(e).replaceAllUsesWith(std::get<0>(e)); 325 326 // Replace uses of loop results with the values yielded by the loop. 327 auto outerResults = forOp.getResults(); 328 auto innerResults = forOp.getBody()->getTerminator()->getOperands(); 329 for (auto e : llvm::zip(outerResults, innerResults)) 330 std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); 331 } 332 333 /// Promotes the loop body of a forOp to its containing block if the forOp 334 /// it can be determined that the loop has a single iteration. 335 LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) { 336 auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 337 auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 338 auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 339 if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 || 340 ubCstOp.value() < 0 || stepCstOp.value() < 0) 341 return failure(); 342 int64_t tripCount = 343 mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value()); 344 if (tripCount != 1) 345 return failure(); 346 auto iv = forOp.getInductionVar(); 347 iv.replaceAllUsesWith(lbCstOp); 348 349 replaceIterArgsAndYieldResults(forOp); 350 351 // Move the loop body operations, except for its terminator, to the loop's 352 // containing block. 353 auto *parentBlock = forOp->getBlock(); 354 forOp.getBody()->getTerminator()->erase(); 355 parentBlock->getOperations().splice(Block::iterator(forOp), 356 forOp.getBody()->getOperations()); 357 forOp.erase(); 358 return success(); 359 } 360 361 /// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with 362 /// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 363 /// 'forOpIV' for each unrolled body. 
If specified, annotates the Ops in each 364 /// unrolled iteration using annotateFn. 365 static void generateUnrolledLoop( 366 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, 367 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn, 368 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn, 369 ValueRange iterArgs, ValueRange yieldedValues) { 370 // Builder to insert unrolled bodies just before the terminator of the body of 371 // 'forOp'. 372 auto builder = OpBuilder::atBlockTerminator(loopBodyBlock); 373 374 if (!annotateFn) 375 annotateFn = [](unsigned, Operation *, OpBuilder) {}; 376 377 // Keep a pointer to the last non-terminator operation in the original block 378 // so that we know what to clone (since we are doing this in-place). 379 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2); 380 381 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). 382 SmallVector<Value, 4> lastYielded(yieldedValues); 383 384 for (unsigned i = 1; i < unrollFactor; i++) { 385 BlockAndValueMapping operandMap; 386 387 // Prepare operand map. 388 operandMap.map(iterArgs, lastYielded); 389 390 // If the induction variable is used, create a remapping to the value for 391 // this unrolled instance. 392 if (!forOpIV.use_empty()) { 393 Value ivUnroll = ivRemapFn(i, forOpIV, builder); 394 operandMap.map(forOpIV, ivUnroll); 395 } 396 397 // Clone the original body of 'forOp'. 398 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) { 399 Operation *clonedOp = builder.clone(*it, operandMap); 400 annotateFn(i, clonedOp, builder); 401 } 402 403 // Update yielded values. 404 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) 405 lastYielded[i] = operandMap.lookup(yieldedValues[i]); 406 } 407 408 // Make sure we annotate the Ops in the original body. We do this last so that 409 // any annotations are not copied into the cloned Ops above. 
410 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) 411 annotateFn(0, &*it, builder); 412 413 // Update operands of the yield statement. 414 loopBodyBlock->getTerminator()->setOperands(lastYielded); 415 } 416 417 /// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled. 418 LogicalResult mlir::loopUnrollByFactor( 419 scf::ForOp forOp, uint64_t unrollFactor, 420 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) { 421 assert(unrollFactor > 0 && "expected positive unroll factor"); 422 423 // Return if the loop body is empty. 424 if (llvm::hasSingleElement(forOp.getBody()->getOperations())) 425 return success(); 426 427 // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate 428 // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases. 429 OpBuilder boundsBuilder(forOp); 430 auto loc = forOp.getLoc(); 431 auto step = forOp.getStep(); 432 Value upperBoundUnrolled; 433 Value stepUnrolled; 434 bool generateEpilogueLoop = true; 435 436 auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>(); 437 auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>(); 438 auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>(); 439 if (lbCstOp && ubCstOp && stepCstOp) { 440 // Constant loop bounds computation. 
441 int64_t lbCst = lbCstOp.value(); 442 int64_t ubCst = ubCstOp.value(); 443 int64_t stepCst = stepCstOp.value(); 444 assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 && 445 "expected positive loop bounds and step"); 446 int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst); 447 448 if (unrollFactor == 1) { 449 if (tripCount == 1 && failed(promoteIfSingleIteration(forOp))) 450 return failure(); 451 return success(); 452 } 453 454 int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor); 455 int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst; 456 assert(upperBoundUnrolledCst <= ubCst); 457 int64_t stepUnrolledCst = stepCst * unrollFactor; 458 459 // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. 460 generateEpilogueLoop = upperBoundUnrolledCst < ubCst; 461 if (generateEpilogueLoop) 462 upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>( 463 loc, upperBoundUnrolledCst); 464 else 465 upperBoundUnrolled = ubCstOp; 466 467 // Create constant for 'stepUnrolled'. 468 stepUnrolled = stepCst == stepUnrolledCst 469 ? step 470 : boundsBuilder.create<arith::ConstantIndexOp>( 471 loc, stepUnrolledCst); 472 } else { 473 // Dynamic loop bounds computation. 474 // TODO: Add dynamic asserts for negative lb/ub/step, or 475 // consider using ceilDiv from AffineApplyExpander. 
476 auto lowerBound = forOp.getLowerBound(); 477 auto upperBound = forOp.getUpperBound(); 478 Value diff = 479 boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound); 480 Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); 481 Value unrollFactorCst = 482 boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor); 483 Value tripCountRem = 484 boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst); 485 // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) 486 Value tripCountEvenMultiple = 487 boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem); 488 // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step 489 upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>( 490 loc, lowerBound, 491 boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step)); 492 // Scale 'step' by 'unrollFactor'. 493 stepUnrolled = 494 boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst); 495 } 496 497 // Create epilogue clean up loop starting at 'upperBoundUnrolled'. 498 if (generateEpilogueLoop) { 499 OpBuilder epilogueBuilder(forOp->getContext()); 500 epilogueBuilder.setInsertionPoint(forOp->getBlock(), 501 std::next(Block::iterator(forOp))); 502 auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp)); 503 epilogueForOp.setLowerBound(upperBoundUnrolled); 504 505 // Update uses of loop results. 506 auto results = forOp.getResults(); 507 auto epilogueResults = epilogueForOp.getResults(); 508 509 for (auto e : llvm::zip(results, epilogueResults)) { 510 std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); 511 } 512 epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(), 513 epilogueForOp.getNumIterOperands(), results); 514 (void)promoteIfSingleIteration(epilogueForOp); 515 } 516 517 // Create unrolled loop. 
518 forOp.setUpperBound(upperBoundUnrolled); 519 forOp.setStep(stepUnrolled); 520 521 auto iterArgs = ValueRange(forOp.getRegionIterArgs()); 522 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands(); 523 524 generateUnrolledLoop( 525 forOp.getBody(), forOp.getInductionVar(), unrollFactor, 526 [&](unsigned i, Value iv, OpBuilder b) { 527 // iv' = iv + step * i; 528 auto stride = b.create<arith::MulIOp>( 529 loc, step, b.create<arith::ConstantIndexOp>(loc, i)); 530 return b.create<arith::AddIOp>(loc, iv, stride); 531 }, 532 annotateFn, iterArgs, yieldedValues); 533 // Promote the loop body up if this has turned into a single iteration loop. 534 (void)promoteIfSingleIteration(forOp); 535 return success(); 536 } 537 538 /// Return the new lower bound, upper bound, and step in that order. Insert any 539 /// additional bounds calculations before the given builder and any additional 540 /// conversion back to the original loop induction value inside the given Block. 541 static LoopParams normalizeLoop(OpBuilder &boundsBuilder, 542 OpBuilder &insideLoopBuilder, Location loc, 543 Value lowerBound, Value upperBound, Value step, 544 Value inductionVar) { 545 // Check if the loop is already known to have a constant zero lower bound or 546 // a constant one step. 547 bool isZeroBased = false; 548 if (auto ubCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>()) 549 isZeroBased = ubCst.value() == 0; 550 551 bool isStepOne = false; 552 if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>()) 553 isStepOne = stepCst.value() == 1; 554 555 // Compute the number of iterations the loop executes: ceildiv(ub - lb, step) 556 // assuming the step is strictly positive. Update the bounds and the step 557 // of the loop to go from 0 to the number of iterations, if necessary. 558 // TODO: introduce support for negative steps or emit dynamic asserts 559 // on step positivity, whatever gets implemented first. 
560 if (isZeroBased && isStepOne) 561 return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound, 562 /*step=*/step}; 563 564 Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound); 565 Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step); 566 567 Value newLowerBound = 568 isZeroBased ? lowerBound 569 : boundsBuilder.create<arith::ConstantIndexOp>(loc, 0); 570 Value newStep = 571 isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1); 572 573 // Insert code computing the value of the original loop induction variable 574 // from the "normalized" one. 575 Value scaled = 576 isStepOne 577 ? inductionVar 578 : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step); 579 Value shifted = 580 isZeroBased 581 ? scaled 582 : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound); 583 584 SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(), 585 shifted.getDefiningOp()}; 586 inductionVar.replaceAllUsesExcept(shifted, preserve); 587 return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound, 588 /*step=*/newStep}; 589 } 590 591 /// Transform a loop with a strictly positive step 592 /// for %i = %lb to %ub step %s 593 /// into a 0-based loop with step 1 594 /// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 { 595 /// %i = %ii * %s + %lb 596 /// Insert the induction variable remapping in the body of `inner`, which is 597 /// expected to be either `loop` or another loop perfectly nested under `loop`. 598 /// Insert the definition of new bounds immediate before `outer`, which is 599 /// expected to be either `loop` or its parent in the loop nest. 
static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
  // Bounds computations go before `outer`; the IV remapping goes at the top
  // of `inner`'s body, where the original IV is still in scope.
  OpBuilder builder(outer);
  OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
  auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
                                  loop.getLowerBound(), loop.getUpperBound(),
                                  loop.getStep(), loop.getInductionVar());

  loop.setLowerBound(loopPieces.lowerBound);
  loop.setUpperBound(loopPieces.upperBound);
  loop.setStep(loopPieces.step);
}

/// Coalesce a perfect nest of scf.for loops (outermost first in `loops`)
/// into the single outermost loop, whose trip count becomes the product of
/// all the individual trip counts.
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
  if (loops.size() < 2)
    return;

  scf::ForOp innermost = loops.back();
  scf::ForOp outermost = loops.front();

  // 1. Make sure all loops iterate from 0 to upperBound with step 1.  This
  // allows the following code to assume upperBound is the number of iterations.
  for (auto loop : loops)
    normalizeLoop(loop, outermost, innermost);

  // 2. Emit code computing the upper bound of the coalesced loop as product
  // of the number of iterations of all loops.
  OpBuilder builder(outermost);
  Location loc = outermost.getLoc();
  Value upperBound = outermost.getUpperBound();
  for (auto loop : loops.drop_front())
    upperBound =
        builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
  outermost.setUpperBound(upperBound);

  builder.setInsertionPointToStart(outermost.getBody());

  // 3. Remap induction variables. For each original loop, the value of the
  // induction variable can be obtained by dividing the induction variable of
  // the linearized loop by the total number of iterations of the loops nested
  // in it modulo the number of iterations in this loop (remove the values
  // related to the outer loops):
  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  // Compute these iteratively from the innermost loop by creating a "running
  // quotient" of division by the range.
  Value previous = outermost.getInductionVar();
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    // Process innermost (idx == loops.size()-1) to outermost (idx == 0).
    unsigned idx = loops.size() - i - 1;
    if (i != 0)
      previous = builder.create<arith::DivSIOp>(loc, previous,
                                                loops[idx + 1].getUpperBound());

    // The outermost IV needs no `mod`: everything coarser has been divided
    // out already.
    Value iv = (i == e - 1) ? previous
                            : builder.create<arith::RemSIOp>(
                                  loc, previous, loops[idx].getUpperBound());
    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
                               loops.back().getRegion());
  }

  // 4. Move the operations from the innermost just above the second-outermost
  // loop, delete the extra terminator and the second-outermost loop.
  scf::ForOp second = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(second.getOperation()),
      innermost.getBody()->getOperations());
  second.erase();
}

/// Collapse the dimensions of `loops` according to `combinedDimensions`:
/// each entry lists the original loop dimensions that are merged into one
/// dimension of a newly created scf.parallel op that replaces `loops`.
void mlir::collapseParallelLoops(
    scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
  OpBuilder outsideBuilder(loops);
  Location loc = loops.getLoc();

  // Presort combined dimensions.
  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
  for (auto &dims : sortedDimensions)
    std::sort(dims.begin(), dims.end());

  // Normalize ParallelOp's iteration pattern.
  SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
      normalizedUpperBounds;
  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
    OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
    auto resultBounds =
        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
                      loops.getLowerBound()[i], loops.getUpperBound()[i],
                      loops.getStep()[i], loops.getBody()->getArgument(i));

    normalizedLowerBounds.push_back(resultBounds.lowerBound);
    normalizedUpperBounds.push_back(resultBounds.upperBound);
    normalizedSteps.push_back(resultBounds.step);
  }

  // Combine iteration spaces.
  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
  auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
  auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
  for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
    // Combined upper bound = product of the member dimensions' upper bounds.
    Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
    for (auto idx : sortedDimensions[i]) {
      newUpperBound = outsideBuilder.create<arith::MulIOp>(
          loc, newUpperBound, normalizedUpperBounds[idx]);
    }
    lowerBounds.push_back(cst0);
    steps.push_back(cst1);
    upperBounds.push_back(newUpperBound);
  }

  // Create new ParallelLoop with conversions to the original induction values.
  // The loop below uses divisions to get the relevant range of values in the
  // new induction value that represent each range of the original induction
  // value. The remainders then determine based on that range, which iteration
  // of the original induction value this represents. This is a normalized value
  // that is un-normalized already by the previous logic.
  auto newPloop = outsideBuilder.create<scf::ParallelOp>(
      loc, lowerBounds, upperBounds, steps,
      [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
        for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
          Value previous = ploopIVs[i];
          unsigned numberCombinedDimensions = combinedDimensions[i].size();
          // Iterate over all except the last induction value.
          for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
            unsigned idx = combinedDimensions[i][j];

            // Determine the current induction value's current loop iteration
            Value iv = insideBuilder.create<arith::RemSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
            replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
                                       loops.getRegion());

            // Remove the effect of the current induction value to prepare for
            // the next value.
            previous = insideBuilder.create<arith::DivSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
          }

          // The final induction value is just the remaining value.
          unsigned idx = combinedDimensions[i][0];
          replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
                                     previous, loops.getRegion());
        }
      });

  // Replace the old loop with the new loop.
  loops.getBody()->back().erase();
  newPloop.getBody()->getOperations().splice(
      Block::iterator(newPloop.getBody()->back()),
      loops.getBody()->getOperations());
  loops.erase();
}

// Hoist the ops within `outer` that appear before `inner`.
// Such ops include the ops that have been introduced by parametric tiling.
// Ops that come from triangular loops (i.e. that belong to the program slice
// rooted at `outer`) and ops that have side effects cannot be hoisted.
// Return failure when any op fails to hoist.
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
  // The forward slice of the outer IV (stopping at `inner`) is everything
  // that depends on the outer loop and therefore must stay inside it.
  SetVector<Operation *> forwardSlice;
  getForwardSlice(
      outer.getInductionVar(), &forwardSlice,
      [&inner](Operation *op) { return op != inner.getOperation(); });
  LogicalResult status = success();
  SmallVector<Operation *, 8> toHoist;
  for (auto &op : outer.getBody()->without_terminator()) {
    // Stop when encountering the inner loop.
    if (&op == inner.getOperation())
      break;
    // Skip over non-hoistable ops.
    if (forwardSlice.count(&op) > 0) {
      status = failure();
      continue;
    }
    // Skip intermediate scf::ForOp, these are not considered a failure.
    if (isa<scf::ForOp>(op))
      continue;
    // Skip other ops with regions.
    if (op.getNumRegions() > 0) {
      status = failure();
      continue;
    }
    // Skip if op has side effects.
    // TODO: loads to immutable memory regions are ok.
    if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
      status = failure();
      continue;
    }
    toHoist.push_back(&op);
  }
  // Move after the scan: mutating the block while iterating it would
  // invalidate the traversal above.
  auto *outerForOp = outer.getOperation();
  for (auto *op : toHoist)
    op->moveBefore(outerForOp);
  return status;
}

// Traverse the interTile and intraTile loops and try to hoist ops such that
// bands of perfectly nested loops are isolated.
// Return failure if either perfect interTile or perfect intraTile bands cannot
// be formed.
static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
  LogicalResult status = success();
  const Loops &interTile = tileLoops.first;
  const Loops &intraTile = tileLoops.second;
  auto size = interTile.size();
  assert(size == intraTile.size());
  if (size <= 1)
    return success();
  // Once any hoist fails, keep the overall status as failure but stop doing
  // further hoisting work.
  for (unsigned s = 1; s < size; ++s)
    status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
                               : failure();
  for (unsigned s = 1; s < size; ++s)
    status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
                               : failure();
  return status;
}

/// Collect perfectly nested loops starting from `rootForOps`.  Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop.  Collect at most `maxLoops` loops and append them to
/// `forOps`.
template <typename T>
static void getPerfectlyNestedLoopsImpl(
    SmallVectorImpl<T> &forOps, T rootForOp,
    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
  for (unsigned i = 0; i < maxLoops; ++i) {
    forOps.push_back(rootForOp);
    Block &body = rootForOp.getRegion().front();
    // Stop if the body holds anything besides one op plus the terminator,
    // i.e. `body.begin()` is not the second-to-last operation.
    if (body.begin() != std::prev(body.end(), 2))
      return;

    // Descend only if that single op is itself a loop of the same kind.
    rootForOp = dyn_cast<T>(&body.front());
    if (!rootForOp)
      return;
  }
}

// Strip-mine `forOp` by `factor`: scale its step by `factor` in place and
// re-create the strip of `factor` original iterations as a new inner loop
// inside each of `targets`. Returns the inner loops, one per target.
static Loops stripmineSink(scf::ForOp forOp, Value factor,
                           ArrayRef<scf::ForOp> targets) {
  auto originalStep = forOp.getStep();
  auto iv = forOp.getInductionVar();

  // Scale the outer step first; the new step value is reused below to clamp
  // each inner loop's upper bound.
  OpBuilder b(forOp);
  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));

  Loops innerLoops;
  for (auto t : targets) {
    // Save information for splicing ops out of t when done
    auto begin = t.getBody()->begin();
    auto nOps = t.getBody()->getOperations().size();

    // Insert newForOp before the terminator of `t`.
    auto b = OpBuilder::atBlockTerminator((t.getBody()));
    // Inner loop runs from iv to min(iv + scaledStep, outer upper bound) with
    // the original step, so the trailing partial strip is clamped correctly.
    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
    Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
                                         forOp.getUpperBound(), stepped);
    Value ub = b.create<arith::SelectOp>(t.getLoc(), less,
                                         forOp.getUpperBound(), stepped);

    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());

    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}

// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
872 // Returns the new for operation, nested immediately under `target`. 873 template <typename SizeType> 874 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor, 875 scf::ForOp target) { 876 // TODO: Use cheap structural assertions that targets are nested under 877 // forOp and that targets are not nested under each other when DominanceInfo 878 // exposes the capability. It seems overkill to construct a whole function 879 // dominance tree at this point. 880 auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target)); 881 assert(res.size() == 1 && "Expected 1 inner forOp"); 882 return res[0]; 883 } 884 885 SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps, 886 ArrayRef<Value> sizes, 887 ArrayRef<scf::ForOp> targets) { 888 SmallVector<SmallVector<scf::ForOp, 8>, 8> res; 889 SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end()); 890 for (auto it : llvm::zip(forOps, sizes)) { 891 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets); 892 res.push_back(step); 893 currentTargets = step; 894 } 895 return res; 896 } 897 898 Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes, 899 scf::ForOp target) { 900 SmallVector<scf::ForOp, 8> res; 901 for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) { 902 assert(loops.size() == 1); 903 res.push_back(loops[0]); 904 } 905 return res; 906 } 907 908 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) { 909 // Collect perfectly nested loops. If more size values provided than nested 910 // loops available, truncate `sizes`. 
911 SmallVector<scf::ForOp, 4> forOps; 912 forOps.reserve(sizes.size()); 913 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); 914 if (forOps.size() < sizes.size()) 915 sizes = sizes.take_front(forOps.size()); 916 917 return ::tile(forOps, sizes, forOps.back()); 918 } 919 920 void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops, 921 scf::ForOp root) { 922 getPerfectlyNestedLoopsImpl(nestedLoops, root); 923 } 924 925 TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp, 926 ArrayRef<int64_t> sizes) { 927 // Collect perfectly nested loops. If more size values provided than nested 928 // loops available, truncate `sizes`. 929 SmallVector<scf::ForOp, 4> forOps; 930 forOps.reserve(sizes.size()); 931 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size()); 932 if (forOps.size() < sizes.size()) 933 sizes = sizes.take_front(forOps.size()); 934 935 // Compute the tile sizes such that i-th outer loop executes size[i] 936 // iterations. Given that the loop current executes 937 // numIterations = ceildiv((upperBound - lowerBound), step) 938 // iterations, we need to tile with size ceildiv(numIterations, size[i]). 939 SmallVector<Value, 4> tileSizes; 940 tileSizes.reserve(sizes.size()); 941 for (unsigned i = 0, e = sizes.size(); i < e; ++i) { 942 assert(sizes[i] > 0 && "expected strictly positive size for strip-mining"); 943 944 auto forOp = forOps[i]; 945 OpBuilder builder(forOp); 946 auto loc = forOp.getLoc(); 947 Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(), 948 forOp.getLowerBound()); 949 Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep()); 950 Value iterationsPerBlock = 951 ceilDivPositive(builder, loc, numIterations, sizes[i]); 952 tileSizes.push_back(iterationsPerBlock); 953 } 954 955 // Call parametric tiling with the given sizes. 
956 auto intraTile = tile(forOps, tileSizes, forOps.back()); 957 TileLoops tileLoops = std::make_pair(forOps, intraTile); 958 959 // TODO: for now we just ignore the result of band isolation. 960 // In the future, mapping decisions may be impacted by the ability to 961 // isolate perfectly nested bands. 962 (void)tryIsolateBands(tileLoops); 963 964 return tileLoops; 965 } 966