1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements miscellaneous loop transformation routines. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "mlir/Dialect/Affine/LoopUtils.h" 14 #include "mlir/Analysis/SliceAnalysis.h" 15 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" 16 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" 17 #include "mlir/Dialect/Affine/Analysis/Utils.h" 18 #include "mlir/Dialect/Affine/IR/AffineOps.h" 19 #include "mlir/Dialect/Affine/IR/AffineValueMap.h" 20 #include "mlir/Dialect/Affine/Utils.h" 21 #include "mlir/Dialect/Func/IR/FuncOps.h" 22 #include "mlir/Dialect/MemRef/IR/MemRef.h" 23 #include "mlir/Dialect/SCF/IR/SCF.h" 24 #include "mlir/IR/BlockAndValueMapping.h" 25 #include "mlir/IR/IntegerSet.h" 26 #include "mlir/Support/MathExtras.h" 27 #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 28 #include "mlir/Transforms/RegionUtils.h" 29 #include "llvm/ADT/MapVector.h" 30 #include "llvm/ADT/SmallPtrSet.h" 31 #include "llvm/Support/Debug.h" 32 #include "llvm/Support/raw_ostream.h" 33 34 #define DEBUG_TYPE "LoopUtils" 35 36 using namespace mlir; 37 using namespace presburger; 38 using llvm::SmallMapVector; 39 40 namespace { 41 // This structure is to pass and return sets of loop parameters without 42 // confusing the order. 43 struct LoopParams { 44 Value lowerBound; 45 Value upperBound; 46 Value step; 47 }; 48 } // namespace 49 50 /// Computes the cleanup loop lower bound of the loop being unrolled with 51 /// the specified unroll factor; this bound will also be upper bound of the main 52 /// part of the unrolled loop. Computes the bound as an AffineMap with its 53 /// operands or a null map when the trip count can't be expressed as an affine 54 /// expression. 55 static void 56 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, 57 AffineMap &cleanupLbMap, 58 SmallVectorImpl<Value> &cleanupLbOperands) { 59 AffineMap tripCountMap; 60 SmallVector<Value, 4> tripCountOperands; 61 getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands); 62 // Trip count can't be computed. 63 if (!tripCountMap) { 64 cleanupLbMap = AffineMap(); 65 return; 66 } 67 68 OpBuilder b(forOp); 69 auto lbMap = forOp.getLowerBoundMap(); 70 auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap, 71 forOp.getLowerBoundOperands()); 72 73 // For each upper bound expr, get the range. 74 // Eg: affine.for %i = lb to min (ub1, ub2), 75 // where tripCountExprs yield (tr1, tr2), we create affine.apply's: 76 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all 77 // these affine.apply's make up the cleanup loop lower bound. 78 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults()); 79 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults()); 80 int64_t step = forOp.getStep(); 81 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) { 82 auto tripCountExpr = tripCountMap.getResult(i); 83 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step; 84 auto bumpMap = AffineMap::get(tripCountMap.getNumDims(), 85 tripCountMap.getNumSymbols(), bumpExprs[i]); 86 bumpValues[i] = 87 b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands); 88 } 89 90 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults()); 91 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++) 92 newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1); 93 94 cleanupLbOperands.clear(); 95 cleanupLbOperands.push_back(lb); 96 cleanupLbOperands.append(bumpValues.begin(), bumpValues.end()); 97 cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs, 98 b.getContext()); 99 // Simplify the cleanupLbMap + cleanupLbOperands. 100 fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands); 101 cleanupLbMap = simplifyAffineMap(cleanupLbMap); 102 canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands); 103 // Remove any affine.apply's that became dead from the simplification above. 104 for (auto v : bumpValues) 105 if (v.use_empty()) 106 v.getDefiningOp()->erase(); 107 108 if (lb.use_empty()) 109 lb.erase(); 110 } 111 112 /// Helper to replace uses of loop carried values (iter_args) and loop 113 /// yield values while promoting single iteration affine.for ops. 114 static void replaceIterArgsAndYieldResults(AffineForOp forOp) { 115 // Replace uses of iter arguments with iter operands (initial values). 116 auto iterOperands = forOp.getIterOperands(); 117 auto iterArgs = forOp.getRegionIterArgs(); 118 for (auto e : llvm::zip(iterOperands, iterArgs)) 119 std::get<1>(e).replaceAllUsesWith(std::get<0>(e)); 120 121 // Replace uses of loop results with the values yielded by the loop. 122 auto outerResults = forOp.getResults(); 123 auto innerResults = forOp.getBody()->getTerminator()->getOperands(); 124 for (auto e : llvm::zip(outerResults, innerResults)) 125 std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); 126 } 127 128 /// Promotes the loop body of a forOp to its containing block if the forOp 129 /// was known to have a single iteration. 130 // TODO: extend this for arbitrary affine bounds. 131 LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) { 132 Optional<uint64_t> tripCount = getConstantTripCount(forOp); 133 if (!tripCount || *tripCount != 1) 134 return failure(); 135 136 if (forOp.getLowerBoundMap().getNumResults() != 1) 137 return failure(); 138 139 // Replaces all IV uses to its single iteration value. 140 auto iv = forOp.getInductionVar(); 141 auto *parentBlock = forOp->getBlock(); 142 if (!iv.use_empty()) { 143 if (forOp.hasConstantLowerBound()) { 144 OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody()); 145 auto constOp = topBuilder.create<arith::ConstantIndexOp>( 146 forOp.getLoc(), forOp.getConstantLowerBound()); 147 iv.replaceAllUsesWith(constOp); 148 } else { 149 auto lbOperands = forOp.getLowerBoundOperands(); 150 auto lbMap = forOp.getLowerBoundMap(); 151 OpBuilder builder(forOp); 152 if (lbMap == builder.getDimIdentityMap()) { 153 // No need of generating an affine.apply. 154 iv.replaceAllUsesWith(lbOperands[0]); 155 } else { 156 auto affineApplyOp = 157 builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands); 158 iv.replaceAllUsesWith(affineApplyOp); 159 } 160 } 161 } 162 163 replaceIterArgsAndYieldResults(forOp); 164 165 // Move the loop body operations, except for its terminator, to the loop's 166 // containing block. 167 forOp.getBody()->back().erase(); 168 parentBlock->getOperations().splice(Block::iterator(forOp), 169 forOp.getBody()->getOperations()); 170 forOp.erase(); 171 return success(); 172 } 173 174 /// Generates an affine.for op with the specified lower and upper bounds 175 /// while generating the right IV remappings to realize shifts for operations in 176 /// its body. The operations that go into the loop body are specified in 177 /// opGroupQueue starting from the specified offset, and in that order. The 178 /// first element of the pair specifies the shift applied to that group of 179 /// operations; the shift is multiplied by the loop step before being applied. 180 /// Returns nullptr if the generated loop simplifies to a single iteration one. 181 static AffineForOp generateShiftedLoop( 182 AffineMap lbMap, AffineMap ubMap, 183 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue, 184 unsigned offset, AffineForOp srcForOp, OpBuilder b) { 185 auto lbOperands = srcForOp.getLowerBoundOperands(); 186 auto ubOperands = srcForOp.getUpperBoundOperands(); 187 188 assert(lbMap.getNumInputs() == lbOperands.size()); 189 assert(ubMap.getNumInputs() == ubOperands.size()); 190 191 auto loopChunk = b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap, 192 ubOperands, ubMap, srcForOp.getStep()); 193 auto loopChunkIV = loopChunk.getInductionVar(); 194 auto srcIV = srcForOp.getInductionVar(); 195 196 BlockAndValueMapping operandMap; 197 198 auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody()); 199 for (auto it = opGroupQueue.begin() + offset, e = opGroupQueue.end(); it != e; 200 ++it) { 201 uint64_t shift = it->first; 202 auto ops = it->second; 203 // All 'same shift' operations get added with their operands being 204 // remapped to results of cloned operations, and their IV used remapped. 205 // Generate the remapping if the shift is not zero: remappedIV = newIV - 206 // shift. 207 if (!srcIV.use_empty() && shift != 0) { 208 auto ivRemap = bodyBuilder.create<AffineApplyOp>( 209 srcForOp.getLoc(), 210 bodyBuilder.getSingleDimShiftAffineMap( 211 -static_cast<int64_t>(srcForOp.getStep() * shift)), 212 loopChunkIV); 213 operandMap.map(srcIV, ivRemap); 214 } else { 215 operandMap.map(srcIV, loopChunkIV); 216 } 217 for (auto *op : ops) 218 bodyBuilder.clone(*op, operandMap); 219 }; 220 if (succeeded(promoteIfSingleIteration(loopChunk))) 221 return AffineForOp(); 222 return loopChunk; 223 } 224 225 // The skewing of operations with respect to one another can be used for 226 // example to allow overlap of asynchronous operations (such as DMA 227 // communication) with computation, or just relative shifting of operations 228 // for better register reuse, locality or parallelism. As such, the shifts are 229 // typically expected to be at most of the order of the number of operations. 230 // This method should not be used as a substitute for loop distribution/fission. 231 // This method uses an algorithm// in time linear in the number of operations 232 // in the body of the for loop - (using the 'sweep line' paradigm). This method 233 // asserts preservation of SSA dominance. A check for that as well as that for 234 // memory-based dependence preservation check rests with the users of this 235 // method. 236 LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp, 237 ArrayRef<uint64_t> shifts, 238 bool unrollPrologueEpilogue) { 239 assert(forOp.getBody()->getOperations().size() == shifts.size() && 240 "too few/many shifts"); 241 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) 242 return success(); 243 244 // If the trip counts aren't constant, we would need versioning and 245 // conditional guards (or context information to prevent such versioning). The 246 // better way to pipeline for such loops is to first tile them and extract 247 // constant trip count "full tiles" before applying this. 248 auto mayBeConstTripCount = getConstantTripCount(forOp); 249 if (!mayBeConstTripCount) { 250 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled")); 251 return success(); 252 } 253 uint64_t tripCount = *mayBeConstTripCount; 254 255 assert(isOpwiseShiftValid(forOp, shifts) && 256 "shifts will lead to an invalid transformation\n"); 257 258 int64_t step = forOp.getStep(); 259 260 unsigned numChildOps = shifts.size(); 261 262 // Do a linear time (counting) sort for the shifts. 263 uint64_t maxShift = *std::max_element(shifts.begin(), shifts.end()); 264 if (maxShift >= numChildOps) { 265 // Large shifts are not the typical use case. 266 forOp.emitWarning("not shifting because shifts are unrealistically large"); 267 return success(); 268 } 269 270 // An array of operation groups sorted by shift amount; each group has all 271 // operations with the same shift in the order in which they appear in the 272 // body of the 'affine.for' op. 273 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1); 274 unsigned pos = 0; 275 for (auto &op : forOp.getBody()->without_terminator()) { 276 auto shift = shifts[pos++]; 277 sortedOpGroups[shift].push_back(&op); 278 } 279 280 // Unless the shifts have a specific pattern (which actually would be the 281 // common use case), prologue and epilogue are not meaningfully defined. 282 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first 283 // loop generated as the prologue and the last as epilogue and unroll these 284 // fully. 285 AffineForOp prologue, epilogue; 286 287 // Do a sweep over the sorted shifts while storing open groups in a 288 // vector, and generating loop portions as necessary during the sweep. A block 289 // of operations is paired with its shift. 290 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue; 291 292 auto origLbMap = forOp.getLowerBoundMap(); 293 uint64_t lbShift = 0; 294 OpBuilder b(forOp); 295 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) { 296 // If nothing is shifted by d, continue. 297 if (sortedOpGroups[d].empty()) 298 continue; 299 if (!opGroupQueue.empty()) { 300 assert(d > 0 && 301 "Queue expected to be empty when the first block is found"); 302 // The interval for which the loop needs to be generated here is: 303 // [lbShift, min(lbShift + tripCount, d)) and the body of the 304 // loop needs to have all operations in opQueue in that order. 305 AffineForOp res; 306 if (lbShift + tripCount * step < d * step) { 307 res = generateShiftedLoop( 308 b.getShiftedAffineMap(origLbMap, lbShift), 309 b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step), 310 opGroupQueue, /*offset=*/0, forOp, b); 311 // Entire loop for the queued op groups generated, empty it. 312 opGroupQueue.clear(); 313 lbShift += tripCount * step; 314 } else { 315 res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), 316 b.getShiftedAffineMap(origLbMap, d), 317 opGroupQueue, /*offset=*/0, forOp, b); 318 lbShift = d * step; 319 } 320 321 if (res) { 322 // Simplify/canonicalize the affine.for. 323 RewritePatternSet patterns(res.getContext()); 324 AffineForOp::getCanonicalizationPatterns(patterns, res.getContext()); 325 bool erased; 326 (void)applyOpPatternsAndFold(res, std::move(patterns), &erased); 327 328 if (!erased && !prologue) 329 prologue = res; 330 if (!erased) 331 epilogue = res; 332 } 333 } else { 334 // Start of first interval. 335 lbShift = d * step; 336 } 337 // Augment the list of operations that get into the current open interval. 338 opGroupQueue.emplace_back(d, sortedOpGroups[d]); 339 } 340 341 // Those operations groups left in the queue now need to be processed (FIFO) 342 // and their loops completed. 343 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) { 344 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step; 345 epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift), 346 b.getShiftedAffineMap(origLbMap, ubShift), 347 opGroupQueue, /*offset=*/i, forOp, b); 348 lbShift = ubShift; 349 if (!prologue) 350 prologue = epilogue; 351 } 352 353 // Erase the original for op. 354 forOp.erase(); 355 356 if (unrollPrologueEpilogue && prologue) 357 (void)loopUnrollFull(prologue); 358 if (unrollPrologueEpilogue && !epilogue && epilogue != prologue) 359 (void)loopUnrollFull(epilogue); 360 361 return success(); 362 } 363 364 /// Checks the legality of tiling of a hyper-rectangular loop nest by simply 365 /// checking if there is a 'negative' dependence in the memrefs present in 366 /// the loop nest. If yes then tiling is invalid. 367 static bool 368 checkTilingLegalityImpl(MutableArrayRef<mlir::AffineForOp> origLoops) { 369 assert(!origLoops.empty() && "no original loops provided"); 370 371 // We first find out all dependences we intend to check. 372 SmallVector<Operation *, 8> loadAndStoreOps; 373 origLoops[0]->walk([&](Operation *op) { 374 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) 375 loadAndStoreOps.push_back(op); 376 }); 377 378 unsigned numOps = loadAndStoreOps.size(); 379 unsigned numLoops = origLoops.size(); 380 FlatAffineValueConstraints dependenceConstraints; 381 for (unsigned d = 1; d <= numLoops + 1; ++d) { 382 for (unsigned i = 0; i < numOps; ++i) { 383 Operation *srcOp = loadAndStoreOps[i]; 384 MemRefAccess srcAccess(srcOp); 385 for (unsigned j = 0; j < numOps; ++j) { 386 Operation *dstOp = loadAndStoreOps[j]; 387 MemRefAccess dstAccess(dstOp); 388 389 SmallVector<DependenceComponent, 2> depComps; 390 dependenceConstraints.reset(); 391 DependenceResult result = checkMemrefAccessDependence( 392 srcAccess, dstAccess, d, &dependenceConstraints, &depComps); 393 394 // Skip if there is no dependence in this case. 395 if (!hasDependence(result)) 396 continue; 397 398 // Check whether there is any negative direction vector in the 399 // dependence components found above, which means that dependence is 400 // violated by the default hyper-rect tiling method. 401 LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " 402 "for dependence at depth: " 403 << Twine(d) << " between:\n";); 404 LLVM_DEBUG(srcAccess.opInst->dump();); 405 LLVM_DEBUG(dstAccess.opInst->dump();); 406 for (unsigned k = 0, e = depComps.size(); k < e; k++) { 407 DependenceComponent depComp = depComps[k]; 408 if (depComp.lb.has_value() && depComp.ub.has_value() && 409 depComp.lb.value() < depComp.ub.value() && 410 depComp.ub.value() < 0) { 411 LLVM_DEBUG(llvm::dbgs() 412 << "Dependence component lb = " 413 << Twine(depComp.lb.value()) 414 << " ub = " << Twine(depComp.ub.value()) 415 << " is negative at depth: " << Twine(d) 416 << " and thus violates the legality rule.\n"); 417 return false; 418 } 419 } 420 } 421 } 422 } 423 424 return true; 425 } 426 427 /// Checks whether hyper-rectangular loop tiling of the nest 428 /// represented by `origLoops` is valid. The validity condition is from Irigoin 429 /// and Triolet, which states that two tiles cannot depend on each other. We 430 /// simplify such condition to just checking whether there is any negative 431 /// dependence direction, since we have the prior knowledge that the tiling 432 /// results will be hyper-rectangles, which are scheduled in the 433 /// lexicographically increasing order on the vector of loop indices. This 434 /// function will return failure when any dependence component is negative along 435 /// any of `origLoops`. 436 LogicalResult 437 checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) { 438 return success(checkTilingLegalityImpl(origLoops)); 439 } 440 441 /// Checks whether a loop nest is hyper-rectangular or not. 442 LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) { 443 FlatAffineValueConstraints cst; 444 SmallVector<Operation *, 8> ops(input.begin(), input.end()); 445 // 0-d or 1-d is trivially hyper-rectangular. 446 if (input.size() <= 1) 447 return success(); 448 if (failed(getIndexSet(ops, &cst))) { 449 LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n"); 450 return failure(); 451 } 452 if (!cst.isHyperRectangular(0, input.size())) { 453 LLVM_DEBUG(llvm::dbgs() 454 << "Non-hyperrectangular nests not supported for tiling!\n"); 455 return failure(); 456 } 457 return success(); 458 } 459 460 /// Check if the input nest is supported for tiling and whether tiling would be 461 /// legal or not. 462 template <typename t> 463 LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input, 464 ArrayRef<t> tileSizes) { 465 assert(input.size() == tileSizes.size() && "Too few/many tile sizes"); 466 467 if (llvm::any_of(input, 468 [](AffineForOp op) { return op.getNumResults() > 0; })) { 469 LLVM_DEBUG(llvm::dbgs() 470 << "Cannot tile nest where a loop has yield values\n"); 471 return failure(); 472 } 473 474 // Check if the supplied `for` ops are all successively nested. 475 if (!isPerfectlyNested(input)) { 476 LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested"); 477 return failure(); 478 } 479 480 if (failed(checkIfHyperRectangular(input))) 481 return failure(); 482 483 // Check if tiling is legal. 484 if (failed(checkTilingLegality(input))) { 485 input[0].emitRemark("tiling code is illegal due to dependences"); 486 return failure(); 487 } 488 489 return success(); 490 } 491 492 /// Move the loop body of AffineForOp 'src' from 'src' into the specified 493 /// location in destination's body, ignoring the terminator. 494 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest, 495 Block::iterator loc) { 496 auto &ops = src.getBody()->getOperations(); 497 dest.getBody()->getOperations().splice(loc, ops, ops.begin(), 498 std::prev(ops.end())); 499 } 500 501 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest 502 /// body. 503 void moveLoopBody(AffineForOp src, AffineForOp dest) { 504 moveLoopBodyImpl(src, dest, dest.getBody()->begin()); 505 } 506 507 /// Constructs tiled loop nest, without setting the loop bounds and move the 508 /// body of the original loop nest to the tiled loop nest. 509 void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops, 510 AffineForOp rootAffineForOp, unsigned width, 511 MutableArrayRef<AffineForOp> tiledLoops) { 512 Location loc = rootAffineForOp.getLoc(); 513 514 // The outermost among the loops as we add more.. 515 Operation *topLoop = rootAffineForOp.getOperation(); 516 AffineForOp innermostPointLoop; 517 518 // Add intra-tile (or point) loops. 519 for (unsigned i = 0; i < width; i++) { 520 OpBuilder b(topLoop); 521 // Loop bounds will be set later. 522 AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0); 523 pointLoop.getBody()->getOperations().splice( 524 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), 525 topLoop); 526 tiledLoops[2 * width - 1 - i] = pointLoop; 527 topLoop = pointLoop.getOperation(); 528 if (i == 0) 529 innermostPointLoop = pointLoop; 530 } 531 532 // Add tile space loops; 533 for (unsigned i = width; i < 2 * width; i++) { 534 OpBuilder b(topLoop); 535 // Loop bounds will be set later. 536 AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0); 537 tileSpaceLoop.getBody()->getOperations().splice( 538 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), 539 topLoop); 540 tiledLoops[2 * width - i - 1] = tileSpaceLoop; 541 topLoop = tileSpaceLoop.getOperation(); 542 } 543 544 // Move the loop body of the original nest to the new one. 545 moveLoopBody(origLoops.back(), innermostPointLoop); 546 } 547 548 /// Set lower and upper bounds of intra-tile loops for parametric tiling. 549 // TODO: Handle non-constant lower bounds. 550 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, 551 AffineForOp newInterTileLoop, 552 AffineForOp newIntraTileLoop, 553 Value tileSize) { 554 // The lower bound for the intra-tile loop is represented by an affine map 555 // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound 556 // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i 557 // - %origlb) * %t0) + (%t0 * %origLoopStep) + %origlb), where %i is loop IV 558 // of the corresponding inter-tile loop, %t0 is the corresponding tiling 559 // parameter, %origlb is lower bound and %origLoopStep is the loop step of the 560 // corresponding inter-tile loop. 561 562 assert(origLoop.hasConstantLowerBound() && 563 "expected input loops to have constant lower bound."); 564 565 // Get lower bound of original loop as an affine expression. 566 AffineExpr origLowerBoundExpr; 567 origLowerBoundExpr = 568 b.getAffineConstantExpr(origLoop.getConstantLowerBound()); 569 570 // Add dim operands from original lower/upper bound. 571 SmallVector<Value, 4> lbOperands, ubOperands; 572 AffineBound lb = origLoop.getLowerBound(); 573 AffineBound ub = origLoop.getUpperBound(); 574 lbOperands.reserve(lb.getNumOperands() + 2); 575 ubOperands.reserve(ub.getNumOperands() + 2); 576 AffineMap origLbMap = lb.getMap(); 577 AffineMap origUbMap = ub.getMap(); 578 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j) 579 lbOperands.push_back(lb.getOperand(j)); 580 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) 581 ubOperands.push_back(ub.getOperand(j)); 582 583 // Add a new dim operand in lb/ubOperands corresponding to the origLoop 584 // IV. 585 lbOperands.push_back(newInterTileLoop.getInductionVar()); 586 ubOperands.push_back(newInterTileLoop.getInductionVar()); 587 588 // Get loop IV as an affine expression for lower/upper bound. Size of 589 // lb/ubOperands is guaranteed to be atleast one. 590 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1); 591 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1); 592 593 // Add symbol operands from original lower/upper bound. 594 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j) 595 lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j)); 596 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) 597 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); 598 599 // Add a new symbol operand which is the tile size for this loop. 600 lbOperands.push_back(tileSize); 601 ubOperands.push_back(tileSize); 602 603 SmallVector<AffineExpr, 4> lbBoundExprs; 604 SmallVector<AffineExpr, 4> ubBoundExprs; 605 lbBoundExprs.reserve(origLbMap.getNumResults()); 606 ubBoundExprs.reserve(origUbMap.getNumResults()); 607 608 // Get tiling parameter as an affine expression for lb/ub. 609 AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols()); 610 AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols()); 611 612 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb. 613 lbBoundExprs.push_back( 614 ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) + 615 origLowerBoundExpr); 616 617 // Get the origLoopStep as an affine expression. 618 AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep()); 619 620 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) + 621 // (tilingParameter * origLoopStep) + origlb. 622 ubBoundExprs.push_back( 623 ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) + 624 (ubTileParameter * origLoopStep) + origLowerBoundExpr); 625 626 ubBoundExprs.append(origUbMap.getResults().begin(), 627 origUbMap.getResults().end()); 628 629 AffineMap lbMap = 630 AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1, 631 lbBoundExprs, b.getContext()); 632 newIntraTileLoop.setLowerBound(lbOperands, lbMap); 633 634 AffineMap ubMap = 635 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1, 636 ubBoundExprs, b.getContext()); 637 newIntraTileLoop.setUpperBound(ubOperands, ubMap); 638 639 // Original loop step must be preserved. 640 newIntraTileLoop.setStep(origLoop.getStep()); 641 } 642 643 /// Set lower and upper bounds of inter-tile loops for parametric tiling. 644 // TODO: Handle non-constant lower bounds. 645 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop, 646 AffineForOp newLoop, Value tileSize) { 647 OperandRange newLbOperands = origLoop.getLowerBoundOperands(); 648 649 // The lower bounds for inter-tile loops are same as the corresponding lower 650 // bounds of original loops. 651 newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap()); 652 653 // The new upper bound map for inter-tile loops, assuming constant lower 654 // bounds, are now originalLowerBound + ceildiv((originalUpperBound - 655 // originalLowerBound), tiling parameter); where tiling parameter is the 656 // respective tile size for that loop. For e.g. if the original ubmap was 657 // ()->(1024), the new map will be 658 // ()[s0]->(ceildiv((1024 -lb) % s0)), where s0 is the tiling parameter. 659 // Therefore a new symbol operand is inserted in the map and the result 660 // expression is overwritten. 661 662 assert(origLoop.hasConstantLowerBound() && 663 "expected input loops to have constant lower bound."); 664 665 // Get lower bound of original loop as an affine expression. 666 AffineExpr origLowerBoundExpr; 667 origLowerBoundExpr = 668 b.getAffineConstantExpr(origLoop.getConstantLowerBound()); 669 670 // Add dim operands from original upper bound. 671 SmallVector<Value, 4> ubOperands; 672 AffineBound ub = origLoop.getUpperBound(); 673 ubOperands.reserve(ub.getNumOperands() + 1); 674 AffineMap origUbMap = ub.getMap(); 675 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) 676 ubOperands.push_back(ub.getOperand(j)); 677 678 // Add symbol operands from original upper bound. 679 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) 680 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); 681 682 // Add a new symbol operand which is the tile size for this loop. 683 ubOperands.push_back(tileSize); 684 685 // Get tiling parameter as an affine expression. 686 AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols()); 687 688 SmallVector<AffineExpr, 4> boundExprs; 689 boundExprs.reserve(origUbMap.getNumResults()); 690 int64_t origUpperBound; 691 AffineExpr origUpperBoundExpr; 692 693 // If upper bound for the original loop is constant, then the constant can 694 // be obtained as an affine expression straight away. 695 if (origLoop.hasConstantUpperBound()) { 696 origUpperBound = origLoop.getConstantUpperBound(); 697 698 // Get original constant upper bound as an affine expression. 699 origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound); 700 701 // Insert the bound as originalLowerBoundceildiv((originalUpperBound - 702 // originalLowerBound), tilingParameter). 703 boundExprs.push_back( 704 origLowerBoundExpr + 705 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter)); 706 } else { 707 // If upper bound for the original loop is not constant then two cases 708 // are possible, although there handeling is the same, 1.) The result of 709 // ubmap has only one result expression. For e.g. 710 // affine.for %i = 5 to %ub 711 // 712 // A symbol operand is added which represents the tiling parameter. The 713 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5) 714 // where 's0' is the original upper bound and 's1' is the tiling 715 // parameter. 2.) When ubMap has more than one result expression. For e.g. 716 // #map0 = affine_map<()[s0, s1] -> (s0, s1) 717 // affine.for %i = 5 to min #map0()[%s0, %s1] 718 // 719 // A symbol operand is added which represents the tiling parameter. The 720 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5, 721 // (s1 -5) ceildiv s2 + 5), where s2 is the tiling parameter. 722 723 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound - 724 // originalLowerBound), tilingParameter). 725 for (AffineExpr origUpperBoundExpr : origUbMap.getResults()) 726 boundExprs.push_back( 727 origLowerBoundExpr + 728 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter)); 729 } 730 731 AffineMap ubMap = 732 AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1, 733 boundExprs, b.getContext()); 734 newLoop.setUpperBound(ubOperands, ubMap); 735 736 // Original loop step must be preserved. 737 newLoop.setStep(origLoop.getStep()); 738 } 739 740 /// Constructs and sets new loop bounds after tiling for the case of 741 /// hyper-rectangular index sets, where the bounds of one dimension do not 742 /// depend on other dimensions and tiling parameters are captured from SSA 743 /// values. Bounds of each dimension can thus be treated independently, 744 /// and deriving the new bounds is much simpler and faster than for the case of 745 /// tiling arbitrary polyhedral shapes. 746 static void constructParametricallyTiledIndexSetHyperRect( 747 MutableArrayRef<AffineForOp> origLoops, 748 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) { 749 assert(!origLoops.empty() && "expected atleast one loop in band"); 750 assert(origLoops.size() == tileSizes.size() && 751 "expected tiling parameter for each loop in band."); 752 753 OpBuilder b(origLoops[0].getOperation()); 754 unsigned width = origLoops.size(); 755 756 // Set bounds for tile space loops. 757 for (unsigned i = 0; i < width; ++i) { 758 setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]); 759 } 760 761 // Set bounds for intra-tile loops. 762 for (unsigned i = 0; i < width; ++i) { 763 setIntraTileBoundsParametric(b, origLoops[i], newLoops[i], 764 newLoops[i + width], tileSizes[i]); 765 } 766 } 767 768 /// Constructs and sets new loop bounds after tiling for the case of 769 /// hyper-rectangular index sets, where the bounds of one dimension do not 770 /// depend on other dimensions. Bounds of each dimension can thus be treated 771 /// independently, and deriving the new bounds is much simpler and faster 772 /// than for the case of tiling arbitrary polyhedral shapes. 773 static void 774 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops, 775 MutableArrayRef<AffineForOp> newLoops, 776 ArrayRef<unsigned> tileSizes) { 777 assert(!origLoops.empty()); 778 assert(origLoops.size() == tileSizes.size()); 779 780 OpBuilder b(origLoops[0].getOperation()); 781 unsigned width = origLoops.size(); 782 783 // Bounds for tile space loops. 784 for (unsigned i = 0; i < width; i++) { 785 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands(); 786 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands(); 787 newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap()); 788 newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap()); 789 // If the step size of original loop is x and tileSize is y then after 790 // tiling the tile space loops' step size becomes x*y. 791 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStep()); 792 } 793 // Bounds for intra-tile loops. 794 for (unsigned i = 0; i < width; i++) { 795 int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]); 796 Optional<uint64_t> mayBeConstantCount = getConstantTripCount(origLoops[i]); 797 // The lower bound is just the tile-space loop. 798 AffineMap lbMap = b.getDimIdentityMap(); 799 newLoops[width + i].setLowerBound( 800 /*operands=*/newLoops[i].getInductionVar(), lbMap); 801 // The step sizes of intra-tile loops is just the original loops' step size. 802 newLoops[width + i].setStep(origLoops[i].getStep()); 803 804 // Set the upper bound. 805 if (mayBeConstantCount && mayBeConstantCount.value() < tileSizes[i]) { 806 // Trip count is less than the tile size: upper bound is lower bound + 807 // trip count * stepSize. 808 AffineMap ubMap = b.getSingleDimShiftAffineMap( 809 mayBeConstantCount.value() * origLoops[i].getStep()); 810 newLoops[width + i].setUpperBound( 811 /*operands=*/newLoops[i].getInductionVar(), ubMap); 812 } else if (largestDiv % tileSizes[i] != 0) { 813 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i). 814 // Construct the upper bound map; the operands are the original operands 815 // with 'i' (tile-space loop) appended to it. The new upper bound map is 816 // the original one with an additional expression i + tileSize * stepSize 817 // appended. 818 819 // Add dim operands from original upper bound. 820 SmallVector<Value, 4> ubOperands; 821 AffineBound ub = origLoops[i].getUpperBound(); 822 ubOperands.reserve(ub.getNumOperands() + 1); 823 AffineMap origUbMap = ub.getMap(); 824 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j) 825 ubOperands.push_back(ub.getOperand(j)); 826 827 // Add dim operand for new loop upper bound. 828 ubOperands.push_back(newLoops[i].getInductionVar()); 829 830 // Add symbol operands from original upper bound. 831 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j) 832 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j)); 833 834 SmallVector<AffineExpr, 4> boundExprs; 835 boundExprs.reserve(1 + origUbMap.getNumResults()); 836 AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims()); 837 // The new upper bound map is the original one with an additional 838 // expression i + tileSize * stepSize (of original loop) appended. 839 boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStep()); 840 boundExprs.append(origUbMap.getResults().begin(), 841 origUbMap.getResults().end()); 842 AffineMap ubMap = 843 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(), 844 boundExprs, b.getContext()); 845 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap); 846 } else { 847 // No need of the min expression. 848 AffineExpr dim = b.getAffineDimExpr(0); 849 AffineMap ubMap = 850 AffineMap::get(1, 0, dim + tileSizes[i] * origLoops[i].getStep()); 851 newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap); 852 } 853 } 854 } 855 856 /// Tiles the specified band of perfectly nested loops creating tile-space loops 857 /// and intra-tile loops. A band is a contiguous set of loops. 858 // TODO: handle non hyper-rectangular spaces. 859 LogicalResult 860 mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input, 861 ArrayRef<unsigned> tileSizes, 862 SmallVectorImpl<AffineForOp> *tiledNest) { 863 if (input.empty()) 864 return success(); 865 866 if (failed(performPreTilingChecks(input, tileSizes))) 867 return failure(); 868 869 MutableArrayRef<AffineForOp> origLoops = input; 870 AffineForOp rootAffineForOp = origLoops[0]; 871 872 // Note that width is at least one since the band isn't empty. 873 unsigned width = input.size(); 874 SmallVector<AffineForOp, 6> tiledLoops(2 * width); 875 876 // Construct a tiled loop nest without setting their bounds. Bounds are 877 // set later. 878 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops); 879 880 SmallVector<Value, 8> origLoopIVs; 881 extractForInductionVars(input, &origLoopIVs); 882 883 // Set loop bounds for the tiled loop nest. 884 constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes); 885 886 // Replace original IVs with intra-tile loop IVs. 887 for (unsigned i = 0; i < width; i++) 888 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar()); 889 890 // Erase the old loop nest. 891 rootAffineForOp.erase(); 892 893 if (tiledNest) 894 *tiledNest = std::move(tiledLoops); 895 896 return success(); 897 } 898 899 /// Tiles the specified band of perfectly nested loops creating tile-space 900 /// loops and intra-tile loops, using SSA values as tiling parameters. A band 901 /// is a contiguous set of loops. 902 // TODO: handle non hyper-rectangular spaces. 903 LogicalResult 904 mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input, 905 ArrayRef<Value> tileSizes, 906 SmallVectorImpl<AffineForOp> *tiledNest) { 907 if (input.empty()) 908 return success(); 909 910 if (failed(performPreTilingChecks(input, tileSizes))) 911 return failure(); 912 913 MutableArrayRef<AffineForOp> origLoops = input; 914 AffineForOp rootAffineForOp = origLoops[0]; 915 unsigned width = input.size(); 916 SmallVector<AffineForOp, 6> tiledLoops(2 * width); 917 918 // Construct a tiled loop nest without setting their bounds. Bounds are 919 // set later. 920 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops); 921 922 SmallVector<Value, 8> origLoopIVs; 923 extractForInductionVars(input, &origLoopIVs); 924 925 // Set loop bounds for the tiled loop nest. 926 constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops, 927 tileSizes); 928 929 // Replace original IVs with intra-tile loop IVs. 930 for (unsigned i = 0; i < width; i++) 931 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar()); 932 933 // Erase the old loop nest. 934 rootAffineForOp.erase(); 935 936 if (tiledNest) 937 *tiledNest = std::move(tiledLoops); 938 939 return success(); 940 } 941 942 /// Get perfectly nested sequence of loops starting at root of loop nest 943 /// (the first op being another AffineFor, and the second op - a terminator). 944 /// A loop is perfectly nested iff: the first op in the loop's body is another 945 /// AffineForOp, and the second op is a terminator). 946 void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops, 947 AffineForOp root) { 948 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) { 949 nestedLoops.push_back(root); 950 Block &body = root.getRegion().front(); 951 if (body.begin() != std::prev(body.end(), 2)) 952 return; 953 954 root = dyn_cast<AffineForOp>(&body.front()); 955 if (!root) 956 return; 957 } 958 } 959 960 /// Identify valid and profitable bands of loops to tile. This is currently just 961 /// a temporary placeholder to test the mechanics of tiled code generation. 962 /// Returns all maximal outermost perfect loop nests to tile. 963 void mlir::getTileableBands(func::FuncOp f, 964 std::vector<SmallVector<AffineForOp, 6>> *bands) { 965 // Get maximal perfect nest of 'affine.for' insts starting from root 966 // (inclusive). 967 for (AffineForOp forOp : f.getOps<AffineForOp>()) { 968 SmallVector<AffineForOp, 6> band; 969 getPerfectlyNestedLoops(band, forOp); 970 bands->push_back(band); 971 } 972 } 973 974 /// Unrolls this loop completely. 975 LogicalResult mlir::loopUnrollFull(AffineForOp forOp) { 976 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); 977 if (mayBeConstantTripCount.has_value()) { 978 uint64_t tripCount = mayBeConstantTripCount.value(); 979 if (tripCount == 0) 980 return success(); 981 if (tripCount == 1) 982 return promoteIfSingleIteration(forOp); 983 return loopUnrollByFactor(forOp, tripCount); 984 } 985 return failure(); 986 } 987 988 /// Unrolls this loop by the specified factor or by the trip count (if constant) 989 /// whichever is lower. 990 LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp, 991 uint64_t unrollFactor) { 992 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); 993 if (mayBeConstantTripCount.has_value() && 994 mayBeConstantTripCount.value() < unrollFactor) 995 return loopUnrollByFactor(forOp, mayBeConstantTripCount.value()); 996 return loopUnrollByFactor(forOp, unrollFactor); 997 } 998 999 /// Generates unrolled copies of AffineForOp 'loopBodyBlock', with associated 1000 /// 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap 'forOpIV' for each 1001 /// unrolled body. If specified, annotates the Ops in each unrolled iteration 1002 /// using annotateFn. 1003 static void generateUnrolledLoop( 1004 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, 1005 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn, 1006 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn, 1007 ValueRange iterArgs, ValueRange yieldedValues) { 1008 // Builder to insert unrolled bodies just before the terminator of the body of 1009 // 'forOp'. 1010 auto builder = OpBuilder::atBlockTerminator(loopBodyBlock); 1011 1012 if (!annotateFn) 1013 annotateFn = [](unsigned, Operation *, OpBuilder) {}; 1014 1015 // Keep a pointer to the last non-terminator operation in the original block 1016 // so that we know what to clone (since we are doing this in-place). 1017 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2); 1018 1019 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). 1020 SmallVector<Value, 4> lastYielded(yieldedValues); 1021 1022 for (unsigned i = 1; i < unrollFactor; i++) { 1023 BlockAndValueMapping operandMap; 1024 1025 // Prepare operand map. 1026 operandMap.map(iterArgs, lastYielded); 1027 1028 // If the induction variable is used, create a remapping to the value for 1029 // this unrolled instance. 1030 if (!forOpIV.use_empty()) { 1031 Value ivUnroll = ivRemapFn(i, forOpIV, builder); 1032 operandMap.map(forOpIV, ivUnroll); 1033 } 1034 1035 // Clone the original body of 'forOp'. 1036 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) { 1037 Operation *clonedOp = builder.clone(*it, operandMap); 1038 annotateFn(i, clonedOp, builder); 1039 } 1040 1041 // Update yielded values. 1042 for (unsigned i = 0, e = lastYielded.size(); i < e; i++) 1043 lastYielded[i] = operandMap.lookup(yieldedValues[i]); 1044 } 1045 1046 // Make sure we annotate the Ops in the original body. We do this last so that 1047 // any annotations are not copied into the cloned Ops above. 1048 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) 1049 annotateFn(0, &*it, builder); 1050 1051 // Update operands of the yield statement. 1052 loopBodyBlock->getTerminator()->setOperands(lastYielded); 1053 } 1054 1055 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip 1056 /// count is not a multiple of `unrollFactor`. 1057 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp, 1058 uint64_t unrollFactor) { 1059 // Insert the cleanup loop right after 'forOp'. 1060 OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp))); 1061 auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp)); 1062 1063 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result 1064 // and produce results for the original users of `forOp` results. 1065 auto results = forOp.getResults(); 1066 auto cleanupResults = cleanupForOp.getResults(); 1067 auto cleanupIterOperands = cleanupForOp.getIterOperands(); 1068 1069 for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) { 1070 std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); 1071 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e)); 1072 } 1073 1074 AffineMap cleanupMap; 1075 SmallVector<Value, 4> cleanupOperands; 1076 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands); 1077 if (!cleanupMap) 1078 return failure(); 1079 1080 cleanupForOp.setLowerBound(cleanupOperands, cleanupMap); 1081 // Promote the loop body up if this has turned into a single iteration loop. 1082 (void)promoteIfSingleIteration(cleanupForOp); 1083 1084 // Adjust upper bound of the original loop; this is the same as the lower 1085 // bound of the cleanup loop. 1086 forOp.setUpperBound(cleanupOperands, cleanupMap); 1087 return success(); 1088 } 1089 1090 /// Unrolls this loop by the specified factor. Returns success if the loop 1091 /// is successfully unrolled. 1092 LogicalResult mlir::loopUnrollByFactor( 1093 AffineForOp forOp, uint64_t unrollFactor, 1094 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) { 1095 assert(unrollFactor > 0 && "unroll factor should be positive"); 1096 1097 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); 1098 if (unrollFactor == 1) { 1099 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 && 1100 failed(promoteIfSingleIteration(forOp))) 1101 return failure(); 1102 return success(); 1103 } 1104 1105 // Nothing in the loop body other than the terminator. 1106 if (llvm::hasSingleElement(forOp.getBody()->getOperations())) 1107 return success(); 1108 1109 // If the trip count is lower than the unroll factor, no unrolled body. 1110 // TODO: option to specify cleanup loop unrolling. 1111 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor) 1112 return failure(); 1113 1114 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor. 1115 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) { 1116 // Loops where the lower bound is a max expression or the upper bound is 1117 // a min expression and the trip count doesn't divide the unroll factor 1118 // can't be unrolled since the lower bound of the cleanup loop in such cases 1119 // cannot be expressed as an affine function or a max over affine functions. 1120 if (forOp.getLowerBoundMap().getNumResults() != 1 || 1121 forOp.getUpperBoundMap().getNumResults() != 1) 1122 return failure(); 1123 if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor))) 1124 assert(false && "cleanup loop lower bound map for single result lower " 1125 "and upper bound maps can always be determined"); 1126 } 1127 1128 ValueRange iterArgs(forOp.getRegionIterArgs()); 1129 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands(); 1130 1131 // Scale the step of loop being unrolled by unroll factor. 1132 int64_t step = forOp.getStep(); 1133 forOp.setStep(step * unrollFactor); 1134 generateUnrolledLoop( 1135 forOp.getBody(), forOp.getInductionVar(), unrollFactor, 1136 [&](unsigned i, Value iv, OpBuilder b) { 1137 // iv' = iv + i * step 1138 auto d0 = b.getAffineDimExpr(0); 1139 auto bumpMap = AffineMap::get(1, 0, d0 + i * step); 1140 return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv); 1141 }, 1142 /*annotateFn=*/annotateFn, 1143 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues); 1144 1145 // Promote the loop body up if this has turned into a single iteration loop. 1146 (void)promoteIfSingleIteration(forOp); 1147 return success(); 1148 } 1149 1150 LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp, 1151 uint64_t unrollJamFactor) { 1152 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); 1153 if (mayBeConstantTripCount.has_value() && 1154 mayBeConstantTripCount.value() < unrollJamFactor) 1155 return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.value()); 1156 return loopUnrollJamByFactor(forOp, unrollJamFactor); 1157 } 1158 1159 /// Check if all control operands of all loops are defined outside of `forOp` 1160 /// and return false if not. 1161 static bool areInnerBoundsInvariant(AffineForOp forOp) { 1162 auto walkResult = forOp.walk([&](AffineForOp aForOp) { 1163 for (auto controlOperand : aForOp.getControlOperands()) { 1164 if (!forOp.isDefinedOutsideOfLoop(controlOperand)) 1165 return WalkResult::interrupt(); 1166 } 1167 return WalkResult::advance(); 1168 }); 1169 return !walkResult.wasInterrupted(); 1170 } 1171 1172 // Gathers all maximal sub-blocks of operations that do not themselves 1173 // include a for op (a operation could have a descendant for op though 1174 // in its tree). Ignore the block terminators. 1175 struct JamBlockGatherer { 1176 // Store iterators to the first and last op of each sub-block found. 1177 std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks; 1178 1179 // This is a linear time walk. 1180 void walk(Operation *op) { 1181 for (auto ®ion : op->getRegions()) 1182 for (auto &block : region) 1183 walk(block); 1184 } 1185 1186 void walk(Block &block) { 1187 for (auto it = block.begin(), e = std::prev(block.end()); it != e;) { 1188 auto subBlockStart = it; 1189 while (it != e && !isa<AffineForOp>(&*it)) 1190 ++it; 1191 if (it != subBlockStart) 1192 subBlocks.emplace_back(subBlockStart, std::prev(it)); 1193 // Process all for ops that appear next. 1194 while (it != e && isa<AffineForOp>(&*it)) 1195 walk(&*it++); 1196 } 1197 } 1198 }; 1199 1200 /// Unrolls and jams this loop by the specified factor. 1201 LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp, 1202 uint64_t unrollJamFactor) { 1203 assert(unrollJamFactor > 0 && "unroll jam factor should be positive"); 1204 1205 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp); 1206 if (unrollJamFactor == 1) { 1207 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 && 1208 failed(promoteIfSingleIteration(forOp))) 1209 return failure(); 1210 return success(); 1211 } 1212 1213 // Nothing in the loop body other than the terminator. 1214 if (llvm::hasSingleElement(forOp.getBody()->getOperations())) 1215 return success(); 1216 1217 // If the trip count is lower than the unroll jam factor, no unroll jam. 1218 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) { 1219 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n"); 1220 return failure(); 1221 } 1222 1223 // If any control operand of any inner loop of `forOp` is defined within 1224 // `forOp`, no unroll jam. 1225 if (!areInnerBoundsInvariant(forOp)) 1226 return failure(); 1227 1228 // Gather all sub-blocks to jam upon the loop being unrolled. 1229 JamBlockGatherer jbg; 1230 jbg.walk(forOp); 1231 auto &subBlocks = jbg.subBlocks; 1232 1233 // Collect loops with iter_args. 1234 SmallVector<AffineForOp, 4> loopsWithIterArgs; 1235 forOp.walk([&](AffineForOp aForOp) { 1236 if (aForOp.getNumIterOperands() > 0) 1237 loopsWithIterArgs.push_back(aForOp); 1238 }); 1239 1240 // Get supported reductions to be used for creating reduction ops at the end. 1241 SmallVector<LoopReduction> reductions; 1242 if (forOp.getNumIterOperands() > 0) 1243 getSupportedReductions(forOp, reductions); 1244 1245 // Generate the cleanup loop if trip count isn't a multiple of 1246 // unrollJamFactor. 1247 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) { 1248 // Loops where the lower bound is a max expression or the upper bound is 1249 // a min expression and the trip count doesn't divide the unroll factor 1250 // can't be unrolled since the lower bound of the cleanup loop in such cases 1251 // cannot be expressed as an affine function or a max over affine functions. 1252 if (forOp.getLowerBoundMap().getNumResults() != 1 || 1253 forOp.getUpperBoundMap().getNumResults() != 1) 1254 return failure(); 1255 if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor))) 1256 assert(false && "cleanup loop lower bound map for single result lower " 1257 "and upper bound maps can always be determined"); 1258 } 1259 1260 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled 1261 // iteration. There are (`unrollJamFactor` - 1) iterations. 1262 SmallVector<BlockAndValueMapping, 4> operandMaps(unrollJamFactor - 1); 1263 1264 // For any loop with iter_args, replace it with a new loop that has 1265 // `unrollJamFactor` copies of its iterOperands, iter_args and yield 1266 // operands. 1267 SmallVector<AffineForOp, 4> newLoopsWithIterArgs; 1268 OpBuilder builder(forOp.getContext()); 1269 for (AffineForOp oldForOp : loopsWithIterArgs) { 1270 SmallVector<Value, 4> dupIterOperands, dupIterArgs, dupYieldOperands; 1271 ValueRange oldIterOperands = oldForOp.getIterOperands(); 1272 ValueRange oldIterArgs = oldForOp.getRegionIterArgs(); 1273 ValueRange oldYieldOperands = 1274 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands(); 1275 // Get additional iterOperands, iterArgs, and yield operands. We will 1276 // fix iterOperands and yield operands after cloning of sub-blocks. 1277 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { 1278 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end()); 1279 dupIterArgs.append(oldIterArgs.begin(), oldIterArgs.end()); 1280 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end()); 1281 } 1282 // Create a new loop with additional iterOperands, iter_args and yield 1283 // operands. This new loop will take the loop body of the original loop. 1284 AffineForOp newForOp = mlir::replaceForOpWithNewYields( 1285 builder, oldForOp, dupIterOperands, dupYieldOperands, dupIterArgs); 1286 newLoopsWithIterArgs.push_back(newForOp); 1287 // `forOp` has been replaced with a new loop. 1288 if (oldForOp == forOp) 1289 forOp = newForOp; 1290 assert(oldForOp.use_empty() && "old for op should not have any user"); 1291 oldForOp.erase(); 1292 // Update `operandMaps` for `newForOp` iterArgs and results. 1293 ValueRange newIterArgs = newForOp.getRegionIterArgs(); 1294 unsigned oldNumIterArgs = oldIterArgs.size(); 1295 ValueRange newResults = newForOp.getResults(); 1296 unsigned oldNumResults = newResults.size() / unrollJamFactor; 1297 assert(oldNumIterArgs == oldNumResults && 1298 "oldNumIterArgs must be the same as oldNumResults"); 1299 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { 1300 for (unsigned j = 0; j < oldNumIterArgs; ++j) { 1301 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and 1302 // results. Update `operandMaps[i - 1]` to map old iterArgs and results 1303 // to those in the `i`th new set. 1304 operandMaps[i - 1].map(newIterArgs[j], 1305 newIterArgs[i * oldNumIterArgs + j]); 1306 operandMaps[i - 1].map(newResults[j], 1307 newResults[i * oldNumResults + j]); 1308 } 1309 } 1310 } 1311 1312 // Scale the step of loop being unroll-jammed by the unroll-jam factor. 1313 int64_t step = forOp.getStep(); 1314 forOp.setStep(step * unrollJamFactor); 1315 1316 auto forOpIV = forOp.getInductionVar(); 1317 // Unroll and jam (appends unrollJamFactor - 1 additional copies). 1318 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { 1319 for (auto &subBlock : subBlocks) { 1320 // Builder to insert unroll-jammed bodies. Insert right at the end of 1321 // sub-block. 1322 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second)); 1323 1324 // If the induction variable is used, create a remapping to the value for 1325 // this unrolled instance. 1326 if (!forOpIV.use_empty()) { 1327 // iv' = iv + i * step, i = 1 to unrollJamFactor-1. 1328 auto d0 = builder.getAffineDimExpr(0); 1329 auto bumpMap = AffineMap::get(1, 0, d0 + i * step); 1330 auto ivUnroll = 1331 builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV); 1332 operandMaps[i - 1].map(forOpIV, ivUnroll); 1333 } 1334 // Clone the sub-block being unroll-jammed. 1335 for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) 1336 builder.clone(*it, operandMaps[i - 1]); 1337 } 1338 // Fix iterOperands and yield op operands of newly created loops. 1339 for (auto newForOp : newLoopsWithIterArgs) { 1340 unsigned oldNumIterOperands = 1341 newForOp.getNumIterOperands() / unrollJamFactor; 1342 unsigned numControlOperands = newForOp.getNumControlOperands(); 1343 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator()); 1344 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor; 1345 assert(oldNumIterOperands == oldNumYieldOperands && 1346 "oldNumIterOperands must be the same as oldNumYieldOperands"); 1347 for (unsigned j = 0; j < oldNumIterOperands; ++j) { 1348 // The `i`th duplication of an old iterOperand or yield op operand 1349 // needs to be replaced with a mapped value from `operandMaps[i - 1]` 1350 // if such mapped value exists. 1351 newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j, 1352 operandMaps[i - 1].lookupOrDefault( 1353 newForOp.getOperand(numControlOperands + j))); 1354 yieldOp.setOperand( 1355 i * oldNumYieldOperands + j, 1356 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j))); 1357 } 1358 } 1359 } 1360 if (forOp.getNumResults() > 0) { 1361 // Create reduction ops to combine every `unrollJamFactor` related results 1362 // into one value. For example, for %0:2 = affine.for ... and addf, we add 1363 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with 1364 // %1. 1365 builder.setInsertionPointAfter(forOp); 1366 auto loc = forOp.getLoc(); 1367 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor; 1368 for (LoopReduction &reduction : reductions) { 1369 unsigned pos = reduction.iterArgPosition; 1370 Value lhs = forOp.getResult(pos); 1371 Value rhs; 1372 SmallPtrSet<Operation *, 4> newOps; 1373 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { 1374 rhs = forOp.getResult(i * oldNumResults + pos); 1375 // Create ops based on reduction type. 1376 lhs = arith::getReductionOp(reduction.kind, builder, loc, lhs, rhs); 1377 if (!lhs) 1378 return failure(); 1379 Operation *op = lhs.getDefiningOp(); 1380 assert(op && "Reduction op should have been created"); 1381 newOps.insert(op); 1382 } 1383 // Replace all uses except those in newly created reduction ops. 1384 forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps); 1385 } 1386 } 1387 1388 // Promote the loop body up if this has turned into a single iteration loop. 1389 (void)promoteIfSingleIteration(forOp); 1390 return success(); 1391 } 1392 1393 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is 1394 /// nested within 'forOpA' as the only non-terminator operation in its block. 1395 void mlir::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) { 1396 assert(&*forOpA.getBody()->begin() == forOpB.getOperation()); 1397 auto &forOpABody = forOpA.getBody()->getOperations(); 1398 auto &forOpBBody = forOpB.getBody()->getOperations(); 1399 1400 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just 1401 // before forOpA (in ForOpA's parent's block) this should leave 'forOpA's 1402 // body containing only the terminator. 1403 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA), 1404 forOpABody, forOpABody.begin(), 1405 std::prev(forOpABody.end())); 1406 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's 1407 // body (this leaves forOpB's body containing only the terminator). 1408 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(), 1409 std::prev(forOpBBody.end())); 1410 // 3) Splice forOpA into the beginning of forOpB's body. 1411 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(), 1412 Block::iterator(forOpA)); 1413 } 1414 1415 // Checks each dependence component against the permutation to see if the 1416 // desired loop interchange would violate dependences by making the 1417 // dependence component lexicographically negative. 1418 static bool checkLoopInterchangeDependences( 1419 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec, 1420 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) { 1421 // Invert permutation map. 1422 unsigned maxLoopDepth = loops.size(); 1423 SmallVector<unsigned, 4> loopPermMapInv; 1424 loopPermMapInv.resize(maxLoopDepth); 1425 for (unsigned i = 0; i < maxLoopDepth; ++i) 1426 loopPermMapInv[loopPermMap[i]] = i; 1427 1428 // Check each dependence component against the permutation to see if the 1429 // desired loop interchange permutation would make the dependence vectors 1430 // lexicographically negative. 1431 // Example 1: [-1, 1][0, 0] 1432 // Example 2: [0, 0][-1, 1] 1433 for (const auto &depComps : depCompsVec) { 1434 assert(depComps.size() >= maxLoopDepth); 1435 // Check if the first non-zero dependence component is positive. 1436 // This iterates through loops in the desired order. 1437 for (unsigned j = 0; j < maxLoopDepth; ++j) { 1438 unsigned permIndex = loopPermMapInv[j]; 1439 assert(depComps[permIndex].lb); 1440 int64_t depCompLb = *depComps[permIndex].lb; 1441 if (depCompLb > 0) 1442 break; 1443 if (depCompLb < 0) 1444 return false; 1445 } 1446 } 1447 return true; 1448 } 1449 1450 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly 1451 /// nested sequence of loops in 'loops' would violate dependences. 1452 bool mlir::isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops, 1453 ArrayRef<unsigned> loopPermMap) { 1454 // Gather dependence components for dependences between all ops in loop nest 1455 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth]. 1456 assert(loopPermMap.size() == loops.size()); 1457 unsigned maxLoopDepth = loops.size(); 1458 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec; 1459 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec); 1460 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap); 1461 } 1462 1463 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear 1464 /// in it from outermost to innermost. 1465 bool LLVM_ATTRIBUTE_UNUSED 1466 mlir::isPerfectlyNested(ArrayRef<AffineForOp> loops) { 1467 assert(!loops.empty() && "no loops provided"); 1468 1469 // We already know that the block can't be empty. 1470 auto hasTwoElements = [](Block *block) { 1471 auto secondOpIt = std::next(block->begin()); 1472 return secondOpIt != block->end() && &*secondOpIt == &block->back(); 1473 }; 1474 1475 auto enclosingLoop = loops.front(); 1476 for (auto loop : loops.drop_front()) { 1477 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp()); 1478 // parentForOp's body should be just this loop and the terminator. 1479 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody())) 1480 return false; 1481 enclosingLoop = loop; 1482 } 1483 return true; 1484 } 1485 1486 // input[i] should move from position i -> permMap[i]. Returns the position in 1487 // `input` that becomes the new outermost loop. 1488 unsigned mlir::permuteLoops(MutableArrayRef<AffineForOp> input, 1489 ArrayRef<unsigned> permMap) { 1490 assert(input.size() == permMap.size() && "invalid permutation map size"); 1491 // Check whether the permutation spec is valid. This is a small vector - we'll 1492 // just sort and check if it's iota. 1493 SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end()); 1494 llvm::sort(checkPermMap); 1495 if (llvm::any_of(llvm::enumerate(checkPermMap), 1496 [](const auto &en) { return en.value() != en.index(); })) 1497 assert(false && "invalid permutation map"); 1498 1499 // Nothing to do. 1500 if (input.size() < 2) 1501 return 0; 1502 1503 assert(isPerfectlyNested(input) && "input not perfectly nested"); 1504 1505 // Compute the inverse mapping, invPermMap: since input[i] goes to position 1506 // permMap[i], position i of the permuted nest is at input[invPermMap[i]]. 1507 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap; 1508 for (unsigned i = 0, e = input.size(); i < e; ++i) 1509 invPermMap.push_back({permMap[i], i}); 1510 llvm::sort(invPermMap); 1511 1512 // Move the innermost loop body to the loop that would be the innermost in the 1513 // permuted nest (only if the innermost loop is going to change). 1514 if (permMap.back() != input.size() - 1) { 1515 auto *destBody = input[invPermMap.back().second].getBody(); 1516 auto *srcBody = input.back().getBody(); 1517 destBody->getOperations().splice(destBody->begin(), 1518 srcBody->getOperations(), srcBody->begin(), 1519 std::prev(srcBody->end())); 1520 } 1521 1522 // We'll move each loop in `input` in the reverse order so that its body is 1523 // empty when we are moving it; this incurs zero copies and no erasing. 1524 for (int i = input.size() - 1; i >= 0; --i) { 1525 // If this has to become the outermost loop after permutation, add it to the 1526 // parent block of the original root. 1527 if (permMap[i] == 0) { 1528 // If the root remains the same, nothing to do. 1529 if (i == 0) 1530 continue; 1531 // Make input[i] the new outermost loop moving it into parentBlock. 1532 auto *parentBlock = input[0]->getBlock(); 1533 parentBlock->getOperations().splice(Block::iterator(input[0]), 1534 input[i]->getBlock()->getOperations(), 1535 Block::iterator(input[i])); 1536 continue; 1537 } 1538 1539 // If the parent in the permuted order is the same as in the original, 1540 // nothing to do. 1541 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second; 1542 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput) 1543 continue; 1544 1545 // Move input[i] to its surrounding loop in the transformed nest. 1546 auto *destBody = input[parentPosInInput].getBody(); 1547 destBody->getOperations().splice(destBody->begin(), 1548 input[i]->getBlock()->getOperations(), 1549 Block::iterator(input[i])); 1550 } 1551 1552 return invPermMap[0].second; 1553 } 1554 1555 // Sinks all sequential loops to the innermost levels (while preserving 1556 // relative order among them) and moves all parallel loops to the 1557 // outermost (while again preserving relative order among them). 1558 AffineForOp mlir::sinkSequentialLoops(AffineForOp forOp) { 1559 SmallVector<AffineForOp, 4> loops; 1560 getPerfectlyNestedLoops(loops, forOp); 1561 if (loops.size() < 2) 1562 return forOp; 1563 1564 // Gather dependence components for dependences between all ops in loop nest 1565 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth]. 1566 unsigned maxLoopDepth = loops.size(); 1567 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec; 1568 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec); 1569 1570 // Mark loops as either parallel or sequential. 1571 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true); 1572 for (auto &depComps : depCompsVec) { 1573 assert(depComps.size() >= maxLoopDepth); 1574 for (unsigned j = 0; j < maxLoopDepth; ++j) { 1575 DependenceComponent &depComp = depComps[j]; 1576 assert(depComp.lb.has_value() && depComp.ub.has_value()); 1577 if (depComp.lb.value() != 0 || depComp.ub.value() != 0) 1578 isParallelLoop[j] = false; 1579 } 1580 } 1581 1582 // Count the number of parallel loops. 1583 unsigned numParallelLoops = 0; 1584 for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i) 1585 if (isParallelLoop[i]) 1586 ++numParallelLoops; 1587 1588 // Compute permutation of loops that sinks sequential loops (and thus raises 1589 // parallel loops) while preserving relative order. 1590 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth); 1591 unsigned nextSequentialLoop = numParallelLoops; 1592 unsigned nextParallelLoop = 0; 1593 for (unsigned i = 0; i < maxLoopDepth; ++i) { 1594 if (isParallelLoop[i]) { 1595 loopPermMap[i] = nextParallelLoop++; 1596 } else { 1597 loopPermMap[i] = nextSequentialLoop++; 1598 } 1599 } 1600 1601 // Check if permutation 'loopPermMap' would violate dependences. 1602 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap)) 1603 return forOp; 1604 // Perform loop interchange according to permutation 'loopPermMap'. 1605 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap); 1606 return loops[loopNestRootIndex]; 1607 } 1608 1609 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the 1610 // lower (resp. upper) loop bound. When called for both the lower and upper 1611 // bounds, the resulting IR resembles: 1612 // 1613 // ```mlir 1614 // affine.for %i = max (`iv, ...) to min (`iv` + `offset`) { 1615 // ... 1616 // } 1617 // ``` 1618 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map, 1619 SmallVector<Value, 4> *operands, 1620 int64_t offset = 0) { 1621 auto bounds = llvm::to_vector<4>(map->getResults()); 1622 bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset); 1623 operands->insert(operands->begin() + map->getNumDims(), iv); 1624 *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds, 1625 b.getContext()); 1626 canonicalizeMapAndOperands(map, operands); 1627 } 1628 1629 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`. 1630 // Stripmine-sink is a primitive building block for generalized tiling of 1631 // imperfectly nested loops. 1632 // This transformation is purely mechanical and does not check legality, 1633 // profitability or even structural correctness. It is the user's 1634 // responsibility to specify `targets` that are dominated by `forOp`. 1635 // Returns the new AffineForOps, one per `targets`, nested immediately under 1636 // each of the `targets`. 1637 static SmallVector<AffineForOp, 8> 1638 stripmineSink(AffineForOp forOp, uint64_t factor, 1639 ArrayRef<AffineForOp> targets) { 1640 auto originalStep = forOp.getStep(); 1641 auto scaledStep = originalStep * factor; 1642 forOp.setStep(scaledStep); 1643 1644 OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp))); 1645 1646 // Lower-bound map creation. 1647 auto lbMap = forOp.getLowerBoundMap(); 1648 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands()); 1649 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands); 1650 1651 // Upper-bound map creation. 1652 auto ubMap = forOp.getUpperBoundMap(); 1653 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands()); 1654 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands, 1655 /*offset=*/scaledStep); 1656 1657 auto iv = forOp.getInductionVar(); 1658 SmallVector<AffineForOp, 8> innerLoops; 1659 for (auto t : targets) { 1660 // Insert newForOp before the terminator of `t`. 1661 auto b = OpBuilder::atBlockTerminator(t.getBody()); 1662 auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap, 1663 ubOperands, ubMap, originalStep); 1664 auto begin = t.getBody()->begin(); 1665 // Skip terminator and `newForOp` which is just before the terminator. 1666 auto nOps = t.getBody()->getOperations().size() - 2; 1667 newForOp.getBody()->getOperations().splice( 1668 newForOp.getBody()->getOperations().begin(), 1669 t.getBody()->getOperations(), begin, std::next(begin, nOps)); 1670 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(), 1671 newForOp.getRegion()); 1672 innerLoops.push_back(newForOp); 1673 } 1674 1675 return innerLoops; 1676 } 1677 1678 // Stripmines a `forOp` by `factor` and sinks it under a single `target`. 1679 // Returns the new AffineForOps, nested immediately under `target`. 1680 template <typename SizeType> 1681 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor, 1682 AffineForOp target) { 1683 // TODO: Use cheap structural assertions that targets are nested under 1684 // forOp and that targets are not nested under each other when DominanceInfo 1685 // exposes the capability. It seems overkill to construct a whole function 1686 // dominance tree at this point. 1687 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target)); 1688 assert(res.size() == 1 && "Expected 1 inner forOp"); 1689 return res[0]; 1690 } 1691 1692 SmallVector<SmallVector<AffineForOp, 8>, 8> 1693 mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes, 1694 ArrayRef<AffineForOp> targets) { 1695 SmallVector<SmallVector<AffineForOp, 8>, 8> res; 1696 SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end()); 1697 for (auto it : llvm::zip(forOps, sizes)) { 1698 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets); 1699 res.push_back(step); 1700 currentTargets = step; 1701 } 1702 return res; 1703 } 1704 1705 SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps, 1706 ArrayRef<uint64_t> sizes, 1707 AffineForOp target) { 1708 SmallVector<AffineForOp, 8> res; 1709 for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) { 1710 assert(loops.size() == 1); 1711 res.push_back(loops[0]); 1712 } 1713 return res; 1714 } 1715 1716 LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) { 1717 if (loops.size() < 2) 1718 return success(); 1719 1720 AffineForOp innermost = loops.back(); 1721 AffineForOp outermost = loops.front(); 1722 AffineBound ub = outermost.getUpperBound(); 1723 AffineMap origUbMap = ub.getMap(); 1724 Location loc = outermost.getLoc(); 1725 OpBuilder builder(outermost); 1726 for (AffineForOp loop : loops) { 1727 // We only work on normalized loops. 1728 if (loop.getStep() != 1 || !loop.hasConstantLowerBound() || 1729 loop.getConstantLowerBound() != 0) 1730 return failure(); 1731 } 1732 SmallVector<Value, 4> upperBoundSymbols; 1733 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(), 1734 ub.getOperands().end()); 1735 1736 // 1. Store the upper bound of the outermost loop in a variable. 1737 Value prev; 1738 if (!llvm::hasSingleElement(origUbMap.getResults())) 1739 prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands); 1740 else 1741 prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands); 1742 upperBoundSymbols.push_back(prev); 1743 1744 // 2. Emit code computing the upper bound of the coalesced loop as product of 1745 // the number of iterations of all loops. 1746 for (AffineForOp loop : loops.drop_front()) { 1747 ub = loop.getUpperBound(); 1748 origUbMap = ub.getMap(); 1749 ubOperands = ub.getOperands(); 1750 Value upperBound; 1751 // If upper bound map has more than one result, take their minimum. 1752 if (!llvm::hasSingleElement(origUbMap.getResults())) 1753 upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands); 1754 else 1755 upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands); 1756 upperBoundSymbols.push_back(upperBound); 1757 SmallVector<Value, 4> operands; 1758 operands.push_back(prev); 1759 operands.push_back(upperBound); 1760 // Maintain running product of loop upper bounds. 1761 prev = builder.create<AffineApplyOp>( 1762 loc, 1763 AffineMap::get(/*numDims=*/1, 1764 /*numSymbols=*/1, 1765 builder.getAffineDimExpr(0) * 1766 builder.getAffineSymbolExpr(0)), 1767 operands); 1768 } 1769 // Set upper bound of the coalesced loop. 1770 AffineMap newUbMap = AffineMap::get( 1771 /*numDims=*/0, 1772 /*numSymbols=*/1, builder.getAffineSymbolExpr(0), builder.getContext()); 1773 outermost.setUpperBound(prev, newUbMap); 1774 1775 builder.setInsertionPointToStart(outermost.getBody()); 1776 1777 // 3. Remap induction variables. For each original loop, the value of the 1778 // induction variable can be obtained by dividing the induction variable of 1779 // the linearized loop by the total number of iterations of the loops nested 1780 // in it modulo the number of iterations in this loop (remove the values 1781 // related to the outer loops): 1782 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i. 1783 // Compute these iteratively from the innermost loop by creating a "running 1784 // quotient" of division by the range. 1785 Value previous = outermost.getInductionVar(); 1786 for (unsigned idx = loops.size(); idx > 0; --idx) { 1787 if (idx != loops.size()) { 1788 SmallVector<Value, 4> operands; 1789 operands.push_back(previous); 1790 operands.push_back(upperBoundSymbols[idx]); 1791 previous = builder.create<AffineApplyOp>( 1792 loc, 1793 AffineMap::get( 1794 /*numDims=*/1, /*numSymbols=*/1, 1795 builder.getAffineDimExpr(0).floorDiv( 1796 builder.getAffineSymbolExpr(0))), 1797 operands); 1798 } 1799 // Modified value of the induction variables of the nested loops after 1800 // coalescing. 1801 Value inductionVariable; 1802 if (idx == 1) { 1803 inductionVariable = previous; 1804 } else { 1805 SmallVector<Value, 4> applyOperands; 1806 applyOperands.push_back(previous); 1807 applyOperands.push_back(upperBoundSymbols[idx - 1]); 1808 inductionVariable = builder.create<AffineApplyOp>( 1809 loc, 1810 AffineMap::get( 1811 /*numDims=*/1, /*numSymbols=*/1, 1812 builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)), 1813 applyOperands); 1814 } 1815 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(), 1816 inductionVariable, loops.back().getRegion()); 1817 } 1818 1819 // 4. Move the operations from the innermost just above the second-outermost 1820 // loop, delete the extra terminator and the second-outermost loop. 1821 AffineForOp secondOutermostLoop = loops[1]; 1822 innermost.getBody()->back().erase(); 1823 outermost.getBody()->getOperations().splice( 1824 Block::iterator(secondOutermostLoop.getOperation()), 1825 innermost.getBody()->getOperations()); 1826 secondOutermostLoop.erase(); 1827 return success(); 1828 } 1829 1830 void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId, 1831 ArrayRef<Value> numProcessors) { 1832 assert(processorId.size() == numProcessors.size()); 1833 if (processorId.empty()) 1834 return; 1835 1836 OpBuilder b(forOp); 1837 Location loc(forOp.getLoc()); 1838 AffineExpr lhs, rhs; 1839 bindSymbols(forOp.getContext(), lhs, rhs); 1840 auto mulMap = AffineMap::get(0, 2, lhs * rhs); 1841 auto addMap = AffineMap::get(0, 2, lhs + rhs); 1842 1843 Value linearIndex = processorId.front(); 1844 for (unsigned i = 1, e = processorId.size(); i < e; ++i) { 1845 auto mulApplyOp = b.create<AffineApplyOp>( 1846 loc, mulMap, ValueRange{linearIndex, numProcessors[i]}); 1847 linearIndex = b.create<AffineApplyOp>( 1848 loc, addMap, ValueRange{mulApplyOp, processorId[i]}); 1849 } 1850 1851 auto mulApplyOp = b.create<AffineApplyOp>( 1852 loc, mulMap, ValueRange{linearIndex, forOp.getStep()}); 1853 Value lb = b.create<AffineApplyOp>( 1854 loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()}); 1855 forOp.setLowerBound(lb); 1856 1857 Value step = forOp.getStep(); 1858 for (auto numProcs : numProcessors) 1859 step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step}); 1860 forOp.setStep(step); 1861 } 1862 1863 /// Given a memref region, determine the lowest depth at which transfers can be 1864 /// placed for it, and return the corresponding block, start and end positions 1865 /// in the block for placing incoming (read) and outgoing (write) copies 1866 /// respectively. The lowest depth depends on whether the region being accessed 1867 /// is hoistable with respect to one or more immediately surrounding loops. 1868 static void 1869 findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, 1870 Block::iterator &begin, Block::iterator &end, 1871 Block **copyPlacementBlock, 1872 Block::iterator *copyInPlacementStart, 1873 Block::iterator *copyOutPlacementStart) { 1874 const auto *cst = region.getConstraints(); 1875 SmallVector<Value, 4> symbols; 1876 cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols); 1877 1878 SmallVector<AffineForOp, 4> enclosingFors; 1879 getLoopIVs(*block.begin(), &enclosingFors); 1880 // Walk up loop parents till we find an IV on which this region is 1881 // symbolic/variant. 1882 auto it = enclosingFors.rbegin(); 1883 for (auto e = enclosingFors.rend(); it != e; ++it) { 1884 // TODO: also need to be checking this for regions symbols that 1885 // aren't loop IVs, whether we are within their resp. defs' dominance scope. 1886 if (llvm::is_contained(symbols, it->getInductionVar())) 1887 break; 1888 } 1889 1890 if (it != enclosingFors.rbegin()) { 1891 auto lastInvariantIV = *std::prev(it); 1892 *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation()); 1893 *copyOutPlacementStart = std::next(*copyInPlacementStart); 1894 *copyPlacementBlock = lastInvariantIV->getBlock(); 1895 } else { 1896 *copyInPlacementStart = begin; 1897 *copyOutPlacementStart = end; 1898 *copyPlacementBlock = █ 1899 } 1900 } 1901 1902 // Info comprising stride and number of elements transferred every stride. 1903 struct StrideInfo { 1904 int64_t stride; 1905 int64_t numEltPerStride; 1906 }; 1907 1908 /// Returns striding information for a copy/transfer of this region with 1909 /// potentially multiple striding levels from outermost to innermost. For an 1910 /// n-dimensional region, there can be at most n-1 levels of striding 1911 /// successively nested. 1912 // TODO: make this work with non-identity layout maps. 1913 static void getMultiLevelStrides(const MemRefRegion ®ion, 1914 ArrayRef<int64_t> bufferShape, 1915 SmallVectorImpl<StrideInfo> *strideInfos) { 1916 if (bufferShape.size() <= 1) 1917 return; 1918 1919 int64_t numEltPerStride = 1; 1920 int64_t stride = 1; 1921 for (int d = bufferShape.size() - 1; d >= 1; d--) { 1922 int64_t dimSize = region.memref.getType().cast<MemRefType>().getDimSize(d); 1923 stride *= dimSize; 1924 numEltPerStride *= bufferShape[d]; 1925 // A stride is needed only if the region has a shorter extent than the 1926 // memref along the dimension *and* has an extent greater than one along the 1927 // next major dimension. 1928 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) { 1929 strideInfos->push_back({stride, numEltPerStride}); 1930 } 1931 } 1932 } 1933 1934 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and 1935 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and 1936 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper 1937 /// bound information for the copy loop nest. `fastBufOffsets` contain the 1938 /// expressions to be subtracted out from the respective copy loop iterators in 1939 /// order to index the fast buffer. If `copyOut' is true, generates a copy-out; 1940 /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is 1941 /// inserted. 1942 // 1943 /// The copy-in nest is generated as follows as an example for a 2-d region: 1944 /// for x = ... 1945 /// for y = ... 1946 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y] 1947 /// 1948 static AffineForOp 1949 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, 1950 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands, 1951 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands, 1952 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut, 1953 OpBuilder b) { 1954 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) { 1955 return lbMap.getNumInputs() == lbOperands.size(); 1956 })); 1957 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) { 1958 return ubMap.getNumInputs() == ubOperands.size(); 1959 })); 1960 1961 unsigned rank = memref.getType().cast<MemRefType>().getRank(); 1962 assert(lbMaps.size() == rank && "wrong number of lb maps"); 1963 assert(ubMaps.size() == rank && "wrong number of ub maps"); 1964 1965 SmallVector<Value, 4> memIndices; 1966 SmallVector<AffineExpr, 4> fastBufExprs; 1967 SmallVector<Value, 4> fastBufMapOperands; 1968 AffineForOp copyNestRoot; 1969 SmallVector<AffineApplyOp, 4> mayBeDeadApplys; 1970 for (unsigned d = 0; d < rank; ++d) { 1971 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d], 1972 ubOperands, ubMaps[d]); 1973 if (d == 0) 1974 copyNestRoot = forOp; 1975 1976 b = OpBuilder::atBlockTerminator(forOp.getBody()); 1977 1978 auto fastBufOffsetMap = 1979 AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]); 1980 auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands); 1981 1982 // Construct the subscript for the fast memref being copied into/from: 1983 // x - offset_x. 1984 fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) - 1985 b.getAffineDimExpr(2 * d)); 1986 fastBufMapOperands.push_back(offset); 1987 fastBufMapOperands.push_back(forOp.getInductionVar()); 1988 mayBeDeadApplys.push_back(offset); 1989 1990 // Subscript for the slow memref being copied. 1991 memIndices.push_back(forOp.getInductionVar()); 1992 } 1993 1994 auto fastBufMap = 1995 AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext()); 1996 fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands); 1997 fastBufMap = simplifyAffineMap(fastBufMap); 1998 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands); 1999 2000 // Drop any dead affine.applys. 2001 for (auto applyOp : mayBeDeadApplys) 2002 if (applyOp.use_empty()) 2003 applyOp.erase(); 2004 2005 if (!isCopyOut) { 2006 // Copy in. 2007 auto load = b.create<AffineLoadOp>(loc, memref, memIndices); 2008 b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap, 2009 fastBufMapOperands); 2010 return copyNestRoot; 2011 } 2012 2013 // Copy out. 2014 auto load = 2015 b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands); 2016 b.create<AffineStoreOp>(loc, load, memref, memIndices); 2017 return copyNestRoot; 2018 } 2019 2020 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED 2021 emitRemarkForBlock(Block &block) { 2022 return block.getParentOp()->emitRemark(); 2023 } 2024 2025 /// Creates a buffer in the faster memory space for the specified memref region; 2026 /// generates a copy from the lower memory space to this one, and replaces all 2027 /// loads/stores in the block range [`begin', `end') of `block' to load/store 2028 /// from that buffer. Returns failure if copies could not be generated due to 2029 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart` 2030 /// in copyPlacementBlock specify the insertion points where the incoming copies 2031 /// and outgoing copies, respectively, should be inserted (the insertion happens 2032 /// right before the insertion point). Since `begin` can itself be invalidated 2033 /// due to the memref rewriting done from this method, the output argument 2034 /// `nBegin` is set to its replacement (set to `begin` if no invalidation 2035 /// happens). Since outgoing copies could have been inserted at `end`, the 2036 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the 2037 /// size of the fast buffer allocated. 2038 static LogicalResult generateCopy( 2039 const MemRefRegion ®ion, Block *block, Block::iterator begin, 2040 Block::iterator end, Block *copyPlacementBlock, 2041 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart, 2042 AffineCopyOptions copyOptions, DenseMap<Value, Value> &fastBufferMap, 2043 DenseSet<Operation *> ©Nests, uint64_t *sizeInBytes, 2044 Block::iterator *nBegin, Block::iterator *nEnd) { 2045 *nBegin = begin; 2046 *nEnd = end; 2047 2048 func::FuncOp f = begin->getParentOfType<func::FuncOp>(); 2049 OpBuilder topBuilder(f.getBody()); 2050 Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0); 2051 2052 if (begin == end) 2053 return success(); 2054 2055 // Is the copy out point at the end of the block where we are doing 2056 // explicit copying. 2057 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart); 2058 2059 // Copies for read regions are going to be inserted at 'begin'. 2060 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart); 2061 // Copies for write regions are going to be inserted at 'end'. 2062 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart); 2063 OpBuilder &b = region.isWrite() ? epilogue : prologue; 2064 2065 // Builder to create constants at the top level. 2066 auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>(); 2067 OpBuilder top(func.getBody()); 2068 2069 auto loc = region.loc; 2070 auto memref = region.memref; 2071 auto memRefType = memref.getType().cast<MemRefType>(); 2072 2073 if (!memRefType.getLayout().isIdentity()) { 2074 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n"); 2075 return failure(); 2076 } 2077 2078 // Indices to use for the copying. 2079 // Indices for the original memref being copied from/to. 2080 SmallVector<Value, 4> memIndices; 2081 // Indices for the faster buffer being copied into/from. 2082 SmallVector<Value, 4> bufIndices; 2083 2084 unsigned rank = memRefType.getRank(); 2085 SmallVector<int64_t, 4> fastBufferShape; 2086 2087 // Compute the extents of the buffer. 2088 std::vector<SmallVector<int64_t, 4>> lbs; 2089 SmallVector<int64_t, 8> lbDivisors; 2090 lbs.reserve(rank); 2091 Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape( 2092 &fastBufferShape, &lbs, &lbDivisors); 2093 if (!numElements) { 2094 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n"); 2095 return failure(); 2096 } 2097 2098 if (*numElements == 0) { 2099 LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n"); 2100 *sizeInBytes = 0; 2101 return success(); 2102 } 2103 2104 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank); 2105 for (unsigned i = 0; i < rank; ++i) 2106 region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]); 2107 2108 const FlatAffineValueConstraints *cst = region.getConstraints(); 2109 // 'regionSymbols' hold values that this memory region is symbolic/parametric 2110 // on; these typically include loop IVs surrounding the level at which the 2111 // copy generation is being done or other valid symbols in MLIR. 2112 SmallVector<Value, 8> regionSymbols; 2113 cst->getValues(rank, cst->getNumVars(), ®ionSymbols); 2114 2115 // Construct the index expressions for the fast memory buffer. The index 2116 // expression for a particular dimension of the fast buffer is obtained by 2117 // subtracting out the lower bound on the original memref's data region 2118 // along the corresponding dimension. 2119 2120 // Index start offsets for faster memory buffer relative to the original. 2121 SmallVector<AffineExpr, 4> fastBufOffsets; 2122 fastBufOffsets.reserve(rank); 2123 for (unsigned d = 0; d < rank; d++) { 2124 assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size"); 2125 2126 AffineExpr offset = top.getAffineConstantExpr(0); 2127 for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) 2128 offset = offset + lbs[d][j] * top.getAffineDimExpr(j); 2129 assert(lbDivisors[d] > 0); 2130 offset = 2131 (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]); 2132 2133 // Set copy start location for this dimension in the lower memory space 2134 // memref. 2135 if (auto caf = offset.dyn_cast<AffineConstantExpr>()) { 2136 auto indexVal = caf.getValue(); 2137 if (indexVal == 0) { 2138 memIndices.push_back(zeroIndex); 2139 } else { 2140 memIndices.push_back( 2141 top.create<arith::ConstantIndexOp>(loc, indexVal).getResult()); 2142 } 2143 } else { 2144 // The coordinate for the start location is just the lower bound along the 2145 // corresponding dimension on the memory region (stored in 'offset'). 2146 auto map = AffineMap::get( 2147 cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset); 2148 memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols)); 2149 } 2150 // The fast buffer is copied into at location zero; addressing is relative. 2151 bufIndices.push_back(zeroIndex); 2152 2153 // Record the offsets since they are needed to remap the memory accesses of 2154 // the original memref further below. 2155 fastBufOffsets.push_back(offset); 2156 } 2157 2158 // The faster memory space buffer. 2159 Value fastMemRef; 2160 2161 // Check if a buffer was already created. 2162 bool existingBuf = fastBufferMap.count(memref) > 0; 2163 if (!existingBuf) { 2164 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank); 2165 auto fastMemRefType = 2166 MemRefType::get(fastBufferShape, memRefType.getElementType(), 2167 fastBufferLayout, copyOptions.fastMemorySpace); 2168 2169 // Create the fast memory space buffer just before the 'affine.for' 2170 // operation. 2171 fastMemRef = 2172 prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult(); 2173 // Record it. 2174 fastBufferMap[memref] = fastMemRef; 2175 // fastMemRefType is a constant shaped memref. 2176 *sizeInBytes = *getMemRefSizeInBytes(fastMemRefType); 2177 LLVM_DEBUG(emitRemarkForBlock(*block) 2178 << "Creating fast buffer of type " << fastMemRefType 2179 << " and size " << llvm::divideCeil(*sizeInBytes, 1024) 2180 << " KiB\n"); 2181 } else { 2182 // Reuse the one already created. 2183 fastMemRef = fastBufferMap[memref]; 2184 *sizeInBytes = 0; 2185 } 2186 2187 auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements); 2188 2189 Value dmaStride = nullptr; 2190 Value numEltPerDmaStride = nullptr; 2191 if (copyOptions.generateDma) { 2192 SmallVector<StrideInfo, 4> dmaStrideInfos; 2193 getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos); 2194 2195 // TODO: use all stride levels once DmaStartOp is extended for 2196 // multi-level strides. 2197 if (dmaStrideInfos.size() > 1) { 2198 LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n"); 2199 return failure(); 2200 } 2201 2202 if (!dmaStrideInfos.empty()) { 2203 dmaStride = 2204 top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride); 2205 numEltPerDmaStride = top.create<arith::ConstantIndexOp>( 2206 loc, dmaStrideInfos[0].numEltPerStride); 2207 } 2208 } 2209 2210 // Record the last operation where we want the memref replacement to end. We 2211 // later do the memref replacement only in [begin, postDomFilter] so 2212 // that the original memref's used in the data movement code themselves don't 2213 // get replaced. 2214 auto postDomFilter = std::prev(end); 2215 2216 // Create fully composed affine maps for each memref. 2217 auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size()); 2218 fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices); 2219 auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size()); 2220 fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices); 2221 2222 if (!copyOptions.generateDma) { 2223 // Point-wise copy generation. 2224 auto copyNest = 2225 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps, 2226 /*lbOperands=*/regionSymbols, ubMaps, 2227 /*ubOperands=*/regionSymbols, fastBufOffsets, 2228 /*isCopyOut=*/region.isWrite(), b); 2229 2230 // Record this so that we can skip it from yet another copy. 2231 copyNests.insert(copyNest); 2232 2233 // Since new ops are being appended (for copy out's), adjust the end to 2234 // mark end of block range being processed if necessary. 2235 if (region.isWrite() && isCopyOutAtEndOfBlock) 2236 *nEnd = Block::iterator(copyNest.getOperation()); 2237 } else { 2238 // DMA generation. 2239 // Create a tag (single element 1-d memref) for the DMA. 2240 auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {}, 2241 copyOptions.tagMemorySpace); 2242 auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType); 2243 2244 SmallVector<Value, 4> tagIndices({zeroIndex}); 2245 auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size()); 2246 fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices); 2247 if (!region.isWrite()) { 2248 // DMA non-blocking read from original buffer to fast buffer. 2249 b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices, 2250 fastMemRef, bufAffineMap, bufIndices, 2251 tagMemRef, tagAffineMap, tagIndices, 2252 numElementsSSA, dmaStride, numEltPerDmaStride); 2253 } else { 2254 // DMA non-blocking write from fast buffer to the original memref. 2255 auto op = b.create<AffineDmaStartOp>( 2256 loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap, 2257 memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA, 2258 dmaStride, numEltPerDmaStride); 2259 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the 2260 // end to mark end of block range being processed. 2261 if (isCopyOutAtEndOfBlock) 2262 *nEnd = Block::iterator(op.getOperation()); 2263 } 2264 2265 // Matching DMA wait to block on completion; tag always has a 0 index. 2266 b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex, 2267 numElementsSSA); 2268 2269 // Generate dealloc for the tag. 2270 auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef); 2271 if (*nEnd == end && isCopyOutAtEndOfBlock) 2272 // Since new ops are being appended (for outgoing DMAs), adjust the end to 2273 // mark end of range of the original. 2274 *nEnd = Block::iterator(tagDeallocOp.getOperation()); 2275 } 2276 2277 // Generate dealloc for the buffer. 2278 if (!existingBuf) { 2279 auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef); 2280 // When generating pointwise copies, `nEnd' has to be set to deallocOp on 2281 // the fast buffer (since it marks the new end insertion point). 2282 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock) 2283 *nEnd = Block::iterator(bufDeallocOp.getOperation()); 2284 } 2285 2286 // Replace all uses of the old memref with the faster one while remapping 2287 // access indices (subtracting out lower bound offsets for each dimension). 2288 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT], 2289 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT), 2290 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j), 2291 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'. 2292 // d2, d3 correspond to the original indices (%i, %j). 2293 SmallVector<AffineExpr, 4> remapExprs; 2294 remapExprs.reserve(rank); 2295 for (unsigned i = 0; i < rank; i++) { 2296 // The starting operands of indexRemap will be regionSymbols (the symbols on 2297 // which the memref region is parametric); then those corresponding to 2298 // the memref's original indices follow. 2299 auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i); 2300 remapExprs.push_back(dimExpr - fastBufOffsets[i]); 2301 } 2302 auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs, 2303 b.getContext()); 2304 2305 // Record the begin since it may be invalidated by memref replacement. 2306 Block::iterator prevOfBegin; 2307 bool isBeginAtStartOfBlock = (begin == block->begin()); 2308 if (!isBeginAtStartOfBlock) 2309 prevOfBegin = std::prev(begin); 2310 2311 // *Only* those uses within the range [begin, end) of 'block' are replaced. 2312 (void)replaceAllMemRefUsesWith(memref, fastMemRef, 2313 /*extraIndices=*/{}, indexRemap, 2314 /*extraOperands=*/regionSymbols, 2315 /*symbolOperands=*/{}, 2316 /*domOpFilter=*/&*begin, 2317 /*postDomOpFilter=*/&*postDomFilter); 2318 2319 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin); 2320 2321 return success(); 2322 } 2323 2324 /// Construct the memref region to just include the entire memref. Returns false 2325 /// dynamic shaped memref's for now. `numParamLoopIVs` is the number of 2326 /// enclosing loop IVs of `op` (starting from the outermost) that the region 2327 /// is parametric on. 2328 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, 2329 MemRefRegion *region) { 2330 unsigned rank; 2331 if (auto loadOp = dyn_cast<AffineLoadOp>(op)) { 2332 rank = loadOp.getMemRefType().getRank(); 2333 region->memref = loadOp.getMemRef(); 2334 region->setWrite(false); 2335 } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) { 2336 rank = storeOp.getMemRefType().getRank(); 2337 region->memref = storeOp.getMemRef(); 2338 region->setWrite(true); 2339 } else { 2340 assert(false && "expected load or store op"); 2341 return false; 2342 } 2343 auto memRefType = region->memref.getType().cast<MemRefType>(); 2344 if (!memRefType.hasStaticShape()) 2345 return false; 2346 2347 auto *regionCst = region->getConstraints(); 2348 2349 // Just get the first numSymbols IVs, which the memref region is parametric 2350 // on. 2351 SmallVector<AffineForOp, 4> ivs; 2352 getLoopIVs(*op, &ivs); 2353 ivs.resize(numParamLoopIVs); 2354 SmallVector<Value, 4> symbols; 2355 extractForInductionVars(ivs, &symbols); 2356 regionCst->reset(rank, numParamLoopIVs, 0); 2357 regionCst->setValues(rank, rank + numParamLoopIVs, symbols); 2358 2359 // Memref dim sizes provide the bounds. 2360 for (unsigned d = 0; d < rank; d++) { 2361 auto dimSize = memRefType.getDimSize(d); 2362 assert(dimSize > 0 && "filtered dynamic shapes above"); 2363 regionCst->addBound(IntegerPolyhedron::LB, d, 0); 2364 regionCst->addBound(IntegerPolyhedron::UB, d, dimSize - 1); 2365 } 2366 return true; 2367 } 2368 2369 LogicalResult mlir::affineDataCopyGenerate(Block::iterator begin, 2370 Block::iterator end, 2371 const AffineCopyOptions ©Options, 2372 Optional<Value> filterMemRef, 2373 DenseSet<Operation *> ©Nests) { 2374 if (begin == end) 2375 return success(); 2376 2377 assert(begin->getBlock() == std::prev(end)->getBlock() && 2378 "Inconsistent block begin/end args"); 2379 assert(end != end->getBlock()->end() && "end can't be the block terminator"); 2380 2381 Block *block = begin->getBlock(); 2382 2383 // Copies will be generated for this depth, i.e., symbolic in all loops 2384 // surrounding the this block range. 2385 unsigned copyDepth = getNestingDepth(&*begin); 2386 2387 LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth 2388 << "\n"); 2389 LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n"); 2390 LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n"); 2391 2392 // List of memory regions to copy for. We need a map vector to have a 2393 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here 2394 // since the alloc's for example are identical except for the SSA id. 2395 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions; 2396 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions; 2397 2398 // Map from original memref's to the fast buffers that their accesses are 2399 // replaced with. 2400 DenseMap<Value, Value> fastBufferMap; 2401 2402 // To check for errors when walking the block. 2403 bool error = false; 2404 2405 // Walk this range of operations to gather all memory regions. 2406 block->walk(begin, end, [&](Operation *opInst) { 2407 // Gather regions to allocate to buffers in faster memory space. 2408 if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) { 2409 if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) || 2410 (loadOp.getMemRefType().getMemorySpaceAsInt() != 2411 copyOptions.slowMemorySpace)) 2412 return; 2413 } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) { 2414 if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) || 2415 storeOp.getMemRefType().getMemorySpaceAsInt() != 2416 copyOptions.slowMemorySpace) 2417 return; 2418 } else { 2419 // Neither load nor a store op. 2420 return; 2421 } 2422 2423 // Compute the MemRefRegion accessed. 2424 auto region = std::make_unique<MemRefRegion>(opInst->getLoc()); 2425 if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr, 2426 /*addMemRefDimBounds=*/false))) { 2427 LLVM_DEBUG(llvm::dbgs() 2428 << "Error obtaining memory region: semi-affine maps?\n"); 2429 LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n"); 2430 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) { 2431 LLVM_DEBUG( 2432 opInst->emitError("non-constant memref sizes not yet supported")); 2433 error = true; 2434 return; 2435 } 2436 } 2437 2438 // Each memref has a single buffer associated with it irrespective of how 2439 // many load's and store's happen on it. 2440 // TODO: in the future, when regions don't intersect and satisfy 2441 // other properties (based on load/store regions), we could consider 2442 // multiple buffers per memref. 2443 2444 // Add to the appropriate region if it's not already in it, or take a 2445 // bounding box union with the existing one if it's already in there. 2446 // Note that a memref may have both read and write regions - so update the 2447 // region in the other list if one exists (write in case of read and vice 2448 // versa) since there is a single bounding box for a memref across all reads 2449 // and writes that happen on it. 2450 2451 // Attempts to update; returns true if 'region' exists in targetRegions. 2452 auto updateRegion = 2453 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> 2454 &targetRegions) { 2455 const auto *const it = targetRegions.find(region->memref); 2456 if (it == targetRegions.end()) 2457 return false; 2458 2459 // Perform a union with the existing region. 2460 if (failed(it->second->unionBoundingBox(*region))) { 2461 LLVM_DEBUG(llvm::dbgs() 2462 << "Memory region bounding box failed; " 2463 "over-approximating to the entire memref\n"); 2464 // If the union fails, we will overapproximate. 2465 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) { 2466 LLVM_DEBUG(opInst->emitError( 2467 "non-constant memref sizes not yet supported")); 2468 error = true; 2469 return true; 2470 } 2471 it->second->getConstraints()->clearAndCopyFrom( 2472 *region->getConstraints()); 2473 } else { 2474 // Union was computed and stored in 'it->second': copy to 'region'. 2475 region->getConstraints()->clearAndCopyFrom( 2476 *it->second->getConstraints()); 2477 } 2478 return true; 2479 }; 2480 2481 bool existsInRead = updateRegion(readRegions); 2482 if (error) 2483 return; 2484 bool existsInWrite = updateRegion(writeRegions); 2485 if (error) 2486 return; 2487 2488 // Finally add it to the region list. 2489 if (region->isWrite() && !existsInWrite) { 2490 writeRegions[region->memref] = std::move(region); 2491 } else if (!region->isWrite() && !existsInRead) { 2492 readRegions[region->memref] = std::move(region); 2493 } 2494 }); 2495 2496 if (error) { 2497 LLVM_DEBUG(begin->emitError( 2498 "copy generation failed for one or more memref's in this block\n")); 2499 return failure(); 2500 } 2501 2502 uint64_t totalCopyBuffersSizeInBytes = 0; 2503 bool ret = true; 2504 auto processRegions = 2505 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> 2506 ®ions) { 2507 for (const auto ®ionEntry : regions) { 2508 // For each region, hoist copy in/out past all hoistable 2509 // 'affine.for's. 2510 Block::iterator copyInPlacementStart, copyOutPlacementStart; 2511 Block *copyPlacementBlock; 2512 findHighestBlockForPlacement( 2513 *regionEntry.second, *block, begin, end, ©PlacementBlock, 2514 ©InPlacementStart, ©OutPlacementStart); 2515 2516 uint64_t sizeInBytes; 2517 Block::iterator nBegin, nEnd; 2518 LogicalResult iRet = generateCopy( 2519 *regionEntry.second, block, begin, end, copyPlacementBlock, 2520 copyInPlacementStart, copyOutPlacementStart, copyOptions, 2521 fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd); 2522 if (succeeded(iRet)) { 2523 // begin/end could have been invalidated, and need update. 2524 begin = nBegin; 2525 end = nEnd; 2526 totalCopyBuffersSizeInBytes += sizeInBytes; 2527 } 2528 ret = ret & succeeded(iRet); 2529 } 2530 }; 2531 processRegions(readRegions); 2532 processRegions(writeRegions); 2533 2534 if (!ret) { 2535 LLVM_DEBUG(begin->emitError( 2536 "copy generation failed for one or more memref's in this block\n")); 2537 return failure(); 2538 } 2539 2540 // For a range of operations, a note will be emitted at the caller. 2541 AffineForOp forOp; 2542 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) { 2543 LLVM_DEBUG(forOp.emitRemark() 2544 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024) 2545 << " KiB of copy buffers in fast memory space for this block\n"); 2546 } 2547 2548 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) { 2549 StringRef str = "Total size of all copy buffers' for this block " 2550 "exceeds fast memory capacity\n"; 2551 block->getParentOp()->emitWarning(str); 2552 } 2553 2554 return success(); 2555 } 2556 2557 // A convenience version of affineDataCopyGenerate for all ops in the body of 2558 // an AffineForOp. 2559 LogicalResult mlir::affineDataCopyGenerate(AffineForOp forOp, 2560 const AffineCopyOptions ©Options, 2561 Optional<Value> filterMemRef, 2562 DenseSet<Operation *> ©Nests) { 2563 return affineDataCopyGenerate(forOp.getBody()->begin(), 2564 std::prev(forOp.getBody()->end()), copyOptions, 2565 filterMemRef, copyNests); 2566 } 2567 2568 LogicalResult mlir::generateCopyForMemRegion( 2569 const MemRefRegion &memrefRegion, Operation *analyzedOp, 2570 const AffineCopyOptions ©Options, CopyGenerateResult &result) { 2571 Block *block = analyzedOp->getBlock(); 2572 auto begin = analyzedOp->getIterator(); 2573 auto end = std::next(begin); 2574 DenseMap<Value, Value> fastBufferMap; 2575 DenseSet<Operation *> copyNests; 2576 2577 auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end, 2578 copyOptions, fastBufferMap, copyNests, 2579 &result.sizeInBytes, &begin, &end); 2580 if (failed(err)) 2581 return err; 2582 2583 const auto &en = fastBufferMap.find(memrefRegion.memref); 2584 // In some cases (empty loops), no copy generation would have happened. 2585 if (en == fastBufferMap.end()) 2586 return failure(); 2587 result.alloc = en->second.getDefiningOp(); 2588 assert(result.alloc && "fast buffer expected to be locally allocated"); 2589 assert(copyNests.size() <= 1 && "At most one copy nest is expected."); 2590 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin(); 2591 return success(); 2592 } 2593 2594 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. 2595 static void 2596 gatherLoopsInBlock(Block *block, unsigned currLoopDepth, 2597 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { 2598 // Add a new empty level to output if it doesn't exist level already. 2599 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth"); 2600 if (currLoopDepth == depthToLoops.size()) 2601 depthToLoops.emplace_back(); 2602 2603 for (auto &op : *block) { 2604 if (auto forOp = dyn_cast<AffineForOp>(op)) { 2605 depthToLoops[currLoopDepth].push_back(forOp); 2606 gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops); 2607 } 2608 } 2609 } 2610 2611 /// Gathers all AffineForOps in 'func.func' grouped by loop depth. 2612 void mlir::gatherLoops(func::FuncOp func, 2613 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { 2614 for (auto &block : func) 2615 gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops); 2616 2617 // Remove last loop level from output since it's empty. 2618 if (!depthToLoops.empty()) { 2619 assert(depthToLoops.back().empty() && "Last loop level is not empty?"); 2620 depthToLoops.pop_back(); 2621 } 2622 } 2623 2624 // TODO: if necessary, this can be extended to also compose in any 2625 // affine.applys, fold to constant if all result dimensions of the map are 2626 // constant (canonicalizeMapAndOperands below already does this for single 2627 // result bound maps), and use simplifyMap to perform algebraic simplification. 2628 AffineForOp mlir::createCanonicalizedAffineForOp( 2629 OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap, 2630 ValueRange ubOperands, AffineMap ubMap, int64_t step) { 2631 SmallVector<Value, 4> lowerOperands(lbOperands); 2632 SmallVector<Value, 4> upperOperands(ubOperands); 2633 2634 fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands); 2635 canonicalizeMapAndOperands(&lbMap, &lowerOperands); 2636 lbMap = removeDuplicateExprs(lbMap); 2637 fullyComposeAffineMapAndOperands(&ubMap, &upperOperands); 2638 canonicalizeMapAndOperands(&ubMap, &upperOperands); 2639 ubMap = removeDuplicateExprs(ubMap); 2640 2641 return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap, 2642 step); 2643 } 2644 2645 /// Creates an AffineIfOp that encodes the conditional to choose between 2646 /// the constant trip count version and an unknown trip count version of this 2647 /// nest of loops. This is used to separate partial and full tiles if `loops` 2648 /// has the intra-tile loops. The affine.if op is inserted at the builder 2649 /// insertion point of `b`. 2650 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops, 2651 OpBuilder b) { 2652 if (loops.empty()) 2653 return nullptr; 2654 2655 auto *context = loops[0].getContext(); 2656 2657 FlatAffineValueConstraints cst; 2658 SmallVector<Operation *, 8> ops; 2659 llvm::append_range(ops, loops); 2660 (void)getIndexSet(ops, &cst); 2661 2662 // Remove constraints that are independent of these loop IVs. 2663 cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size()); 2664 2665 // Construct the constraint set representing the guard for full tiles. The 2666 // lower bound (and upper bound) corresponding to the full tile should be 2667 // larger (and resp. smaller) than any other lower (or upper bound). 2668 SmallVector<int64_t, 8> fullTileLb, fullTileUb; 2669 for (auto loop : loops) { 2670 (void)loop; 2671 // TODO: Non-unit stride is not an issue to generalize to. 2672 assert(loop.getStep() == 1 && "point loop step expected to be one"); 2673 // Mark everything symbols for the purpose of finding a constant diff pair. 2674 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() - 2675 1); 2676 unsigned fullTileLbPos, fullTileUbPos; 2677 if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr, 2678 /*boundFloorDivisor=*/nullptr, 2679 /*ub=*/nullptr, &fullTileLbPos, 2680 &fullTileUbPos)) { 2681 LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n"); 2682 return nullptr; 2683 } 2684 2685 SmallVector<unsigned, 4> lbIndices, ubIndices; 2686 cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices); 2687 2688 auto fLb = cst.getInequality(fullTileLbPos); 2689 auto fUb = cst.getInequality(fullTileUbPos); 2690 fullTileLb.assign(fLb.begin(), fLb.end()); 2691 fullTileUb.assign(fUb.begin(), fUb.end()); 2692 2693 // Full tile lower bound should be >= than any other lower bound. 2694 for (auto lbIndex : lbIndices) 2695 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i) 2696 cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i); 2697 2698 // Full tile upper bound should be <= any other upper bound. 2699 for (auto ubIndex : ubIndices) 2700 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i) 2701 cst.atIneq(ubIndex, i) -= fullTileUb[i]; 2702 2703 cst.removeVar(0); 2704 } 2705 2706 // The previous step leads to all zeros for the full tile lb and ub position 2707 // itself; remove those and any other duplicates / trivial redundancies. 2708 cst.removeTrivialRedundancy(); 2709 2710 // Turn everything into dims conservatively since we earlier turned all 2711 // trailing ids past point loop IV into symbols. Some of these could be outer 2712 // loop IVs; we'll canonicalize anyway. 2713 cst.setDimSymbolSeparation(0); 2714 2715 IntegerSet ifCondSet = cst.getAsIntegerSet(context); 2716 // ifCondSet can be null if cst was empty -- this can happen if all loops 2717 // in the nest have constant trip counts. 2718 if (!ifCondSet) 2719 return nullptr; 2720 2721 SmallVector<Value, 4> setOperands; 2722 cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands); 2723 canonicalizeSetAndOperands(&ifCondSet, &setOperands); 2724 return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands, 2725 /*withElseRegion=*/true); 2726 } 2727 2728 /// Create the full tile loop nest (along with its body). 2729 static LogicalResult 2730 createFullTiles(MutableArrayRef<AffineForOp> inputNest, 2731 SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) { 2732 fullTileLoops.reserve(inputNest.size()); 2733 2734 // For each loop in the original nest identify a lower/upper bound pair such 2735 // that their difference is a constant. 2736 FlatAffineValueConstraints cst; 2737 for (auto loop : inputNest) { 2738 // TODO: straightforward to generalize to a non-unit stride. 2739 if (loop.getStep() != 1) { 2740 LLVM_DEBUG(llvm::dbgs() 2741 << "[tile separation] non-unit stride not implemented\n"); 2742 return failure(); 2743 } 2744 SmallVector<Operation *, 1> loopOp{loop.getOperation()}; 2745 (void)getIndexSet(loopOp, &cst); 2746 // We will mark everything other than this loop IV as symbol for getting a 2747 // pair of <lb, ub> with a constant difference. 2748 cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1); 2749 unsigned lbPos, ubPos; 2750 if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr, 2751 /*lbDivisor=*/nullptr, /*ub=*/nullptr, 2752 &lbPos, &ubPos) || 2753 lbPos == ubPos) { 2754 LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / " 2755 "equalities not yet handled\n"); 2756 return failure(); 2757 } 2758 2759 // Set all variables as dimensions uniformly since some of those marked as 2760 // symbols above could be outer loop IVs (corresponding tile space IVs). 2761 cst.setDimSymbolSeparation(/*newSymbolCount=*/0); 2762 2763 AffineValueMap lbVmap, ubVmap; 2764 cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext()); 2765 cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext()); 2766 AffineForOp fullTileLoop = createCanonicalizedAffineForOp( 2767 b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(), 2768 ubVmap.getOperands(), ubVmap.getAffineMap()); 2769 b = OpBuilder::atBlockTerminator(fullTileLoop.getBody()); 2770 fullTileLoops.push_back(fullTileLoop); 2771 } 2772 2773 // Add the body for the full tile loop nest. 2774 BlockAndValueMapping operandMap; 2775 for (const auto &loopEn : llvm::enumerate(inputNest)) 2776 operandMap.map(loopEn.value().getInductionVar(), 2777 fullTileLoops[loopEn.index()].getInductionVar()); 2778 b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody()); 2779 for (auto &op : inputNest.back().getBody()->without_terminator()) 2780 b.clone(op, operandMap); 2781 return success(); 2782 } 2783 2784 LogicalResult 2785 mlir::separateFullTiles(MutableArrayRef<AffineForOp> inputNest, 2786 SmallVectorImpl<AffineForOp> *fullTileNest) { 2787 if (inputNest.empty()) 2788 return success(); 2789 2790 auto firstLoop = inputNest[0]; 2791 2792 // Each successive for op has to be nested in the other. 2793 auto prevLoop = firstLoop; 2794 for (auto loop : inputNest.drop_front(1)) { 2795 assert(loop->getParentOp() == prevLoop && "input not contiguously nested"); 2796 prevLoop = loop; 2797 } 2798 2799 // Create the full tile loop nest. 2800 SmallVector<AffineForOp, 4> fullTileLoops; 2801 OpBuilder b(firstLoop); 2802 if (failed(createFullTiles(inputNest, fullTileLoops, b))) { 2803 if (!fullTileLoops.empty()) 2804 fullTileLoops.front().erase(); 2805 return failure(); 2806 } 2807 2808 // Create and insert the version select right before the root of the nest. 2809 b = OpBuilder(firstLoop); 2810 AffineIfOp ifOp = createSeparationCondition(inputNest, b); 2811 if (!ifOp) { 2812 fullTileLoops.front().erase(); 2813 LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating " 2814 "separation condition\n"); 2815 return failure(); 2816 } 2817 2818 // Move the full tile into the then block. 2819 Block *thenBlock = ifOp.getThenBlock(); 2820 AffineForOp outermostFullTileLoop = fullTileLoops[0]; 2821 thenBlock->getOperations().splice( 2822 std::prev(thenBlock->end()), 2823 outermostFullTileLoop->getBlock()->getOperations(), 2824 Block::iterator(outermostFullTileLoop)); 2825 2826 // Move the partial tile into the else block. The partial tile is the same as 2827 // the original loop nest. 2828 Block *elseBlock = ifOp.getElseBlock(); 2829 elseBlock->getOperations().splice(std::prev(elseBlock->end()), 2830 firstLoop->getBlock()->getOperations(), 2831 Block::iterator(firstLoop)); 2832 2833 if (fullTileNest) 2834 *fullTileNest = std::move(fullTileLoops); 2835 2836 return success(); 2837 } 2838