//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support PadTensorOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. There is no immediately enclosing scf::ForOp.
///   3. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with a region.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      is empty.
///   5. The source tensor of the pad op is not defined by an extract slice op.
///   6. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   7. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistingAnalysis {
  HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops and
  /// `backwardSlice`.
  FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The padTensorOp that needs to be hoisted.
  PadTensorOp padTensorOp;

  /// The maximum number of immediately enclosing scf::ForOp to hoist over.
  int nLevels;

  /// The outermost loop, determined by `nLevels`, above which `padTensorOp`
  /// will be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padTensorOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps immediately enclosing `padTensorOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///   2. their induction variable is used, directly or indirectly, in the
  ///      computation of `padTensorOp`.
  /// The span of these loops determines the footprint of the packed tensor.
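  /// Only these loops contribute a leading dynamic dimension (one per packing
  /// loop) to the packed tensor created below.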
  /// SmallSetVector<scf::ForOp> packingLoops;
  SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>>
      packingLoops;

private:
  /// Returns the loops in `backwardSlice` used to index the padded data. The
  /// method starts from `padTensorOp` and `sliceOp`, follows the use-def
  /// chains of their index operands, and stores any enclosing loop whose
  /// induction variable is part of the walked index computation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   scf.for %j
  ///     scf.for %k // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = linalg.pad_tensor %slice
  /// ```
  /// getIndexingLoops(%padded_slice, %slice) returns [scf.for %i, scf.for %j].
  SetVector<Operation *> getIndexingLoops(PadTensorOp padTensorOp,
                                          tensor::ExtractSliceOp sliceOp);

  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

/// Return true if all uses of `padTensorOp` are input tensors of some
/// LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) {
  for (OpOperand &use : padTensorOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padTensorOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
    : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;

  // Bail on any use that isn't an input of a Linalg op.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp))
    return;

  // Get at most nLevels of immediately enclosing loops.
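  // `reverseEnclosingLoops` is ordered innermost first; its last entry is the
  // outermost loop above which `padTensorOp` may be hoisted.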
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get all the ops in the backward slice starting from `padTensorOp` that
  // are dominated by the outermost enclosing loop.
  // Bail on any op with a region that is neither an scf::ForOp nor a LinalgOp.
  bool analysisFailure = false;
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(
      padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) {
        if (!domInfo.dominates(outermostEnclosingForOp, op))
          return false;
        if (op != padTensorOp && op->getNumRegions() > 0 &&
            !isa<scf::ForOp, LinalgOp>(op)) {
          analysisFailure = true;
          LLVM_DEBUG(DBGS()
                     << "Unsupported op with region: " << *op << " -> skip\n");
          return false;
        }
        return true;
      });

  if (analysisFailure || backwardSlice.empty())
    return;

  // Get the `sliceOp` that defines the source tensor of `padTensorOp` and
  // check its source is defined outside of the outermost loop. This check
  // ensures the padded data is available for packing before entering the
  // outermost enclosing loop.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // // %source is available for packing here!
  // scf.for %i
  //   scf.for %j
  //     scf.for %k
  //       %slice = tensor.extract_slice %source [%i, %j]
  //       %padded_slice = linalg.pad_tensor %slice
  // ```
  auto sliceOp = padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
    return;
  }
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
    return;
  }

  // Search the loops found in `backwardSlice` that are used to index the
  // padded data.
  SetVector<Operation *> indexingLoops = getIndexingLoops(padTensorOp, sliceOp);

  // Add only the loops that are part of `indexingLoops` to the packing loops.
  // All other loops are not used to index the padded data and consequently
  // access the same data in every loop iteration. Adding them to the packing
  // loops would increase the cache footprint of the packed data by storing the
  // same data multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
    if (indexingLoops.contains(forOp))
      packingLoops.insert(forOp);
  }
  assert(indexingLoops.size() == packingLoops.size() &&
         "expected all indexing loops to be enclosing loops");
  if (packingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
    return;
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

/// Add all index operands of `operation` to `indexEdges`. An index operand is
/// an operand of type index.
static void addIndexOperandsToIndexEdges(Operation *operation,
                                         SetVector<Value> &indexEdges) {
  for (Value operand : operation->getOperands())
    if (operand.getType().isIndex())
      indexEdges.insert(operand);
}

SetVector<Operation *>
HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
                                   tensor::ExtractSliceOp sliceOp) {
  // Set of all values used for index computation.
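  // The set grows transitively: once one result of an op is known to feed an
  // index computation, the op's own index operands are added as well.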
  SetVector<Value> indexEdges;

  // Starting from `padTensorOp` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` if one of its results is an index edge found so far and store
  // all loops part of the index computation to `indexingLoops`.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   scf.for %j
  //     scf.for %k // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = linalg.pad_tensor %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  //   indexEdges = [%i, %j, %ubi, %ubj]
  //   indexingLoops = [scf.for %i, scf.for %j]
  SetVector<Operation *> indexingLoops;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `padTensorOp` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == padTensorOp || op == sliceOp) {
      addIndexOperandsToIndexEdges(op, indexEdges);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation. Additionally, insert the loop into
    // `indexingLoops`.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op, indexEdges);
        indexingLoops.insert(forOp);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result is
    // used for index computation.
    if (llvm::any_of(op->getResults(),
                     [&](Value result) { return indexEdges.contains(result); }))
      addIndexOperandsToIndexEdges(op, indexEdges);
  }
  return indexingLoops;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}

/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound, up to the `outerLimit` loop, and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backward slice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their result is projected out of the
/// constraint set.
/// The following operations are supported:
///   - scf::ForOp are simply skipped.
///   - AffineApplyOp are composed to replace the result by an equality.
///   - AffineMinOp are composed by adding each entry as an upper bound.
/// Additionally, the following terminal operations are handled:
///   - DimOp and ConstantOp are skipped.
/// If any other operation is met, return failure.
// TODO: extend on a per-need basis.
static LogicalResult
foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
                                  scf::ForOp outerLimit,
                                  ArrayRef<scf::ForOp> loops) {
  SetVector<Value> toProjectOut;
  for (scf::ForOp loop : loops) {
    auto ub = loop.upperBound();

    // Set of all values used for index computation.
    SetVector<Value> indexEdges;
    indexEdges.insert(ub);

    // Compute the backward slice `indexSlice` containing the index computation
    // performed to obtain the upper bound `ub`.
    // Starting from `ub`, add the index operands of an operation to
    // `indexEdges` if one of its results is an index edge. Otherwise, stop the
    // slice computation. For a loop, check if its induction variable is an
    // index edge.
    //
    // Example:
    // ```
    // %c0 = arith.constant 0
    // scf.for %i = %c0 to ...
    //   scf.for %j = %c0 to ...
    //     %ub = affine.min #map(%i)
    //     scf.for %k = %c0 to %ub
    // ```
    // After computing the backward slice we obtain:
    //   indexEdges = [%ub, %i, %c0]
    //   indexSlice = [arith.constant 0, scf.for %i, affine.min #map(%i)]
    SetVector<Operation *> indexSlice;
    getBackwardSlice(ub, &indexSlice, [&](Operation *op) {
      // Continue only along the index operands of the ForOp.
      if (auto forOp = dyn_cast<scf::ForOp>(op)) {
        // Consider only loops part of the enclosing loops.
        if (!outerLimit->isAncestor(op))
          return false;
        if (!indexEdges.contains(forOp.getInductionVar()))
          return false;
        addIndexOperandsToIndexEdges(op, indexEdges);
        return true;
      }
      // All supported index operations have one result.
      assert(op->getNumResults() == 1 &&
             "expect operations to have one result");
      if (!indexEdges.contains(op->getResult(0)))
        return false;
      addIndexOperandsToIndexEdges(op, indexEdges);
      return true;
    });
    indexSlice.insert(ub.getDefiningOp());

    // Iterate over all ops in the slice and compose them in the constraints.
    for (Operation *op : llvm::reverse(indexSlice)) {
      // All ForOps have previously been added to the constraints, and
      // ConstantOp and DimOp are terminals of the index computation.
      if (isa<scf::ForOp, arith::ConstantOp, tensor::DimOp>(op))
        continue;
      // Check all index computation operations are supported.
      if (!isa<AffineApplyOp, AffineMinOp>(op))
        return failure();
      // Ensure there is an id.
      auto ensureIdFailed = [&](Value v) {
        if (constraints.containsId(v)) {
          unsigned pos;
          constraints.findId(v, &pos);
          return pos >= constraints.getNumDimIds();
        }
        constraints.appendDimId(v);
        return false;
      };

      // Ensure all ids exist and add results for later projection.
      if (llvm::any_of(op->getResults(), ensureIdFailed) ||
          llvm::any_of(op->getOperands(), ensureIdFailed))
        return failure();

      // All supported ops have 1 result.
      // TODO: extend when needed.
      assert(op->getNumResults() == 1 &&
             "expect operations to have one result");
      toProjectOut.insert(op->getResult(0));

      // Compose supported ops.
      if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
        AffineValueMap avm(affineApplyOp.getAffineMap(),
                           affineApplyOp.getOperands(),
                           affineApplyOp.getResult());
        if (failed(constraints.composeMap(&avm)))
          return failure();
        continue;
      }
      auto affineMinOp = cast<AffineMinOp>(op);
      unsigned pos;
      bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
      (void)foundMinOp;
      assert(foundMinOp);
      AffineMap alignedMap = constraints.computeAlignedMap(
          affineMinOp.getAffineMap(), affineMinOp.getOperands());
      if (failed(
              constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
        return failure();
    }
  }
  for (Value v : toProjectOut)
    constraints.projectOut(v);
  return success();
}

// Footprint of the packedTensor, computed from the packingLoops and
// `backwardSlice`.
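// Returns one dynamic size per packing loop, equal to that loop's trip count
// (ub - lb).ceilDiv(step), where the bounds are obtained from the constraint
// set after folding in the upper bounds of the enclosing loops.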
FailureOr<SmallVector<Value>>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  // Create the base affine constraints for the packingLoops.
  auto constraints = FlatAffineValueConstraints::getHyperrectangular(
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.upperBound(); })));

  // Iteratively try to fold the upper bounds into the constraints set.
  if (failed(foldUpperBoundsIntoConstraintsSet(
          constraints, outermostEnclosingForOp, packingLoops.getArrayRef())))
    return failure();

  int nPackedLoops = packingLoops.size();
  SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
  // Compute the bounds of the first positions, assuming the others are fixed.
  constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
                             outermostEnclosingForOp->getContext(), &lbs, &ubs);

  SmallVector<Value> allValues;
  constraints.getAllValues(&allValues);
  SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
                                      allValues.end());

  // For each packingLoop, create the extent by (ub - lb).ceilDiv(step).
  // The insertion point is just before the outermost loop that we hoist above.
  assert(nPackedLoops == static_cast<int64_t>(lbs.size()) &&
         "expected matching lb sizes");
  assert(nPackedLoops == static_cast<int64_t>(ubs.size()) &&
         "expected matching ub sizes");
  SmallVector<Value> dynamicTensorSizes;
  for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
    scf::ForOp loop = std::get<0>(it);
    AffineMap lbMap = std::get<1>(it);
    AffineMap ubMap = std::get<2>(it);
    SmallVector<Value> lbOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&lbMap, &lbOperands);
    Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);

    SmallVector<Value> ubOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&ubMap, &ubOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);

    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});

    dynamicTensorSizes.push_back(res);
  }
  return dynamicTensorSizes;
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
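/// For example, for `scf.for %i = %lb to %ub step %s`, the returned value is
/// `affine.apply (%i - %lb) ceildiv %s`, which requires `%lb` and `%s` to be
/// defined outside of `outer` or to be constants.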
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.step()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
        stepVal = forOp.step();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
                                                     int numLoops,
                                                     PadTensorOp &hoistedOp) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
  if (failed(maybeDynamicTensorSizes))
    return failure();
  SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;

  // Update the actual number of loops, which may be smaller than `numLoops`.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now PadTensorOp returns a static
  // tensor.
  llvm::append_range(packedShape, paddedTensorType.getShape());
  auto packedTensorType =
      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create an InsertSliceOp at the top of the stack.
  //   3. Iteratively pop and yield the result of the InsertSliceOp across the
  //      cloned loops.
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Insert `opToHoist` into the backwardSlice so we clone it too.
  analysis.backwardSlice.insert(opToHoist);
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically skip the extract_slice(packedTensor) case: this is the
    // piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
    bool hasNoEffects = !effects || effects.hasNoEffect();
    if (hasNoEffects &&
        (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
      b.clone(*op, bvm);
      continue;
    }
    // TODO: support more cases as they appear.
    auto forOp = dyn_cast<scf::ForOp>(op);
    assert(forOp && "Expected scf::ForOp when hoisting pad ops");
    // Unused loop, just skip it.
    if (!analysis.packingLoops.contains(forOp))
      continue;

    auto clonedForOp =
        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                             bvm.lookupOrDefault(forOp.upperBound()),
                             bvm.lookupOrDefault(forOp.step()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Stack step 2. create an InsertSliceOp at the top of the stack.
  // offsets = [clonedLoopIvs, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : paddedTensorType.getShape()) {
    // TODO: go grab dims when necessary, for now PadTensorOp returns a static
    // tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  Value inserted =
      b.create<tensor::InsertSliceOp>(loc, bvm.lookup(opToHoist.result()),
                                      packedTensor, offsets, sizes, strides);

  // Stack step 3. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now the packed tensor is ready; replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
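  // buildLoopIterationCount returns a null Value if a packing loop's lower
  // bound or step is neither defined outside of `outer` nor a constant.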
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [originalLoopIvs, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides);

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp = cast<PadTensorOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}