//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. Pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non-index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistingAnalysis {
  HoistingAnalysis(tensor::PadOp padOp, int numLoops);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops.
  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The outermost loop, determined by `numLoops`, above which `padOp` will
  /// be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps immediately enclosing `padOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///   2. their induction variable is used, directly or indirectly, in the
  ///      computation of `padOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

private:
  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is used,
  /// directly or indirectly, to index the padded tensor. The method returns
  /// failure if the filtered backward slice contains an unexpected operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)   // not used to index %source!
  ///   scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                            // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
                                         tensor::ExtractSliceOp sliceOp);

  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};
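
// Intended usage pattern (an illustrative sketch; it mirrors how
// hoistPaddingOnTensors at the bottom of this file drives the analysis):
//   HoistingAnalysis analysis(padOp, numLoops);
//   if (!analysis.isValid())
//     return failure();
//   SmallVector<Value> packedSizes = analysis.getPackedTensorSizes(builder);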

/// Return true if all uses of `padOp` are an input tensor of some
/// LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
  for (OpOperand &use : padOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most `nLevels` of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padOp->getParentOfType<func::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

/// Returns the transposed `rankedTensorType` if `transposeVector` is
/// non-empty. Fail if `transposeVector` is not a permutation matching the
/// tensor rank.
static FailureOr<RankedTensorType>
computeTransposedType(RankedTensorType rankedTensorType,
                      ArrayRef<int64_t> transposeVector) {
  if (transposeVector.empty())
    return rankedTensorType;
  if (!isPermutation(transposeVector) ||
      transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
    return failure();

  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
                                       rankedTensorType.getShape().end());
  applyPermutationToVector(transposedShape, transposeVector);

  using RTTBuilder = RankedTensorType::Builder;
  RankedTensorType transposedTensorType =
      RTTBuilder(rankedTensorType).setShape(transposedShape);
  return transposedTensorType;
}
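
// For instance (illustrative values, not taken from the original code):
// computeTransposedType(tensor<7x9xf32>, {1, 0}) returns tensor<9x7xf32>,
// while an empty `transposeVector` returns the type unchanged and a
// non-permutation such as {0, 0} fails.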

HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
  valid = false;

  // Bail on any use that isn't an input of a LinalgOp.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padOp))
    return;

  // Get at most `numLoops` of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get the `sliceOp` that defines the source tensor of `padOp` and
  // check its source is defined outside of the outermost loop. This check
  // ensures the padded data is available for packing before entering the
  // outermost enclosing loop.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // // %source is available for packing here!
  // scf.for %i
  //   scf.for %j
  //     scf.for %k
  //       %slice = tensor.extract_slice %source [%i, %j]
  //       %padded_slice = tensor.pad %slice
  // ```
  auto sliceOp = padOp.source().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
    return;
  }
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
    return;
  }

  // Check that the region of `padOp` depends on a constant only. Adding
  // hoisting support for arbitrary padding regions would require cloning all
  // dependencies captured by the padding region.
  Value paddingValue = padOp.getConstantPaddingValue();
  if (!paddingValue ||
      !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
    return;
  }

  // Get all the ops in the backward slice starting from `padOp` that are
  // dominated by the outermost enclosing loop.
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
    return domInfo.dominates(outermostEnclosingForOp, op);
  });
  if (backwardSlice.empty())
    return;
  // Add `padOp` itself to the backward slice.
  backwardSlice.insert(padOp.getOperation());

  // Remove all ops in the backward slice that are not used to index the padded
  // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
  // affine operations used for the index computation.
  if (failed(dropNonIndexDependencies(padOp, sliceOp)))
    return;

  // Add only the loops that are part of the filtered `backwardSlice` to the
  // packing loops. All other loops are not used to index the padded data and
  // consequently access the same data in every loop iteration. Adding them to
  // the packing loops would increase the cache footprint of the packed data
  // by storing the same data multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
    if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);
  if (packingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
    return;
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

LogicalResult
HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
                                           tensor::ExtractSliceOp sliceOp) {
  // Set of all values used for index computation.
  SetVector<Value> indexEdges;

  // Add all index operands of `operation` to `indexEdges`. An index operand is
  // an operand of type index.
  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };

  // Check if any operation result is contained in `indexEdges`.
  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);
    });
  };

  // Starting from `padOp` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` and remove all operations from `backwardSlice` that are not
  // part of the index computation.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   %unrelated = linalg.fill(%cst, %arg1)   // not used to index %source!
  //   scf.for %j (%arg2 = %unrelated)
  //     scf.for %k                            // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = tensor.pad %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  //   indexEdges = [%i, %j, %ubi, %ubj]
  //   backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
  SetVector<Operation *> operationsToRemove;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `padOp` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == padOp || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result is
    // used for index computation.
    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);
      // Check that the operands of the remaining operations all have index
      // type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
                          << op << " -> skip\n");
        return failure();
      }
      // Check that the remaining operations do not have regions or memory
      // effects.
      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << op << " -> skip\n");
        return failure();
      }
      continue;
    }
    // Remove all other operations not used by the index computation. An
    // exception is constant operations that may be used by `padOp`.
    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);
  }
  backwardSlice.set_subtract(operationsToRemove);
  return success();
}
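
// Worked example for the size computation below (bounds invented for the sake
// of the comment): for a packing loop `scf.for %i = %c0 to %n step %c4` whose
// upper bound %n is itself bounded above by 64, the packed dimension becomes
// ceildiv(64 - 0, 4) = 16, a quantity that no longer depends on any enclosing
// loop.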

SmallVector<Value>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  SmallVector<Value> dynamicTensorSizes;

  // Upper bound the packing loop lengths to size the packed tensor. Taking
  // upper bounds can make the sizes of the packed tensor independent of the
  // enclosing loops. This independence is a prerequisite for reusing the same
  // buffer for all enclosing loop iterations and hoisting its allocation out
  // of the enclosing loops.
  for (auto forOp : packingLoops) {
    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
    AffineMap boundMap;
    SmallVector<Value> boundOperands;
    getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result in `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{forOp.getLowerBound(), ubVal, forOp.getStep()});
    dynamicTensorSizes.push_back(res);
  }

  return dynamicTensorSizes;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
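/// For instance (illustrative): in `scf.for %i = %c3 to %n step %c4`, the
/// induction variable value 3 corresponds to iteration 0 and the value 7 to
/// iteration 1, i.e. the result is (%i - 3) ceildiv 4.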
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.getStep()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
        stepVal = forOp.getStep();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
    tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);

  // Update actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Compute the type of the transposed padded tensor.
  FailureOr<RankedTensorType> transposedTensorType =
      computeTransposedType(paddedTensorType, transposeVector);
  if (failed(transposedTensorType))
    return failure();

  // Create the packed tensor<?x?x..?xtransposedShape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
  // tensor.
  llvm::append_range(packedShape, transposedTensorType->getShape());
  auto packedTensorType = RankedTensorType::get(
      packedShape, transposedTensorType->getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create a GenericOp if `transposeVector` is non-empty.
  //   3. Create an InsertSliceOp at the top of the stack.
  //   4. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
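  //
  // With two packing loops, the nest produced by steps 1-4 looks roughly as
  // follows (an illustrative sketch; names and the 5x7 shape are invented):
  //   %packed = scf.for %i ... iter_args(%p0 = %init) {
  //     %it_i = affine.apply ((%i - %lb_i) ceildiv %step_i)
  //     %r = scf.for %j ... iter_args(%p1 = %p0) {
  //       %it_j = affine.apply ((%j - %lb_j) ceildiv %step_j)
  //       %padded = tensor.pad ...
  //       %ins = tensor.insert_slice %padded into
  //         %p1[%it_i, %it_j, 0, 0] [1, 1, 5, 7] [1, 1, 1, 1]
  //       scf.yield %ins
  //     }
  //     scf.yield %r
  //   }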
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(packedTensor) case: this is
    // the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    // Clone all operations except loops.
    auto forOp = dyn_cast<scf::ForOp>(op);
    if (!forOp) {
      b.clone(*op, bvm);
      continue;
    }
    // Create a packing loop that takes `packedTensor` as iteration argument.
    auto clonedForOp = b.create<scf::ForOp>(
        loc, bvm.lookupOrDefault(forOp.getLowerBound()),
        bvm.lookupOrDefault(forOp.getUpperBound()),
        bvm.lookupOrDefault(forOp.getStep()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : transposedTensorType->getShape()) {
    // TODO: go grab dims when necessary, for now tensor::PadOp returns a
    // static tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  // Stack step 2. create GenericOp if `transposeVector` is non-empty.
  Value paddedTensor = bvm.lookup(opToHoist.result());
  if (!transposeVector.empty()) {
    Value outputTensor = b.create<tensor::ExtractSliceOp>(
        loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
    transposeOps.push_back(
        makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
    paddedTensor = transposeOps.back()->getResult(0);
  }

  // Stack step 3. create InsertSliceOp at the top of the stack.
  Value inserted = b.create<tensor::InsertSliceOp>(
      loc, paddedTensor, packedTensor, offsets, sizes, strides);

  // Stack step 4. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }
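
  // Note that the packing loop nest built above was emitted in front of the
  // outermost enclosing loop, while the replacement slice below is emitted
  // back at the position of the original pad, inside the original loop nest.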

  // Now the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [loopIterationCounts, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [loopIterationCounts, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, *transposedTensorType, packedTensor, offsets, sizes, strides);

  // Transpose the packed tensor back to the original storage order.
  if (!transposeVector.empty()) {
    Value initTensor =
        b.create<InitTensorOp>(loc, ValueRange{}, paddedTensorType.getShape(),
                               paddedTensorType.getElementType());
    transposeOps.push_back(
        makeTransposeOp(b, loc, newResult, initTensor, transposeVector));
    newResult = transposeOps.back()->getResult(0);
  }

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp =
      cast<tensor::PadOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}