//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. Pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op; a schematic
/// sketch of the resulting rewrite is given after this class declaration.
struct HoistingAnalysis {
  HoistingAnalysis(tensor::PadOp padOp, int numLoops);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops.
  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The outermost loop, determined by `numLoops`, above which `padOp` will
  /// be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps immediately enclosing `padOp` such that:
  ///  1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///  2. their induction variables are used, directly or indirectly, in the
  ///     computation of `padOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

private:
  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is used,
  /// directly or indirectly, to index the padded tensor. The method returns
  /// failure if the filtered backward slice contains an unexpected operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  ///   scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                             // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
                                         tensor::ExtractSliceOp sliceOp);

  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

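// For illustration only, a schematic sketch of the rewrite performed when the
// analysis succeeds and the pad is hoisted by one loop. The op names are real,
// but the value names, shapes, and shown operands are made up; the exact IR
// depends on the loop nest, the slice sizes, and `transposeVector`.
//
// ```
//   scf.for %i {
//     scf.for %j {
//       %slice = tensor.extract_slice %source[%i, %j] ...
//       %padded = tensor.pad %slice ... : tensor<?x?xf32> to tensor<4x8xf32>
//       linalg.matmul ins(%padded, ...) ...
//     }
//   }
// ```
//
// becomes, approximately:
//
// ```
//   scf.for %i {
//     // Pack all padded slices indexed by %j before entering the %j loop.
//     %packed = scf.for %j iter_args(%p = %init) -> tensor<?x4x8xf32> {
//       %slice = tensor.extract_slice %source[%i, %j] ...
//       %padded = tensor.pad %slice ... : tensor<?x?xf32> to tensor<4x8xf32>
//       %inserted = tensor.insert_slice %padded into
//           %p[%iter_j, 0, 0] [1, 4, 8] [1, 1, 1]
//       scf.yield %inserted : tensor<?x4x8xf32>
//     }
//     // The original pad is replaced by a rank-reducing extract_slice.
//     scf.for %j {
//       %0 = tensor.extract_slice %packed[%iter_j, 0, 0] [1, 4, 8] [1, 1, 1]
//           : tensor<?x4x8xf32> to tensor<4x8xf32>
//       linalg.matmul ins(%0, ...) ...
//     }
//   }
// ```
//
// where %iter_j denotes the iteration count of the enclosing %j loop, i.e.
// (%j - lb_j) ceildiv step_j.
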
/// Return true if all uses of `padOp` are input tensors of some LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
  for (OpOperand &use : padOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padOp->getParentOfType<mlir::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

/// Returns the transposed `rankedTensorType` if `transposeVector` is non-empty.
/// Fails if `transposeVector` is not a permutation matching the tensor rank.
static FailureOr<RankedTensorType>
computeTransposedType(RankedTensorType rankedTensorType,
                      ArrayRef<int64_t> transposeVector) {
  if (transposeVector.empty())
    return rankedTensorType;
  if (!isPermutation(transposeVector) ||
      transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
    return failure();

  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
                                       rankedTensorType.getShape().end());
  applyPermutationToVector(transposedShape, transposeVector);

  using RTTBuilder = RankedTensorType::Builder;
  RankedTensorType transposedTensorType =
      RTTBuilder(rankedTensorType).setShape(transposedShape);
  return transposedTensorType;
}

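// Illustrative example for computeTransposedType above (made-up types, not
// actual output): assuming applyPermutationToVector computes
// out[i] = in[transposeVector[i]], computeTransposedType(tensor<2x3x4xf32>,
// [2, 0, 1]) yields tensor<4x2x3xf32>, while a non-permutation such as
// [0, 0, 1] or a rank mismatch returns failure().
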
HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
  valid = false;

  // Bail on any use that isn't an input of a LinalgOp.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padOp))
    return;

  // Get at most `numLoops` of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get the `sliceOp` that defines the source tensor of `padOp` and
  // check its source is defined outside of the outermost loop. This check
  // ensures the padded data is available for packing before entering the
  // outermost enclosing loop.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // // %source is available for packing here!
  // scf.for %i
  //   scf.for %j
  //     scf.for %k
  //       %slice = tensor.extract_slice %source [%i, %j]
  //       %padded_slice = tensor.pad %slice
  // ```
  auto sliceOp = padOp.source().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
    return;
  }
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
    return;
  }

  // Check that the region of `padOp` depends only on a constant. Adding
  // hoisting support for arbitrary padding regions would require cloning all
  // dependencies captured by the padding region.
  Value paddingValue = padOp.getConstantPaddingValue();
  if (!paddingValue ||
      !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
    return;
  }

  // Get all the ops in the backward slice starting from `padOp` that are
  // dominated by the outermost enclosing loop.
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
    return domInfo.dominates(outermostEnclosingForOp, op);
  });
  if (backwardSlice.empty())
    return;
  // Add `padOp` itself to the backward slice.
  backwardSlice.insert(padOp.getOperation());

  // Remove all ops in the backward slice that are not used to index the padded
  // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
  // affine operations used for the index computation.
  if (failed(dropNonIndexDependencies(padOp, sliceOp)))
    return;

  // Add only the loops part of the filtered `backwardSlice` to the packing
  // loops. All other loops are not used to index the padded data and
  // consequently access the same data in every loop iteration. Adding them to
  // the packing loops would increase the cache footprint of the packed data
  // by storing the same data multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
    if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);
  if (packingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
    return;
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

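// Illustrative note on the analysis above: in the example from the
// dropNonIndexDependencies documentation, all three loops enclose the pad but
// only %i and %j appear in the slice indexing, so packingLoops would contain
// the %i and %j loops while the %k loop is left out of the packed footprint.
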
LogicalResult
HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
                                           tensor::ExtractSliceOp sliceOp) {
  // Set of all values used for index computation.
  SetVector<Value> indexEdges;

  // Add all index operands of `operation` to `indexEdges`. An index operand is
  // an operand of type index.
  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };

  // Check if any operation result is contained in `indexEdges`.
  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);
    });
  };

  // Starting from `padOp` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` and remove all operations from `backwardSlice` that are not
  // part of the index computation.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  //   scf.for %j (%arg2 = %unrelated)
  //     scf.for %k                             // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = tensor.pad %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  // indexEdges = [%i, %j, %ubi, %ubj]
  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
  SetVector<Operation *> operationsToRemove;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `padOp` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == padOp || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result is
    // used for index computation.
    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);
      // Check the operands of the remaining operations all have index type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
                          << *op << " -> skip\n");
        return failure();
      }
      // Check the remaining operations do not have regions or memory effects.
      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << *op << " -> skip\n");
        return failure();
      }
      continue;
    }
    // Remove all other operations not used by the index computation. An
    // exception is constant operations that may be used by `padOp`.
    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);
  }
  backwardSlice.set_subtract(operationsToRemove);
  return success();
}

SmallVector<Value>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  SmallVector<Value> dynamicTensorSizes;

  // Upper bound the packing loop lengths to size the packed tensor. Taking
  // upper bounds can make the sizes of the packed tensor independent of the
  // enclosing loops. This independence is a prerequisite for reusing the same
  // buffer for all enclosing loop iterations and hoisting its allocation out
  // of the enclosing loops.
  for (auto forOp : packingLoops) {
    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
    AffineMap boundMap;
    SmallVector<Value> boundOperands;
    getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result to `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{forOp.getLowerBound(), ubVal, forOp.getStep()});
    dynamicTensorSizes.push_back(res);
  }

  return dynamicTensorSizes;
}

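// Illustrative arithmetic for the size computation above (made-up numbers):
// for a packing loop with lower bound 0 and step 4 whose upper bound is itself
// bounded above by 13, the corresponding packed tensor size is
// (13 - 0) ceildiv 4 = 4 slices.
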
static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.getStep()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
        stepVal = forOp.getStep();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
    tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);

  // The actual number of packing loops may be smaller than `numLoops`.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Compute the type of the transposed padded tensor.
  FailureOr<RankedTensorType> transposedTensorType =
      computeTransposedType(paddedTensorType, transposeVector);
  if (failed(transposedTensorType))
    return failure();

  // Create the packed tensor<?x?x..?xtransposedShape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
  // tensor.
  llvm::append_range(packedShape, transposedTensorType->getShape());
  auto packedTensorType = RankedTensorType::get(
      packedShape, transposedTensorType->getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

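  // Illustrative example (shapes made up): with two packing loops and a
  // transposed padded type of tensor<4x8xf32>, `packedTensorType` is
  // tensor<?x?x4x8xf32>, where the two leading dynamic dimensions are sized by
  // the `dynamicTensorSizes` computed above.
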
  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create a GenericOp if `transposeVector` is non-empty.
  //   3. Create an InsertSliceOp at the top of the stack.
  //   4. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically skip the extract_slice(packedTensor) case: this is the
    // piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    // Clone all operations except loops.
    auto forOp = dyn_cast<scf::ForOp>(op);
    if (!forOp) {
      b.clone(*op, bvm);
      continue;
    }
    // Create a packing loop that takes `packedTensor` as iteration argument.
    auto clonedForOp = b.create<scf::ForOp>(
        loc, bvm.lookupOrDefault(forOp.getLowerBound()),
        bvm.lookupOrDefault(forOp.getUpperBound()),
        bvm.lookupOrDefault(forOp.getStep()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : transposedTensorType->getShape()) {
    // TODO: go grab dims when necessary, for now tensor::PadOp returns a
    // static tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  // Stack step 2. create GenericOp if `transposeVector` is non-empty.
  Value paddedTensor = bvm.lookup(opToHoist.result());
  if (!transposeVector.empty()) {
    Value outputTensor = b.create<tensor::ExtractSliceOp>(
        loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
    transposeOps.push_back(
        makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
    paddedTensor = transposeOps.back()->getResult(0);
  }

  // Stack step 3. create InsertSliceOp at the top of the stack.
  Value inserted = b.create<tensor::InsertSliceOp>(
      loc, paddedTensor, packedTensor, offsets, sizes, strides);

  // Stack step 4. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

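  // Illustrative example (shapes made up, empty `transposeVector`): with two
  // packing loops and a padded type of tensor<4x8xf32>, the original
  // tensor.pad is replaced below by
  //   tensor.extract_slice %packedTensor
  //     [%iter_i, %iter_j, 0, 0] [1, 1, 4, 8] [1, 1, 1, 1]
  //     : tensor<?x?x4x8xf32> to tensor<4x8xf32>
  // where %iter_i and %iter_j are the loop iteration counts computed by
  // buildLoopIterationCount.
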
  // Now the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [loopIterationCounts, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [loopIterationCounts, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, *transposedTensorType, packedTensor, offsets, sizes, strides);

  // Transpose the packed tensor back to the original storage order.
  if (!transposeVector.empty()) {
    Value initTensor =
        b.create<InitTensorOp>(loc, ValueRange{}, paddedTensorType.getShape(),
                               paddedTensorType.getElementType());
    transposeOps.push_back(
        makeTransposeOp(b, loc, newResult, initTensor, transposeVector));
    newResult = transposeOps.back()->getResult(0);
  }

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp =
      cast<tensor::PadOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}