//===- HoistPadding.cpp - Hoisting for tensor::PadOp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. Pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistingAnalysis {
  HoistingAnalysis(tensor::PadOp padOp, int numLoops);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops.
  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The outermost loop, determined by `nLevels` above which `padOp` will
  /// be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps enclosing `padOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///   2. their induction variables are used, directly or indirectly, in the
  ///      computation of `padOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

private:
  /// Drop any non-index dependencies of `padOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is used,
  /// directly or indirectly, to index the padded tensor. The method returns
  /// failure if the filtered backward slice contains an unexpected operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  ///   scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                             // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = tensor.pad %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
                                         tensor::ExtractSliceOp sliceOp);

  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

/// Return true if all uses of `padOp` are input tensors of some LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
  for (OpOperand &use : padOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padOp->getParentOfType<mlir::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}
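
// For illustration only (hypothetical input IR, not constructed by this
// file): with nLevels = 3 and
//
//   scf.for %i {
//     scf.if %cond {
//       scf.for %j {
//         ... tensor.pad ...
//       }
//     }
//   }
//
// only [scf.for %j] is collected, because the walk above stops at the first
// parent that is not an scf::ForOp (here the scf.if).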

/// Returns the transposed `rankedTensorType` if `transposeVector` is non-empty.
/// Fail if `transposeVector` is not a permutation matching the tensor rank.
static FailureOr<RankedTensorType>
computeTransposedType(RankedTensorType rankedTensorType,
                      ArrayRef<int64_t> transposeVector) {
  if (transposeVector.empty())
    return rankedTensorType;
  if (!isPermutation(transposeVector) ||
      transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
    return failure();

  SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
                                       rankedTensorType.getShape().end());
  applyPermutationToVector(transposedShape, transposeVector);

  using RTTBuilder = RankedTensorType::Builder;
  RankedTensorType transposedTensorType =
      RTTBuilder(rankedTensorType).setShape(transposedShape);
  return transposedTensorType;
}
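
// A small worked example (illustrative, not exercised in this file):
// computeTransposedType(tensor<4x8xf32>, [1, 0]) yields tensor<8x4xf32>, an
// empty `transposeVector` returns the type unchanged, and a vector whose size
// does not match the rank (e.g. [0] for a 2-D tensor) fails.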

HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
  valid = false;

  // Bail on any use that isn't an input of a LinalgOp.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padOp))
    return;

  // Get at most `numLoops` of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get the `sliceOp` that defines the source tensor of `padOp` and
  // check its source is defined outside of the outermost loop. This check
  // ensures the padded data is available for packing before entering the
  // outermost enclosing loop.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // // %source is available for packing here!
  // scf.for %i
  //   scf.for %j
  //     scf.for %k
  //       %slice = tensor.extract_slice %source [%i, %j]
  //       %padded_slice = tensor.pad %slice
  // ```
  auto sliceOp = padOp.source().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
    return;
  }
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
    return;
  }

  // Check the region of `padOp` depends on a constant only. Adding
  // hoisting support for arbitrary padding regions would require cloning all
  // dependencies captured by the padding region.
  Value paddingValue = padOp.getConstantPaddingValue();
  if (!paddingValue ||
      !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
    return;
  }

  // Get all the ops in the backward slice starting from `padOp` and that
  // are dominated by the outermost enclosing loop.
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
    return domInfo.dominates(outermostEnclosingForOp, op);
  });
  if (backwardSlice.empty())
    return;
  // Add `padOp` itself to the backward slice.
  backwardSlice.insert(padOp.getOperation());

  // Remove all ops in the backward slice that are not used to index the padded
  // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
  // affine operations used for the index computation.
  if (failed(dropNonIndexDependencies(padOp, sliceOp)))
    return;

  // Add only the loops that are part of the filtered `backwardSlice` to the
  // packing loops. All other loops are not used to index the padded data and
  // consequently access the same data in every loop iteration. Adding them to
  // the packing loops would increase the cache footprint of the packed data
  // by storing the same data multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
    if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);
  if (packingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
    return;
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

LogicalResult
HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
                                           tensor::ExtractSliceOp sliceOp) {
  // Set of all values used for index computation.
  SetVector<Value> indexEdges;

  // Add all index operands of `operation` to `indexEdges`. An index operand is
  // an operand of type index.
  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };

  // Check if any operation result is contained in `indexEdges`.
  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);
    });
  };

  // Starting from `padOp` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` and remove all operations from `backwardSlice` that are not
  // part of the index computation.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  //   scf.for %j (%arg2 = %unrelated)
  //     scf.for %k                             // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = tensor.pad %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  // indexEdges = [%i, %j, %ubi, %ubj]
  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
  SetVector<Operation *> operationsToRemove;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `padOp` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == padOp || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result is
    // used for index computation.
    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);
      // Check the operands of the remaining operations all have index type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
                          << *op << " -> skip\n");
        return failure();
      }
      // Check the remaining operations do not have regions or memory effects.
      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << *op << " -> skip\n");
        return failure();
      }
      continue;
    }
    // Remove all other operations not used by the index computation. An
    // exception are constant operations that may be used by `padOp`.
    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);
  }
  backwardSlice.set_subtract(operationsToRemove);
  return success();
}

SmallVector<Value>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  SmallVector<Value> dynamicTensorSizes;

  // Upper bound the packing loop lengths to size the packed tensor. Taking
  // upper bounds can make the sizes of the packed tensor independent of the
  // enclosing loops. This independence is a prerequisite for reusing the same
  // buffer for all enclosing loop iterations and hoisting its allocation out
  // of the enclosing loops.
  for (auto forOp : packingLoops) {
    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
    AffineMap boundMap;
    SmallVector<Value> boundOperands;
    getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result to `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{forOp.getLowerBound(), ubVal,
                   cast<scf::ForOp>(forOp).getStep()});
    dynamicTensorSizes.push_back(res);
  }

  return dynamicTensorSizes;
}
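
// Worked example (hypothetical values, for illustration only): for a packing
// loop `scf.for %i = %c0 to %ub step %c4` where getUpperBoundForIndex bounds
// %ub by the constant 10, the corresponding packed dimension is sized as
// ceildiv(10 - 0, 4) = 3, independently of any enclosing induction variable.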

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.getStep()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
        stepVal = forOp.getStep();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}
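
// Schematic overview (illustrative, simplified IR; the actual operand lists,
// types, and loop structure depend on the input): hoisting by one loop
// rewrites
//
//   scf.for %i = %lb to %ub step %s {
//     %slice = tensor.extract_slice %source[%i] [%sz] [1]
//     %padded = tensor.pad %slice ... : tensor<?xf32> to tensor<4xf32>
//     ... %padded used as a LinalgOp input ...
//   }
//
// into a packing loop that pre-pads every slice into a packed tensor
//
//   %init = linalg.init_tensor [%n, 4] : tensor<?x4xf32>
//   %packed = scf.for %i = %lb to %ub step %s iter_args(%p = %init) {
//     %slice = tensor.extract_slice %source[%i] [%sz] [1]
//     %padded = tensor.pad %slice ...
//     %iter = affine.apply ((%i - %lb) ceildiv %s)
//     %p2 = tensor.insert_slice %padded into %p[%iter, 0] [1, 4] [1, 1]
//     scf.yield %p2
//   }
//
// while the padded operand inside the original loop nest is re-materialized
// with a tensor.extract_slice of %packed at the same [%iter, 0] position,
// which is the value returned to the caller below.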

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
    tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);

  // Update actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Compute the type of the transposed padded tensor.
  FailureOr<RankedTensorType> transposedTensorType =
      computeTransposedType(paddedTensorType, transposeVector);
  if (failed(transposedTensorType))
    return failure();

  // Create the packed tensor<?x?x..?xtransposedShape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
  // tensor.
  llvm::append_range(packedShape, transposedTensorType->getShape());
  auto packedTensorType = RankedTensorType::get(
      packedShape, transposedTensorType->getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());
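
  // For instance (illustrative values): with two packing loops and a
  // transposed padded type of tensor<4x8xf32>, `packedTensorType` is
  // tensor<?x?x4x8xf32> and the init_tensor above receives the two dynamic
  // sizes computed by getPackedTensorSizes.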

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create a GenericOp if `transposeVector` is non-empty.
  //   3. Create an InsertSliceOp at the top of the stack.
  //   4. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(packedTensor) case: this is
    // the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    // Clone all operations except loops.
    auto forOp = dyn_cast<scf::ForOp>(op);
    if (!forOp) {
      b.clone(*op, bvm);
      continue;
    }
    // Create a packing loop that takes `packedTensor` as iteration argument.
    auto clonedForOp = b.create<scf::ForOp>(
        loc, bvm.lookupOrDefault(forOp.getLowerBound()),
        bvm.lookupOrDefault(forOp.getUpperBound()),
        bvm.lookupOrDefault(forOp.getStep()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : transposedTensorType->getShape()) {
    // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
    // tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  // Stack step 2. create GenericOp if `transposeVector` is non-empty.
  Value paddedTensor = bvm.lookup(opToHoist.result());
  if (!transposeVector.empty()) {
    Value outputTensor = b.create<tensor::ExtractSliceOp>(
        loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
    transposeOps.push_back(
        makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
    paddedTensor = transposeOps.back()->getResult(0);
  }

  // Stack step 3. create InsertSliceOp at the top of the stack.
  Value inserted = b.create<tensor::InsertSliceOp>(
      loc, paddedTensor, packedTensor, offsets, sizes, strides);

  // Stack step 4. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [loopIterationCounts, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [loopIterationCounts, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, *transposedTensorType, packedTensor, offsets, sizes, strides);

  // Transpose the packed tensor back to the original storage order.
  if (!transposeVector.empty()) {
    Value initTensor =
        b.create<InitTensorOp>(loc, ValueRange{}, paddedTensorType.getShape(),
                               paddedTensorType.getElementType());
    transposeOps.push_back(
        makeTransposeOp(b, loc, newResult, initTensor, transposeVector));
    newResult = transposeOps.back()->getResult(0);
  }

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp =
      cast<tensor::PadOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}