1 //===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements functions concerned with hoisting padding operations.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
14 #include "mlir/Analysis/SliceAnalysis.h"
15 #include "mlir/Dialect/Linalg/IR/Linalg.h"
16 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
17 #include "mlir/Dialect/SCF/SCF.h"
18 #include "mlir/Dialect/SCF/Utils/Utils.h"
19 #include "mlir/Dialect/Tensor/IR/Tensor.h"
20 #include "mlir/Dialect/Utils/IndexingUtils.h"
21 #include "mlir/Dialect/Vector/IR/VectorOps.h"
22 #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
23 #include "mlir/IR/AsmState.h"
24 #include "mlir/IR/BuiltinOps.h"
25 #include "mlir/IR/Dominance.h"
26 #include "mlir/IR/Matchers.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/Support/Debug.h"
29 
30 using llvm::dbgs;
31 
32 #define DEBUG_TYPE "hoist-padding"
33 
34 #define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
35 
36 using namespace mlir;
37 using namespace mlir::linalg;
38 
39 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
40 /// loops. The failure conditions are:
41 ///   1. Pad op has a use that is not an input of a LinalgOp.
42 ///   2. Pad op does not have a constant padding value.
43 ///   3. There is no immediately enclosing scf::ForOp.
44 ///   4. The backward slice from the pad op to the scf::ForOp to hoist above
45 ///      contains an unknown op with non index type operands, a region, or a
46 ///      memory effect.
47 ///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
48 ///      empty.
49 ///   6. The source tensor of pad op is not defined by an extract slice op.
50 ///   7. The source tensor of the extract slice op is not defined outside of
51 ///      the outermost enclosing scf::ForOp.
52 ///   8. There is no enclosing scf::ForOp that indexes the padded data.
53 /// Other cases succeed and will trigger hoisting of the pad op.
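///
/// For illustration, a rough sketch of the intended transformation when
/// hoisting by one loop (types, sizes, and names below are made up and do not
/// correspond to actual output):
/// ```
/// scf.for %i = %lb to %ub step %s
///   %slice = tensor.extract_slice %src[%i] [%sz] [1] : ... to tensor<?xf32>
///   %pad = tensor.pad %slice ... : tensor<?xf32> to tensor<4xf32>
///   // ... %pad used as a LinalgOp input ...
/// ```
/// becomes, approximately:
/// ```
/// %init = linalg.init_tensor [%n, 4] : tensor<?x4xf32>
/// %packed = scf.for %i = %lb to %ub step %s iter_args(%p = %init)
///   %slice = tensor.extract_slice %src[%i] [%sz] [1] : ... to tensor<?xf32>
///   %pad = tensor.pad %slice ... : tensor<?xf32> to tensor<4xf32>
///   %ins = tensor.insert_slice %pad into %p[%it, 0] [1, 4] [1, 1]
///   scf.yield %ins
/// scf.for %i = %lb to %ub step %s
///   %pad = tensor.extract_slice %packed[%it, 0] [1, 4] [1, 1]
///     : tensor<?x4xf32> to tensor<4xf32>
///   // ... %pad used as a LinalgOp input, as before ...
/// ```
/// where %it is the iteration number (%i - %lb) ceildiv %s and %n
/// upper-bounds the trip count of the loop.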
54 struct HoistingAnalysis {
55   HoistingAnalysis(tensor::PadOp padOp, int numLoops);
56 
57   bool isValid() { return valid; }
58 
59   /// Footprint of the packedTensor, computed from the packingLoops.
60   SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);
61 
  /// The outermost loop, determined by `numLoops`, above which `padOp` will
  /// be hoisted.
64   scf::ForOp outermostEnclosingForOp;
65 
66   /// Backward slice rooted at `padOp` and nested under
67   /// `outermostEnclosingForOp`.
68   SetVector<Operation *> backwardSlice;
69 
  /// The scf::ForOps immediately enclosing `padOp` such that:
  ///  1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///  2. their induction variables are used, directly or indirectly, in the
  ///     computation of `padOp`.
74   /// The span of these loops determines the footprint of the packed tensor.
75   SmallVector<scf::ForOp> packingLoops;
76 
77 private:
78   /// Drop any non-index dependencies of `padOp` and `sliceOp` from
79   /// `backwardSlice`. The method follows the use-def chains of the index
80   /// operands consumed by `padOp` and `sliceOp` and drops the operations
81   /// not part of this index computation. Afterwards, the filtered
82   /// `backwardSlice` contains only the loops whose induction variable is used,
83   /// directly or indirectly, to index the padded tensor. The method returns
84   /// failure if the filtered backward slice contains an unexpected operation.
85   ///
86   /// Example:
87   /// ```
88   /// %source = linalg.fill(%cst, %arg0)
89   /// scf.for %i
90   ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
91   ///   scf.for %j (%arg2 = %unrelated)
92   ///     scf.for %k                             // not used to index %source!
93   ///       %ubi = affine.min #map(%i)
94   ///       %ubj = affine.min #map(%j)
95   ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
96   ///       %padded_slice = tensor.pad %slice
97   /// ```
98   /// dropNonIndexDependencies(%padded_slice, %slice)
99   /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
100   LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
101                                          tensor::ExtractSliceOp sliceOp);
102 
103   /// Encodes whether the analysis is valid and hoisting can proceed.
104   bool valid;
105 };
106 
/// Return true if every use of `padOp` is an input tensor of some
/// LinalgOp.
109 static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
110   for (OpOperand &use : padOp.result().getUses()) {
111     auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
112     if (!linalgUser || !linalgUser.isInputTensor(&use)) {
113       LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
114                         << "\nthat is not an input tensor of a LinalgOp, "
115                         << "cannot hoist\n"
116                         << *(use.getOwner()) << "\n");
117       return false;
118     }
119   }
120   return true;
121 }
122 
123 /// Return at most nLevels of immediately enclosing scf::ForOp loops.
124 /// Stops at the first parent that is not an scf::ForOp.
/// Multi-loop ops such as scf.parallel or linalg.tiled_loop are not modeled
/// at the moment, nor are control-flow and other containing ops with regions.
127 static void
128 getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
129                          SmallVector<scf::ForOp> &reverseEnclosingLoops) {
130   AsmState state(padOp->getParentOfType<mlir::FuncOp>());
131   (void)state;
132   scf::ForOp outermostEnclosingForOp = nullptr;
133   Operation *nextEnclosingOp = padOp->getParentOp();
134   while (nLevels-- > 0 &&
135          (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
136     LLVM_DEBUG(
137         DBGS() << "loops: ";
138         outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
139         dbgs() << "\n");
140     reverseEnclosingLoops.push_back(outermostEnclosingForOp);
141     nextEnclosingOp = outermostEnclosingForOp->getParentOp();
142   }
143 }
144 
/// Return the transposed `rankedTensorType` if `transposeVector` is non-empty.
/// Fail if `transposeVector` is not a permutation matching the tensor rank.
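/// E.g. (illustrative): with transposeVector = [1, 0], tensor<2x3xf32> is
/// transposed to tensor<3x2xf32>.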
147 static FailureOr<RankedTensorType>
148 computeTransposedType(RankedTensorType rankedTensorType,
149                       ArrayRef<int64_t> transposeVector) {
150   if (transposeVector.empty())
151     return rankedTensorType;
152   if (!isPermutation(transposeVector) ||
153       transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
154     return failure();
155 
156   SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
157                                        rankedTensorType.getShape().end());
158   applyPermutationToVector(transposedShape, transposeVector);
159 
160   using RTTBuilder = RankedTensorType::Builder;
161   RankedTensorType transposedTensorType =
162       RTTBuilder(rankedTensorType).setShape(transposedShape);
163   return transposedTensorType;
164 }
165 
166 HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
167   valid = false;
168 
169   // Bail on any use that isn't an input of a LinalgOp.
170   // Hoisting of inplace updates happens after vectorization.
171   if (!isOnlyUsedAsInputOfLinalgOp(padOp))
172     return;
173 
174   // Get at most `numLoops` of immediately enclosing loops.
175   SmallVector<scf::ForOp> reverseEnclosingLoops;
176   getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
177   if (reverseEnclosingLoops.empty()) {
178     LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
179     return;
180   }
181 
182   outermostEnclosingForOp = reverseEnclosingLoops.back();
183 
  // Get the `sliceOp` that defines the source tensor of `padOp` and check
  // that its source is defined outside of the outermost loop. This check
186   // ensures the padded data is available for packing before entering the
187   // outermost enclosing loop.
188   //
189   // Example:
190   // ```
191   // %source = linalg.fill(%cst, %arg0)
192   // // %source is available for packing here!
193   // scf.for %i
194   //   scf.for %j
195   //     scf.for %k
196   //       %slice = tensor.extract_slice %source [%i, %j]
197   //       %padded_slice = tensor.pad %slice
198   // ```
199   auto sliceOp = padOp.source().getDefiningOp<tensor::ExtractSliceOp>();
200   if (!sliceOp) {
201     LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
202     return;
203   }
204   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
205     LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
206     return;
207   }
208 
  // Check that the region of `padOp` depends only on a constant. Adding
210   // hoisting support for arbitrary padding regions would require cloning all
211   // dependencies captured by the padding region.
212   Value paddingValue = padOp.getConstantPaddingValue();
213   if (!paddingValue ||
214       !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
215     LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
216     return;
217   }
218 
  // Get all the ops in the backward slice rooted at `padOp` that are
  // dominated by the outermost enclosing loop.
221   DominanceInfo domInfo(outermostEnclosingForOp);
222   getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
223     return domInfo.dominates(outermostEnclosingForOp, op);
224   });
225   if (backwardSlice.empty())
226     return;
227   // Add `padOp` itself to the backward slice.
228   backwardSlice.insert(padOp.getOperation());
229 
230   // Remove all ops in the backward slice that are not used to index the padded
231   // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
232   // affine operations used for the index computation.
233   if (failed(dropNonIndexDependencies(padOp, sliceOp)))
234     return;
235 
  // Add only the loops that are part of the filtered `backwardSlice` to the
  // packing loops. All other loops are not used to index the padded data and
238   // consequently access the same data in every loop iteration. Adding them to
239   // the packing loops would increase the cache footprint of the packed data
240   // by storing the same data multiple times.
241   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
242     if (backwardSlice.contains(forOp))
243       packingLoops.push_back(forOp);
244   if (packingLoops.empty()) {
245     LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
246     return;
247   }
248 
249   // The analysis is valid and hoisting can occur.
250   valid = true;
251 }
252 
253 LogicalResult
254 HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
255                                            tensor::ExtractSliceOp sliceOp) {
256   // Set of all values used for index computation.
257   SetVector<Value> indexEdges;
258 
259   // Add all index operands of `operation` to `indexEdges`. An index operand is
260   // an operand of type index.
261   auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
262     for (Value operand : operation->getOperands())
263       if (operand.getType().isIndex())
264         indexEdges.insert(operand);
265   };
266 
267   // Check if any operation result is contained in `indexEdges`.
268   auto hasIndexResult = [&](Operation *operation) {
269     return llvm::any_of(operation->getResults(), [&](Value result) {
270       return indexEdges.contains(result);
271     });
272   };
273 
274   // Starting from `padOp` and `sliceOp` walk the use-def edges of index
275   // type in `backwardSlice`. Add the index operands of an operation to
276   // `indexEdges` and remove all operations from `backwardSlice` that are not
277   // part of the index computation.
278   //
279   // Example:
280   // ```
281   // %source = linalg.fill(%cst, %arg0)
282   // scf.for %i
283   //   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
284   //   scf.for %j (%arg2 = %unrelated)
285   //     scf.for %k                             // not used to index %source!
286   //       %ubi = affine.min #map(%i)
287   //       %ubj = affine.min #map(%j)
288   //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
289   //       %padded_slice = tensor.pad %slice
290   // ```
291   // After iterating `backwardSlice` we obtain:
292   // indexEdges = [%i, %j, %ubi, %ubj]
293   // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
294   SetVector<Operation *> operationsToRemove;
295   for (Operation *op : llvm::reverse(backwardSlice)) {
296     // Add the index operands of `padOp` and `sliceOp` to start the
297     // exploration of the index computation.
298     if (op == padOp || op == sliceOp) {
299       addIndexOperandsToIndexEdges(op);
300       continue;
301     }
302     // Add the index operands of the loop if its induction variable is
303     // used for index computation.
304     if (auto forOp = dyn_cast<scf::ForOp>(op)) {
305       if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
306         addIndexOperandsToIndexEdges(op);
307         continue;
308       }
309     }
310     // Add the index operands of all other operations if at least one result is
311     // used for index computation.
312     if (hasIndexResult(op)) {
313       addIndexOperandsToIndexEdges(op);
      // Check that all operands of the remaining operations have index type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
                          << *op << " -> skip\n");
319         return failure();
320       }
321       // Check the remaining operations do not have regions or memory effects.
322       auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
323       bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
324       if (hasMemoryEffect || op->getNumRegions() != 0) {
325         LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
326                           << op << " -> skip\n");
327         return failure();
328       }
329       continue;
330     }
    // Remove all other operations not used by the index computation, with the
    // exception of constant operations that may be used by `padOp`.
333     if (!isa<arith::ConstantOp>(op))
334       operationsToRemove.insert(op);
335   }
336   backwardSlice.set_subtract(operationsToRemove);
337   return success();
338 }
339 
340 SmallVector<Value>
341 HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
342   SmallVector<Value> dynamicTensorSizes;
343 
344   // Upper bound the packing loop lengths to size the packed tensor. Taking
345   // upper bounds can make the sizes of the packed tensor independent of the
346   // enclosing loops. This independence is a prerequisite for reusing the same
347   // buffer for all enclosing loop iterations and hoisting its allocation out of
348   // the enclosing loops.
349   for (auto forOp : packingLoops) {
350     // Compute an upper bound `ubVal` for the upper bound of `forOp`.
351     AffineMap boundMap;
352     SmallVector<Value> boundOperands;
353     getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
354     Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
355     // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
356     // store the result to `dynamicTensorSizes`.
357     // TODO: instead of using the lower bound of `forOp` directly, implement a
358     // lower bound computation similar to the upper bound computation.
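    // For instance (illustrative numbers): with a lower bound of 0, an upper
    // bound `ubVal` of 10, and a step of 4, the packed dimension gets size
    // ceil((10 - 0) / 4) = 3.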
359     AffineExpr lb, ub, step;
360     bindDims(b.getContext(), lb, ub);
361     bindSymbols(b.getContext(), step);
362     Value res = b.createOrFold<AffineApplyOp>(
363         (ub - lb).ceilDiv(step), ValueRange{forOp.getLowerBound(), ubVal,
364                                             cast<scf::ForOp>(forOp).getStep()});
365     dynamicTensorSizes.push_back(res);
366   }
367 
368   return dynamicTensorSizes;
369 }
370 
371 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
372   return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
373 }
374 
375 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
376 /// The returned Value is guaranteed not to depend on any loop comprised in
377 /// [`outer`, `forOp`].
378 /// Return null if such a loop-independent quantity cannot be computed.
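/// For example (illustrative): a loop with lower bound 2 and step 3 yields
/// iteration numbers 0, 1, 2, ... for iv = 2, 5, 8, ...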
379 static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
380                                      scf::ForOp forOp) {
381   MLIRContext *ctx = forOp->getContext();
382   AffineExpr iv, lb, step;
383   bindDims(ctx, iv, lb);
384   bindSymbols(ctx, step);
385   if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
386       !isDefinedOutsideOrConstant(outer, forOp.getStep()))
387     return Value();
388   Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
389         stepVal = forOp.getStep();
390   auto loc = forOp->getLoc();
391   return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
392                                        ValueRange{ivVal, lbVal, stepVal});
393 }
394 
395 FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
396     tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
397     tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
398   LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
399                     << " loops\n");
400   HoistingAnalysis analysis(opToHoist, numLoops);
401   if (!analysis.isValid()) {
402     LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
403     return failure();
404   }
405 
406   scf::ForOp outer = analysis.outermostEnclosingForOp;
407   ImplicitLocOpBuilder b(outer->getLoc(), outer);
408 
409   SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);
410 
  // The actual number of packing loops may be smaller than `numLoops`.
412   int nPackedLoops = analysis.packingLoops.size();
413 
414   Location loc = opToHoist->getLoc();
415   RankedTensorType paddedTensorType = opToHoist.getResultType();
416   int paddedRank = paddedTensorType.getRank();
417 
418   // Compute the type of the transposed padded tensor.
419   FailureOr<RankedTensorType> transposedTensorType =
420       computeTransposedType(paddedTensorType, transposeVector);
421   if (failed(transposedTensorType))
422     return failure();
423 
424   // Create the packed tensor<?x?x..?xtransposedShape> into which we amortize
425   // padding.
426   SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
427   // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
428   // tensor.
429   llvm::append_range(packedShape, transposedTensorType->getShape());
430   auto packedTensorType = RankedTensorType::get(
431       packedShape, transposedTensorType->getElementType());
432   Value packedTensor = b.create<linalg::InitTensorOp>(
433       loc, dynamicTensorSizes, packedTensorType.getShape(),
434       packedTensorType.getElementType());
435 
436   // Clone the operations involved in the backward slice, iteratively stepping
437   // into the loops that we encounter.
438   // The implementation proceeds in a stack-like fashion:
439   //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
440   //      deeper in the stack.
441   //   2. Create a GenericOp if `transposeVector` is non-empty.
  //   3. Create an InsertSliceOp at the top of the stack.
443   //   4. Iteratively pop and yield the result of the InsertSliceOp across
444   //      the cloned loops.
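  // Roughly, for a single packing loop, steps 1. to 4. produce (an
  // illustrative sketch, not actual output):
  // ```
  // %packed = scf.for %i = %lb to %ub step %s iter_args(%p = %packedTensor)
  //   ... cloned index computations and pad ...          // step 1.
  //   %t = linalg.generic ...                            // step 2. (optional)
  //   %ins = tensor.insert_slice %t into %p [...]        // step 3.
  //   scf.yield %ins                                     // step 4.
  // ```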
445   SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
446   clonedLoopIvs.reserve(nPackedLoops);
447   leadingPackedTensorIndexings.reserve(nPackedLoops);
448   BlockAndValueMapping bvm;
449   // Stack step 1. iteratively clone loops and push `packedTensor`.
450   for (Operation *op : analysis.backwardSlice) {
    // Specifically skip the extract_slice(packedTensor) case: this is the
    // piece we seek to replace.
453     if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
454       if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
455         continue;
    // Clone the operation unless it is a loop.
457     auto forOp = dyn_cast<scf::ForOp>(op);
458     if (!forOp) {
459       b.clone(*op, bvm);
460       continue;
461     }
462     // Create a packing loop that takes `packedTensor` as iteration argument.
463     auto clonedForOp = b.create<scf::ForOp>(
464         loc, bvm.lookupOrDefault(forOp.getLowerBound()),
465         bvm.lookupOrDefault(forOp.getUpperBound()),
466         bvm.lookupOrDefault(forOp.getStep()), packedTensor);
467     // Map the induction var, region args and results to the `clonedForOp`.
468     bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
469     bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
470     bvm.map(forOp.getResults(), clonedForOp.getResults());
471     assert(clonedForOp->getNumRegions() == 1);
472     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
473 
474     b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
475     Value loopIndependentIterationCount =
476         buildLoopIterationCount(b, outer, clonedForOp);
477     // Assert the loop-independent iteration count can be computed.
478     if (!loopIndependentIterationCount)
479       llvm_unreachable("loop independence prerequisite not met");
480     leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
481     packedTensor = clonedForOp.getRegionIterArgs().front();
482   }
483 
  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
485   SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
486                                     leadingPackedTensorIndexings.end());
487   offsets.append(paddedRank, b.getIndexAttr(0));
488   // sizes = [1 .. 1, transposedShape].
489   SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
490   for (int64_t sz : transposedTensorType->getShape()) {
    // TODO: go grab dims when necessary; for now tensor::PadOp returns a
    // static tensor.
492     assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
493     sizes.push_back(b.getIndexAttr(sz));
494   }
495   // strides = [1 .. 1].
496   SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
497                                     b.getIndexAttr(1));
498 
499   // Stack step 2. create GenericOp if `transposeVector` is non-empty.
500   Value paddedTensor = bvm.lookup(opToHoist.result());
501   if (!transposeVector.empty()) {
502     Value outputTensor = b.create<tensor::ExtractSliceOp>(
503         loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
504     transposeOps.push_back(
505         makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
506     paddedTensor = transposeOps.back()->getResult(0);
507   }
508 
509   // Stack step 3. create InsertSliceOp at the top of the stack.
510   Value inserted = b.create<tensor::InsertSliceOp>(
511       loc, paddedTensor, packedTensor, offsets, sizes, strides);
512 
513   // Stack step 4. iteratively pop the stack and propagate the yield.
514   Value valueToYield = inserted;
515   for (Value iv : llvm::reverse(clonedLoopIvs)) {
516     auto forOp = scf::getForInductionVarOwner(iv);
517     b.setInsertionPointToEnd(&forOp.getRegion().front());
518     b.create<scf::YieldOp>(loc, valueToYield);
519     valueToYield = forOp.getResult(0);
520   }
521 
522   // Now the packed tensor is ready, replace the original padding op by a
523   // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
524   b.setInsertionPoint(opToHoist);
525   SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
526       llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
527         return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
528       }));
529   // Assert all loop iteration counts can be computed.
530   if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
531     llvm_unreachable("loop independence prerequisite not met");
532   // offsets = [originalLoopIvs, 0 .. 0].
533   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
534   offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above).
537   packedTensor =
538       scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
539   Value newResult = b.create<tensor::ExtractSliceOp>(
540       loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
541 
542   // Transpose the packed tensor back to the original storage order.
543   if (!transposeVector.empty()) {
544     Value initTensor =
545         b.create<InitTensorOp>(loc, ValueRange{}, paddedTensorType.getShape(),
546                                paddedTensorType.getElementType());
547     transposeOps.push_back(
548         makeTransposeOp(b, loc, newResult, initTensor, transposeVector));
549     newResult = transposeOps.back()->getResult(0);
550   }
551 
552   // Make the newly cloned `opToHoist` available to the caller.
553   hoistedOp =
554       cast<tensor::PadOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
555   return newResult;
556 }
557