1 //===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements functions concerned with hoisting padding operations.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
14 #include "mlir/Analysis/SliceAnalysis.h"
15 #include "mlir/Dialect/Func/IR/FuncOps.h"
16 #include "mlir/Dialect/Linalg/IR/Linalg.h"
17 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
18 #include "mlir/Dialect/SCF/SCF.h"
19 #include "mlir/Dialect/SCF/Utils/Utils.h"
20 #include "mlir/Dialect/Tensor/IR/Tensor.h"
21 #include "mlir/Dialect/Utils/IndexingUtils.h"
22 #include "mlir/Dialect/Vector/IR/VectorOps.h"
23 #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
24 #include "mlir/IR/AsmState.h"
25 #include "mlir/IR/BuiltinOps.h"
26 #include "mlir/IR/Dominance.h"
27 #include "mlir/IR/Matchers.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/Support/Debug.h"
30 
31 using llvm::dbgs;
32 
33 #define DEBUG_TYPE "hoist-padding"
34 
35 #define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
36 
37 using namespace mlir;
38 using namespace mlir::linalg;
39 
40 /// Analysis class to support tensor::PadOp hoisting across multiple enclosing
41 /// loops. The failure conditions are:
42 ///   1. Pad op has a use that is not an input of a LinalgOp.
43 ///   2. Pad op does not have a constant padding value.
44 ///   3. There is no immediately enclosing scf::ForOp.
45 ///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non-index-type operands, a region, or a
47 ///      memory effect.
48 ///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
49 ///      empty.
///   6. The source tensor of the pad op is not defined by an extract slice op.
51 ///   7. The source tensor of the extract slice op is not defined outside of
52 ///      the outermost enclosing scf::ForOp.
53 ///   8. There is no enclosing scf::ForOp that indexes the padded data.
54 /// Other cases succeed and will trigger hoisting of the pad op.
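///
/// A rough sketch of a successful hoisting by one loop (pseudo-IR; types and
/// exact index computations are omitted, names are illustrative only):
/// ```
/// scf.for %i
///   %slice = tensor.extract_slice %source [%i]
///   %padded_slice = tensor.pad %slice
///   ... // %padded_slice consumed as a LinalgOp input
/// ```
/// becomes, roughly:
/// ```
/// %packed = scf.for %i iter_args(%packedTensor)
///   %slice = tensor.extract_slice %source [%i]
///   %padded_slice = tensor.pad %slice
///   %inserted = tensor.insert_slice %padded_slice into %packedTensor
///   scf.yield %inserted
/// scf.for %i
///   %padded_slice = tensor.extract_slice %packed [...]
///   ... // %padded_slice consumed as a LinalgOp input
/// ```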
55 struct HoistingAnalysis {
56   HoistingAnalysis(tensor::PadOp padOp, int numLoops);
57 
58   bool isValid() { return valid; }
59 
60   /// Footprint of the packedTensor, computed from the packingLoops.
61   SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);
62 
  /// The outermost loop, determined by `numLoops`, above which `padOp` will
  /// be hoisted.
65   scf::ForOp outermostEnclosingForOp;
66 
67   /// Backward slice rooted at `padOp` and nested under
68   /// `outermostEnclosingForOp`.
69   SetVector<Operation *> backwardSlice;
70 
  /// The scf::ForOps enclosing `padOp` such that:
  ///  1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///  2. their induction variables are used, directly or indirectly, in the
  ///     computation of `padOp`.
75   /// The span of these loops determines the footprint of the packed tensor.
76   SmallVector<scf::ForOp> packingLoops;
77 
78 private:
79   /// Drop any non-index dependencies of `padOp` and `sliceOp` from
80   /// `backwardSlice`. The method follows the use-def chains of the index
81   /// operands consumed by `padOp` and `sliceOp` and drops the operations
82   /// not part of this index computation. Afterwards, the filtered
83   /// `backwardSlice` contains only the loops whose induction variable is used,
84   /// directly or indirectly, to index the padded tensor. The method returns
85   /// failure if the filtered backward slice contains an unexpected operation.
86   ///
87   /// Example:
88   /// ```
89   /// %source = linalg.fill(%cst, %arg0)
90   /// scf.for %i
91   ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
92   ///   scf.for %j (%arg2 = %unrelated)
93   ///     scf.for %k                             // not used to index %source!
94   ///       %ubi = affine.min #map(%i)
95   ///       %ubj = affine.min #map(%j)
96   ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
97   ///       %padded_slice = tensor.pad %slice
98   /// ```
99   /// dropNonIndexDependencies(%padded_slice, %slice)
100   /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
101   LogicalResult dropNonIndexDependencies(tensor::PadOp padOp,
102                                          tensor::ExtractSliceOp sliceOp);
103 
104   /// Encodes whether the analysis is valid and hoisting can proceed.
105   bool valid;
106 };
107 
/// Return true if all uses of `padOp` are input tensor operands of some
/// LinalgOp.
110 static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padOp) {
111   for (OpOperand &use : padOp.result().getUses()) {
112     auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
113     if (!linalgUser || !linalgUser.isInputTensor(&use)) {
114       LLVM_DEBUG(DBGS() << "Found a use of " << *(padOp)
115                         << "\nthat is not an input tensor of a LinalgOp, "
116                         << "cannot hoist\n"
117                         << *(use.getOwner()) << "\n");
118       return false;
119     }
120   }
121   return true;
122 }
123 
124 /// Return at most nLevels of immediately enclosing scf::ForOp loops.
125 /// Stops at the first parent that is not an scf::ForOp.
126 /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
127 /// Control-flow and other containing ops with regions are not modeled atm.
128 static void
129 getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
130                          SmallVector<scf::ForOp> &reverseEnclosingLoops) {
131   AsmState state(padOp->getParentOfType<func::FuncOp>());
132   (void)state;
133   scf::ForOp outermostEnclosingForOp = nullptr;
134   Operation *nextEnclosingOp = padOp->getParentOp();
135   while (nLevels-- > 0 &&
136          (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
137     LLVM_DEBUG(
138         DBGS() << "loops: ";
139         outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
140         dbgs() << "\n");
141     reverseEnclosingLoops.push_back(outermostEnclosingForOp);
142     nextEnclosingOp = outermostEnclosingForOp->getParentOp();
143   }
144 }
145 
/// Return the transposed `rankedTensorType` if `transposeVector` is non-empty.
/// Fail if `transposeVector` is not a permutation matching the tensor rank.
148 static FailureOr<RankedTensorType>
149 computeTransposedType(RankedTensorType rankedTensorType,
150                       ArrayRef<int64_t> transposeVector) {
151   if (transposeVector.empty())
152     return rankedTensorType;
153   if (!isPermutation(transposeVector) ||
154       transposeVector.size() != static_cast<size_t>(rankedTensorType.getRank()))
155     return failure();
156 
157   SmallVector<int64_t> transposedShape(rankedTensorType.getShape().begin(),
158                                        rankedTensorType.getShape().end());
159   applyPermutationToVector(transposedShape, transposeVector);
160 
161   using RTTBuilder = RankedTensorType::Builder;
162   RankedTensorType transposedTensorType =
163       RTTBuilder(rankedTensorType).setShape(transposedShape);
164   return transposedTensorType;
165 }
166 
167 HoistingAnalysis::HoistingAnalysis(tensor::PadOp padOp, int numLoops) {
168   valid = false;
169 
170   // Bail on any use that isn't an input of a LinalgOp.
171   // Hoisting of inplace updates happens after vectorization.
172   if (!isOnlyUsedAsInputOfLinalgOp(padOp))
173     return;
174 
175   // Get at most `numLoops` of immediately enclosing loops.
176   SmallVector<scf::ForOp> reverseEnclosingLoops;
177   getAtMostNEnclosingLoops(padOp, numLoops, reverseEnclosingLoops);
178   if (reverseEnclosingLoops.empty()) {
179     LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
180     return;
181   }
182 
183   outermostEnclosingForOp = reverseEnclosingLoops.back();
184 
  // Get the `sliceOp` that defines the source tensor of `padOp` and check
  // that its source is defined outside of the outermost loop. This check
187   // ensures the padded data is available for packing before entering the
188   // outermost enclosing loop.
189   //
190   // Example:
191   // ```
192   // %source = linalg.fill(%cst, %arg0)
193   // // %source is available for packing here!
194   // scf.for %i
195   //   scf.for %j
196   //     scf.for %k
197   //       %slice = tensor.extract_slice %source [%i, %j]
198   //       %padded_slice = tensor.pad %slice
199   // ```
200   auto sliceOp = padOp.source().getDefiningOp<tensor::ExtractSliceOp>();
201   if (!sliceOp) {
202     LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
203     return;
204   }
205   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
206     LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
207     return;
208   }
209 
  // Check that the region of `padOp` depends only on a constant. Adding
211   // hoisting support for arbitrary padding regions would require cloning all
212   // dependencies captured by the padding region.
213   Value paddingValue = padOp.getConstantPaddingValue();
214   if (!paddingValue ||
215       !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
216     LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
217     return;
218   }
219 
  // Get all the ops in the backward slice starting from `padOp` that are
  // dominated by the outermost enclosing loop.
222   DominanceInfo domInfo(outermostEnclosingForOp);
223   getBackwardSlice(padOp.getOperation(), &backwardSlice, [&](Operation *op) {
224     return domInfo.dominates(outermostEnclosingForOp, op);
225   });
226   if (backwardSlice.empty())
227     return;
228   // Add `padOp` itself to the backward slice.
229   backwardSlice.insert(padOp.getOperation());
230 
231   // Remove all ops in the backward slice that are not used to index the padded
232   // tensor. In particular, keep `padOp`, `sliceOp`, and the loop and
233   // affine operations used for the index computation.
234   if (failed(dropNonIndexDependencies(padOp, sliceOp)))
235     return;
236 
237   // Add only the loops part of the filtered `backwardSlice` to the packing
238   // loops. All other loops are not used to index the padded data and
239   // consequently access the same data in every loop iteration. Adding them to
240   // the packing loops would increase the cache footprint of the packed data
241   // by storing the same data multiple times.
242   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
243     if (backwardSlice.contains(forOp))
244       packingLoops.push_back(forOp);
245   if (packingLoops.empty()) {
246     LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
247     return;
248   }
249 
250   // The analysis is valid and hoisting can occur.
251   valid = true;
252 }
253 
254 LogicalResult
255 HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padOp,
256                                            tensor::ExtractSliceOp sliceOp) {
257   // Set of all values used for index computation.
258   SetVector<Value> indexEdges;
259 
260   // Add all index operands of `operation` to `indexEdges`. An index operand is
261   // an operand of type index.
262   auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
263     for (Value operand : operation->getOperands())
264       if (operand.getType().isIndex())
265         indexEdges.insert(operand);
266   };
267 
268   // Check if any operation result is contained in `indexEdges`.
269   auto hasIndexResult = [&](Operation *operation) {
270     return llvm::any_of(operation->getResults(), [&](Value result) {
271       return indexEdges.contains(result);
272     });
273   };
274 
275   // Starting from `padOp` and `sliceOp` walk the use-def edges of index
276   // type in `backwardSlice`. Add the index operands of an operation to
277   // `indexEdges` and remove all operations from `backwardSlice` that are not
278   // part of the index computation.
279   //
280   // Example:
281   // ```
282   // %source = linalg.fill(%cst, %arg0)
283   // scf.for %i
284   //   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
285   //   scf.for %j (%arg2 = %unrelated)
286   //     scf.for %k                             // not used to index %source!
287   //       %ubi = affine.min #map(%i)
288   //       %ubj = affine.min #map(%j)
289   //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
290   //       %padded_slice = tensor.pad %slice
291   // ```
292   // After iterating `backwardSlice` we obtain:
293   // indexEdges = [%i, %j, %ubi, %ubj]
294   // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
295   SetVector<Operation *> operationsToRemove;
296   for (Operation *op : llvm::reverse(backwardSlice)) {
297     // Add the index operands of `padOp` and `sliceOp` to start the
298     // exploration of the index computation.
299     if (op == padOp || op == sliceOp) {
300       addIndexOperandsToIndexEdges(op);
301       continue;
302     }
303     // Add the index operands of the loop if its induction variable is
304     // used for index computation.
305     if (auto forOp = dyn_cast<scf::ForOp>(op)) {
306       if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
307         addIndexOperandsToIndexEdges(op);
308         continue;
309       }
310     }
311     // Add the index operands of all other operations if at least one result is
312     // used for index computation.
313     if (hasIndexResult(op)) {
314       addIndexOperandsToIndexEdges(op);
      // Check that all operands of the remaining operations have index type.
316       if (llvm::any_of(op->getOperandTypes(),
317                        [](Type type) { return !type.isIndex(); })) {
318         LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
319                           << op << " -> skip\n");
320         return failure();
321       }
322       // Check the remaining operations do not have regions or memory effects.
323       auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
324       bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
325       if (hasMemoryEffect || op->getNumRegions() != 0) {
326         LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
327                           << op << " -> skip\n");
328         return failure();
329       }
330       continue;
331     }
    // Remove all other operations not used by the index computation. The only
    // exceptions are constant operations that may be used by `padOp`.
334     if (!isa<arith::ConstantOp>(op))
335       operationsToRemove.insert(op);
336   }
337   backwardSlice.set_subtract(operationsToRemove);
338   return success();
339 }
340 
341 SmallVector<Value>
342 HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
343   SmallVector<Value> dynamicTensorSizes;
344 
345   // Upper bound the packing loop lengths to size the packed tensor. Taking
346   // upper bounds can make the sizes of the packed tensor independent of the
347   // enclosing loops. This independence is a prerequisite for reusing the same
348   // buffer for all enclosing loop iterations and hoisting its allocation out of
349   // the enclosing loops.
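  //
  // For example (hypothetical values), a packing loop
  //   `scf.for %i = %c0 to %ub step %c4` with `%ub` bounded above by 12
  // contributes a packed tensor dimension of size ceilDiv(12 - 0, 4) = 3.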
350   for (auto forOp : packingLoops) {
351     // Compute an upper bound `ubVal` for the upper bound of `forOp`.
352     AffineMap boundMap;
353     SmallVector<Value> boundOperands;
354     getUpperBoundForIndex(forOp.getUpperBound(), boundMap, boundOperands);
355     Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
356     // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
357     // store the result to `dynamicTensorSizes`.
358     // TODO: instead of using the lower bound of `forOp` directly, implement a
359     // lower bound computation similar to the upper bound computation.
360     AffineExpr lb, ub, step;
361     bindDims(b.getContext(), lb, ub);
362     bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{forOp.getLowerBound(), ubVal, forOp.getStep()});
366     dynamicTensorSizes.push_back(res);
367   }
368 
369   return dynamicTensorSizes;
370 }
371 
372 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
373   return outer.isDefinedOutsideOfLoop(v) || matchPattern(v, m_Constant());
374 }
375 
376 /// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in
378 /// [`outer`, `forOp`].
379 /// Return null if such a loop-independent quantity cannot be computed.
380 static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
381                                      scf::ForOp forOp) {
382   MLIRContext *ctx = forOp->getContext();
383   AffineExpr iv, lb, step;
384   bindDims(ctx, iv, lb);
385   bindSymbols(ctx, step);
386   if (!isDefinedOutsideOrConstant(outer, forOp.getLowerBound()) ||
387       !isDefinedOutsideOrConstant(outer, forOp.getStep()))
388     return Value();
389   Value ivVal = forOp.getInductionVar(), lbVal = forOp.getLowerBound(),
390         stepVal = forOp.getStep();
391   auto loc = forOp->getLoc();
392   return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
393                                        ValueRange{ivVal, lbVal, stepVal});
394 }
395 
396 FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
397     tensor::PadOp opToHoist, int numLoops, ArrayRef<int64_t> transposeVector,
398     tensor::PadOp &hoistedOp, SmallVectorImpl<GenericOp> &transposeOps) {
399   LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
400                     << " loops\n");
401   HoistingAnalysis analysis(opToHoist, numLoops);
402   if (!analysis.isValid()) {
403     LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
404     return failure();
405   }
406 
407   scf::ForOp outer = analysis.outermostEnclosingForOp;
408   ImplicitLocOpBuilder b(outer->getLoc(), outer);
409 
410   SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);
411 
  // The actual number of packing loops may be smaller than `numLoops`.
413   int nPackedLoops = analysis.packingLoops.size();
414 
415   Location loc = opToHoist->getLoc();
416   RankedTensorType paddedTensorType = opToHoist.getResultType();
417   int paddedRank = paddedTensorType.getRank();
418 
419   // Compute the type of the transposed padded tensor.
420   FailureOr<RankedTensorType> transposedTensorType =
421       computeTransposedType(paddedTensorType, transposeVector);
422   if (failed(transposedTensorType))
423     return failure();
424 
425   // Create the packed tensor<?x?x..?xtransposedShape> into which we amortize
426   // padding.
427   SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
428   // TODO: go grab dims when necessary, for now tensor::PadOp returns a static
429   // tensor.
430   llvm::append_range(packedShape, transposedTensorType->getShape());
431   auto packedTensorType = RankedTensorType::get(
432       packedShape, transposedTensorType->getElementType());
433   Value packedTensor = b.create<linalg::InitTensorOp>(
434       loc, dynamicTensorSizes, packedTensorType.getShape(),
435       packedTensorType.getElementType());
436 
437   // Clone the operations involved in the backward slice, iteratively stepping
438   // into the loops that we encounter.
439   // The implementation proceeds in a stack-like fashion:
440   //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
441   //      deeper in the stack.
442   //   2. Create a GenericOp if `transposeVector` is non-empty.
  //   3. Create an InsertSliceOp at the top of the stack.
444   //   4. Iteratively pop and yield the result of the InsertSliceOp across
445   //      the cloned loops.
446   SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
447   clonedLoopIvs.reserve(nPackedLoops);
448   leadingPackedTensorIndexings.reserve(nPackedLoops);
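  // Mapping from the values of the original backward slice to their clones.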
449   BlockAndValueMapping bvm;
450   // Stack step 1. iteratively clone loops and push `packedTensor`.
451   for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(packedTensor) case: this is
    // the piece we seek to replace.
454     if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
455       if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
456         continue;
    // Clone all operations except loops, which are handled specially below.
458     auto forOp = dyn_cast<scf::ForOp>(op);
459     if (!forOp) {
460       b.clone(*op, bvm);
461       continue;
462     }
463     // Create a packing loop that takes `packedTensor` as iteration argument.
464     auto clonedForOp = b.create<scf::ForOp>(
465         loc, bvm.lookupOrDefault(forOp.getLowerBound()),
466         bvm.lookupOrDefault(forOp.getUpperBound()),
467         bvm.lookupOrDefault(forOp.getStep()), packedTensor);
    // Map the induction variable, region args, and results of `forOp` to
    // those of `clonedForOp`.
469     bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
470     bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
471     bvm.map(forOp.getResults(), clonedForOp.getResults());
472     assert(clonedForOp->getNumRegions() == 1);
473     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
474 
475     b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
476     Value loopIndependentIterationCount =
477         buildLoopIterationCount(b, outer, clonedForOp);
478     // Assert the loop-independent iteration count can be computed.
479     if (!loopIndependentIterationCount)
480       llvm_unreachable("loop independence prerequisite not met");
481     leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
482     packedTensor = clonedForOp.getRegionIterArgs().front();
483   }
484 
  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
486   SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
487                                     leadingPackedTensorIndexings.end());
488   offsets.append(paddedRank, b.getIndexAttr(0));
489   // sizes = [1 .. 1, transposedShape].
490   SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
491   for (int64_t sz : transposedTensorType->getShape()) {
    // TODO: go grab dims when necessary, for now tensor::PadOp returns a
    // static tensor.
493     assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
494     sizes.push_back(b.getIndexAttr(sz));
495   }
496   // strides = [1 .. 1].
497   SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
498                                     b.getIndexAttr(1));
499 
500   // Stack step 2. create GenericOp if `transposeVector` is non-empty.
501   Value paddedTensor = bvm.lookup(opToHoist.result());
502   if (!transposeVector.empty()) {
503     Value outputTensor = b.create<tensor::ExtractSliceOp>(
504         loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
505     transposeOps.push_back(
506         makeTransposeOp(b, loc, paddedTensor, outputTensor, transposeVector));
507     paddedTensor = transposeOps.back()->getResult(0);
508   }
509 
510   // Stack step 3. create InsertSliceOp at the top of the stack.
511   Value inserted = b.create<tensor::InsertSliceOp>(
512       loc, paddedTensor, packedTensor, offsets, sizes, strides);
513 
514   // Stack step 4. iteratively pop the stack and propagate the yield.
515   Value valueToYield = inserted;
516   for (Value iv : llvm::reverse(clonedLoopIvs)) {
517     auto forOp = scf::getForInductionVarOwner(iv);
518     b.setInsertionPointToEnd(&forOp.getRegion().front());
519     b.create<scf::YieldOp>(loc, valueToYield);
520     valueToYield = forOp.getResult(0);
521   }
522 
523   // Now the packed tensor is ready, replace the original padding op by a
524   // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
525   b.setInsertionPoint(opToHoist);
526   SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
527       llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
528         return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
529       }));
530   // Assert all loop iteration counts can be computed.
531   if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
532     llvm_unreachable("loop independence prerequisite not met");
  // offsets = [loopIterationCounts, 0 .. 0].
534   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
535   offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, transposedShape] (defined above).
  // strides = [1 .. 1] (defined above).
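  // The outermost cloned packing loop (the owner of the first cloned IV)
  // yields the fully packed tensor once all packing iterations have run.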
538   packedTensor =
539       scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
540   Value newResult = b.create<tensor::ExtractSliceOp>(
541       loc, *transposedTensorType, packedTensor, offsets, sizes, strides);
542 
543   // Transpose the packed tensor back to the original storage order.
544   if (!transposeVector.empty()) {
545     Value initTensor =
546         b.create<InitTensorOp>(loc, ValueRange{}, paddedTensorType.getShape(),
547                                paddedTensorType.getElementType());
548     transposeOps.push_back(
549         makeTransposeOp(b, loc, newResult, initTensor, transposeVector));
550     newResult = transposeOps.back()->getResult(0);
551   }
552 
553   // Make the newly cloned `opToHoist` available to the caller.
554   hoistedOp =
555       cast<tensor::PadOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
556   return newResult;
557 }
558