1 //===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements functions concerned with hoisting padding operations.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
14 #include "mlir/Analysis/AffineStructures.h"
15 #include "mlir/Analysis/SliceAnalysis.h"
16 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
17 #include "mlir/Dialect/Affine/Utils.h"
18 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
19 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
20 #include "mlir/Dialect/SCF/SCF.h"
21 #include "mlir/Dialect/SCF/Utils.h"
22 #include "mlir/Dialect/StandardOps/IR/Ops.h"
23 #include "mlir/Dialect/Tensor/IR/Tensor.h"
24 #include "mlir/Dialect/Vector/VectorOps.h"
25 #include "mlir/Dialect/Vector/VectorUtils.h"
26 #include "mlir/IR/AsmState.h"
27 #include "mlir/IR/BuiltinOps.h"
28 #include "mlir/IR/Dominance.h"
29 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
30 #include "mlir/Transforms/LoopUtils.h"
31 #include "llvm/ADT/StringRef.h"
32 #include "llvm/Support/Debug.h"
33 
34 using llvm::dbgs;
35 
36 #define DEBUG_TYPE "hoist-padding"
37 
38 #define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
39 
40 using namespace mlir;
41 using namespace mlir::linalg;
42 
/// Analysis class to support PadTensorOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. The pad op has a use that is not an input of a LinalgOp.
///   2. There is no immediately enclosing scf::ForOp.
///   3. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with a region.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   5. The source tensor of the pad op is not defined by an extract slice op.
///   6. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   7. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op, as illustrated
/// by the example below.
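///
/// A minimal configuration that passes the analysis, in the abbreviated IR
/// notation used by the examples below (a sketch for illustration only;
/// types, bounds and the pad region are elided):
/// ```
/// %source = linalg.fill(%cst, %arg0)
/// // The slice source is defined above the loop.
/// scf.for %i
///   %sz = affine.min #map(%i)
///   %slice = tensor.extract_slice %source [%i, 0] [%sz, 8]
///   %padded = linalg.pad_tensor %slice
///   // The only use of %padded is as an input tensor of a LinalgOp.
///   linalg.matmul ins(%padded, %b) outs(%c)
/// ```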
56 struct HoistingAnalysis {
57   HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);
58 
59   bool isValid() { return valid; }
60 
61   /// Footprint of the packedTensor, computed from the packingLoops and
62   /// `backwardSlice`.
63   FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);
64 
65   /// The padTensorOp that needs to be hoisted.
66   PadTensorOp padTensorOp;
67 
68   /// The maximum number of immediately enclosing scf::ForOp to hoist over.
69   int nLevels;
70 
  /// The outermost loop, determined by `nLevels`, above which `padTensorOp`
  /// will be hoisted.
73   scf::ForOp outermostEnclosingForOp;
74 
75   /// Backward slice rooted at `padTensorOp` and nested under
76   /// `outermostEnclosingForOp`.
77   SetVector<Operation *> backwardSlice;
78 
  /// The scf::ForOp loops immediately enclosing `padTensorOp` such that:
  ///  1. they are nested under `outermostEnclosingForOp` (inclusive);
  ///  2. their induction variables are used, directly or indirectly, in the
  ///     computation of `padTensorOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  /// Note: conceptually a SmallSetVector<scf::ForOp>.
85   SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>>
86       packingLoops;
87 
88 private:
89   /// Returns the loops in `backwardSlice` used to index the padded data. The
90   /// method starts from `padTensorOp` and `sliceOp`, follows the use-def
91   /// chains of their index operands, and stores any enclosing loop whose
92   /// induction variable is part of the walked index computation.
93   ///
94   /// Example:
95   /// ```
96   /// %source = linalg.fill(%cst, %arg0)
97   /// scf.for %i
98   ///   scf.for %j
99   ///     scf.for %k // not used to index %source!
100   ///       %ubi = affine.min #map(%i)
101   ///       %ubj = affine.min #map(%j)
102   ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
103   ///       %padded_slice = linalg.pad_tensor %slice
104   /// ```
105   /// getIndexingLoops(%padded_slice, %slice) returns [scf.for %i, scf.for %j]
106   SetVector<Operation *> getIndexingLoops(PadTensorOp padTensorOp,
107                                           tensor::ExtractSliceOp sliceOp);
108 
109   /// Encodes whether the analysis is valid and hoisting can proceed.
110   bool valid;
111 };
112 
/// Return true if every use of `padTensorOp` is an input tensor operand of
/// some LinalgOp.
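/// For example (abbreviated, illustrative IR), the following use is accepted:
/// ```
/// %padded = linalg.pad_tensor %slice
/// linalg.matmul ins(%padded, %b) outs(%c)
/// ```
/// whereas any use of %padded by a non-Linalg op, or as an `outs` operand,
/// makes this function return false.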
115 static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) {
116   for (OpOperand &use : padTensorOp.result().getUses()) {
117     auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
118     if (!linalgUser || !linalgUser.isInputTensor(&use)) {
119       LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp)
120                         << "\nthat is not an input tensor of a LinalgOp, "
121                         << "cannot hoist\n"
122                         << *(use.getOwner()) << "\n");
123       return false;
124     }
125   }
126   return true;
127 }
128 
/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loop ops such as scf.parallel or linalg.tiled_loop are not modeled at
/// the moment; neither are control-flow or other containing ops with regions.
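/// Example (illustrative): with nLevels = 2 and
/// ```
/// scf.for %i
///   scf.for %j
///     %padded = linalg.pad_tensor %slice
/// ```
/// `reverseEnclosingLoops` is filled with [scf.for %j, scf.for %i], i.e. from
/// innermost to outermost.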
133 static void
134 getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels,
135                          SmallVector<scf::ForOp> &reverseEnclosingLoops) {
136   AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
137   (void)state;
138   scf::ForOp outermostEnclosingForOp = nullptr;
139   Operation *nextEnclosingOp = padTensorOp->getParentOp();
140   while (nLevels-- > 0 &&
141          (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
142     LLVM_DEBUG(
143         DBGS() << "loops: ";
144         outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
145         dbgs() << "\n");
146     reverseEnclosingLoops.push_back(outermostEnclosingForOp);
147     nextEnclosingOp = outermostEnclosingForOp->getParentOp();
148   }
149 }
150 
151 HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
152     : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) {
153   AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
154   (void)state;
155 
  // Bail on any use that isn't an input tensor of a LinalgOp.
  // Hoisting of in-place updates happens after vectorization.
158   if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp))
159     return;
160 
161   // Get at most nLevels of immediately enclosing loops.
162   SmallVector<scf::ForOp> reverseEnclosingLoops;
163   getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops);
164   if (reverseEnclosingLoops.empty()) {
165     LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
166     return;
167   }
168 
169   outermostEnclosingForOp = reverseEnclosingLoops.back();
170 
  // Get all the ops in the backward slice rooted at `padTensorOp` that are
  // dominated by the outermost enclosing loop.
  // Bail on any op with a region that is neither an scf::ForOp nor a LinalgOp.
174   bool analysisFailure = false;
175   DominanceInfo domInfo(outermostEnclosingForOp);
176   getBackwardSlice(
177       padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) {
178         if (!domInfo.dominates(outermostEnclosingForOp, op))
179           return false;
180         if (op != padTensorOp && op->getNumRegions() > 0 &&
181             !isa<scf::ForOp, LinalgOp>(op)) {
182           analysisFailure = true;
183           LLVM_DEBUG(DBGS()
184                      << "Unsupported op with region: " << *op << " -> skip\n");
185           return false;
186         }
187         return true;
188       });
189 
190   if (analysisFailure || backwardSlice.empty())
191     return;
192 
193   // Get the `sliceOp` that defines the source tensor of `padTensorOp` and
194   // check its source is defined outside of the outermost loop. This check
195   // ensures the padded data is available for packing before entering the
196   // outermost enclosing loop.
197   //
198   // Example:
199   // ```
200   // %source = linalg.fill(%cst, %arg0)
201   // // %source is available for packing here!
202   // scf.for %i
203   //   scf.for %j
204   //     scf.for %k
205   //       %slice = tensor.extract_slice %source [%i, %j]
206   //       %padded_slice = linalg.pad_tensor %slice
207   // ```
208   auto sliceOp = padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
209   if (!sliceOp) {
210     LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
211     return;
212   }
213   if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
214     LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
215     return;
216   }
217 
  // Determine the loops in `backwardSlice` that are used to index the padded
  // data.
219   SetVector<Operation *> indexingLoops = getIndexingLoops(padTensorOp, sliceOp);
220 
  // Add only the loops that are part of `indexingLoops` to the packing loops.
  // All other loops are not used to index the padded data and consequently
  // access the same data in every loop iteration. Adding them to the packing
  // loops would increase the cache footprint of the packed data by storing the
  // same data multiple times.
226   for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
227     if (indexingLoops.contains(forOp))
228       packingLoops.insert(forOp);
229   }
  assert(indexingLoops.size() == packingLoops.size() &&
         "expected all indexing loops to be enclosing loops");
232   if (packingLoops.empty()) {
233     LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
234     return;
235   }
236 
237   // The analysis is valid and hoisting can occur.
238   valid = true;
239 }
240 
241 /// Add all index operands of `operation` to `indexEdges`. An index operand is
242 /// an operand of type index.
243 static void addIndexOperandsToIndexEdges(Operation *operation,
244                                          SetVector<Value> &indexEdges) {
245   for (Value operand : operation->getOperands())
246     if (operand.getType().isIndex())
247       indexEdges.insert(operand);
248 }
249 
250 SetVector<Operation *>
251 HoistingAnalysis::getIndexingLoops(PadTensorOp padTensorOp,
252                                    tensor::ExtractSliceOp sliceOp) {
253   // Set of all values used for index computation.
254   SetVector<Value> indexEdges;
255 
  // Starting from `padTensorOp` and `sliceOp`, walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` if one of its results is an index edge found so far, and
  // store all loops that are part of the index computation in `indexingLoops`.
260   //
261   // Example:
262   // ```
263   // %source = linalg.fill(%cst, %arg0)
264   // scf.for %i
265   //   scf.for %j
266   //     scf.for %k // not used to index %source!
267   //       %ubi = affine.min #map(%i)
268   //       %ubj = affine.min #map(%j)
269   //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
270   //       %padded_slice = linalg.pad_tensor %slice
271   // ```
272   // After iterating `backwardSlice` we obtain:
273   // indexEdges = [%i, %j, %ubi, %ubj]
274   // indexingLoops = [scf.for %i, scf.for %j]
275   SetVector<Operation *> indexingLoops;
276   for (Operation *op : llvm::reverse(backwardSlice)) {
277     // Add the index operands of `padTensorOp` and `sliceOp` to start the
278     // exploration of the index computation.
279     if (op == padTensorOp || op == sliceOp) {
280       addIndexOperandsToIndexEdges(op, indexEdges);
281       continue;
282     }
    // Add the index operands of the loop if its induction variable is used for
    // index computation. Additionally, insert the loop into `indexingLoops`.
286     if (auto forOp = dyn_cast<scf::ForOp>(op)) {
287       if (indexEdges.contains(forOp.getInductionVar())) {
288         addIndexOperandsToIndexEdges(op, indexEdges);
289         indexingLoops.insert(forOp);
290         continue;
291       }
292     }
293     // Add the index operands of all other operations if at least one result is
294     // used for index computation.
295     if (llvm::any_of(op->getResults(),
296                      [&](Value result) { return indexEdges.contains(result); }))
297       addIndexOperandsToIndexEdges(op, indexEdges);
298   }
299   return indexingLoops;
300 }
301 
302 static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
303   return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
304 }
305 
/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound, up to the `outerLimit` loop, and fold them as new
/// inequalities into the constraint set.
/// This is achieved by computing the backward slice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their results are projected out of the
/// constraint set.
/// The following operations are supported:
///   - scf::ForOp ops are simply skipped.
///   - AffineApplyOp ops are composed so that their result is replaced by an
///     equality.
///   - AffineMinOp ops are composed by adding each entry as an upper bound.
/// Additionally, the following terminal operations are handled:
///   - DimOp and ConstantOp are skipped.
/// If any other operation is met, return failure.
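///
/// For instance (an illustrative map), folding the upper bound %ub of the %k
/// loop below adds the inequalities `%ub <= 40` and `%ub <= 121 - %i` to the
/// constraint set before %ub is projected out:
/// ```
/// %ub = affine.min affine_map<(d0) -> (40, -d0 + 121)>(%i)
/// scf.for %k = %c0 to %ub step %c8
/// ```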
321 // TODO: extend on a per-need basis.
322 static LogicalResult
323 foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
324                                   scf::ForOp outerLimit,
325                                   ArrayRef<scf::ForOp> loops) {
326   SetVector<Value> toProjectOut;
327   for (scf::ForOp loop : loops) {
328     auto ub = loop.upperBound();
329 
330     // Set of all values used for index computation.
331     SetVector<Value> indexEdges;
332     indexEdges.insert(ub);
333 
334     // Compute the backward slice `indexSlice` containing the index computation
335     // performed to obtain the upper bound `ub`. Starting from `ub` add the
336     // index operands of an operation to `indexEdges` if one of its results is
337     // an index edge. Otherwise, stop the slice computation. For a loop, check
338     // if its induction variable is an index edge.
339     //
340     // Example:
341     // ```
342     // %c0 = arith.constant 0
343     // scf.for %i = %c0 to ...
344     //   scf.for %j = %c0 to ...
345     //     %ub = affine.min #map(%i)
346     //     scf.for %k = %c0 to %ub
347     // ```
348     // After computing the backward slice we obtain:
349     // indexEdges = [%ub, %i, %c0]
350     // indexSlice = [arith.constant 0, scf.for %i, affine.min #map(%i)]
351     SetVector<Operation *> indexSlice;
352     getBackwardSlice(ub, &indexSlice, [&](Operation *op) {
353       // Continue only along the index operands of the ForOp.
354       if (auto forOp = dyn_cast<scf::ForOp>(op)) {
        // Consider only loops nested under `outerLimit`.
356         if (!outerLimit->isAncestor(op))
357           return false;
358         if (!indexEdges.contains(forOp.getInductionVar()))
359           return false;
360         addIndexOperandsToIndexEdges(op, indexEdges);
361         return true;
362       }
363       // All supported index operations have one result.
364       assert(op->getNumResults() == 1 &&
365              "expect operations to have one result");
366       if (!indexEdges.contains(op->getResult(0)))
367         return false;
368       addIndexOperandsToIndexEdges(op, indexEdges);
369       return true;
370     });
371     indexSlice.insert(ub.getDefiningOp());
372 
373     // Iterate over all ops in the slice and compose them in the constraints.
374     for (Operation *op : llvm::reverse(indexSlice)) {
375       // All ForOps have previously been added to the constraints and ConstantOp
376       // and DimOp are terminals of the index computation.
377       if (isa<scf::ForOp, arith::ConstantOp, tensor::DimOp>(op))
378         continue;
379       // Check all index computation operations are supported.
380       if (!isa<AffineApplyOp, AffineMinOp>(op))
381         return failure();
382       // Ensure there is an id.
383       auto ensureIdFailed = [&](Value v) {
384         if (constraints.containsId(v)) {
385           unsigned pos;
386           constraints.findId(v, &pos);
387           return pos >= constraints.getNumDimIds();
388         }
389         constraints.appendDimId(v);
390         return false;
391       };
392 
393       // Ensure all ids exist and add results for later projection.
394       if (llvm::any_of(op->getResults(), ensureIdFailed) ||
395           llvm::any_of(op->getOperands(), ensureIdFailed))
396         return failure();
397 
398       // All supported ops have 1 result.
399       // TODO: extend when needed.
400       assert(op->getNumResults() == 1 &&
401              "expect operations to have one result");
402       toProjectOut.insert(op->getResult(0));
403 
404       // Compose supported ops.
405       if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
406         AffineValueMap avm(affineApplyOp.getAffineMap(),
407                            affineApplyOp.getOperands(),
408                            affineApplyOp.getResult());
409         if (failed(constraints.composeMap(&avm)))
410           return failure();
411         continue;
412       }
413       auto affineMinOp = cast<AffineMinOp>(op);
414       unsigned pos;
415       bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
416       (void)foundMinOp;
417       assert(foundMinOp);
418       AffineMap alignedMap = constraints.computeAlignedMap(
419           affineMinOp.getAffineMap(), affineMinOp.getOperands());
420       if (failed(
421               constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
422         return failure();
423     }
424   }
425   for (Value v : toProjectOut)
426     constraints.projectOut(v);
427   return success();
428 }
429 
// Footprint of the packedTensor, computed from the packingLoops and
// `backwardSlice`.
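//
// For instance (a simplified, fully static illustration), for packing loops
// ```
// scf.for %i = %c0 to %c64 step %c16
//   scf.for %j = %c0 to %c42 step %c8
// ```
// the computed sizes are [4, 6], each obtained as (ub - lb).ceilDiv(step):
// ceildiv(64 - 0, 16) = 4 and ceildiv(42 - 0, 8) = 6.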
432 FailureOr<SmallVector<Value>>
433 HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  // Create the base affine constraints for the packing loops.
435   auto constraints = FlatAffineValueConstraints::getHyperrectangular(
436       llvm::to_vector<8>(llvm::map_range(
437           packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
438       llvm::to_vector<8>(llvm::map_range(
439           packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
440       llvm::to_vector<8>(llvm::map_range(
441           packingLoops, [](scf::ForOp op) { return op.upperBound(); })));
442 
443   // Iteratively try to fold the upper bounds into the constraints set.
444   if (failed(foldUpperBoundsIntoConstraintsSet(
445           constraints, outermostEnclosingForOp, packingLoops.getArrayRef())))
446     return failure();
447 
448   int nPackedLoops = packingLoops.size();
449   SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
  // Compute the bounds of the first `nPackedLoops` positions, assuming the
  // others are fixed.
451   constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
452                              outermostEnclosingForOp->getContext(), &lbs, &ubs);
453 
454   SmallVector<Value> allValues;
455   constraints.getAllValues(&allValues);
456   SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
457                                       allValues.end());
458 
  // For each packing loop, compute the extent as (ub - lb).ceilDiv(step).
  // The insertion point is just before the outermost loop that we hoist above.
461   assert(nPackedLoops == static_cast<int64_t>(lbs.size()) &&
462          "expected matching lb sizes");
463   assert(nPackedLoops == static_cast<int64_t>(ubs.size()) &&
464          "expected matching ub sizes");
465   SmallVector<Value> dynamicTensorSizes;
466   for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
467     scf::ForOp loop = std::get<0>(it);
468     AffineMap lbMap = std::get<1>(it);
469     AffineMap ubMap = std::get<2>(it);
470     SmallVector<Value> lbOperands(allNonLoopValues);
471     canonicalizeMapAndOperands(&lbMap, &lbOperands);
472     Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);
473 
474     SmallVector<Value> ubOperands(allNonLoopValues);
475     canonicalizeMapAndOperands(&ubMap, &ubOperands);
476     Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);
477 
478     AffineExpr lb, ub, step;
479     bindDims(b.getContext(), lb, ub);
480     bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step), ValueRange{lbVal, ubVal, loop.step()});
484 
485     dynamicTensorSizes.push_back(res);
486   }
487   return dynamicTensorSizes;
488 }
489 
/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
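///
/// For example (illustrative), for
/// ```
/// scf.for %k = %c0 to %c128 step %c32
/// ```
/// the returned value is (%k - 0) ceildiv 32, i.e. 0, 1, 2 and 3 across the
/// loop iterations.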
494 static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
495                                      scf::ForOp forOp) {
496   MLIRContext *ctx = forOp->getContext();
497   AffineExpr iv, lb, step;
498   bindDims(ctx, iv, lb);
499   bindSymbols(ctx, step);
500   if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
501       !isDefinedOutsideOrConstant(outer, forOp.step()))
502     return Value();
503   Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
504         stepVal = forOp.step();
505   auto loc = forOp->getLoc();
506   return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
507                                        ValueRange{ivVal, lbVal, stepVal});
508 }
509 
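/// A simplified sketch of the transformation, in the abbreviated IR notation
/// used throughout this file (types, bounds, maps and the pad region are
/// elided; for illustration only). Hoisting the pad above one loop packs the
/// padded slices produced across the iterations of that loop into a packed
/// tensor allocated just before it, and replaces the original pad by a 1x..x1
/// extract_slice of the packed tensor:
/// ```
/// // Before (numLoops = 1):
/// scf.for %i
///   scf.for %j
///     %slice = tensor.extract_slice %source [%i, %j]
///     %padded = linalg.pad_tensor %slice // tensor<?x?xf32> to tensor<4x8xf32>
///     // ... %padded used as an input tensor of a LinalgOp ...
///
/// // After:
/// scf.for %i
///   %init = linalg.init_tensor [%sz, 4, 8] // tensor<?x4x8xf32>
///   %packed = scf.for %j iter_args(%p = %init)
///     %slice = tensor.extract_slice %source [%i, %j]
///     %padded = linalg.pad_tensor %slice
///     %ins = tensor.insert_slice %padded into %p[%jj, 0, 0] [1, 4, 8] [1, 1, 1]
///     scf.yield %ins
///   scf.for %j
///     %s = tensor.extract_slice %packed[%jj, 0, 0] [1, 4, 8] [1, 1, 1]
///     // ... %s replaces %padded in the original loop body ...
/// ```
/// where %jj denotes the iteration count of the %j loop, i.e.
/// (%j - lb) ceildiv step, and %sz its trip count.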
510 FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
511                                                      int numLoops,
512                                                      PadTensorOp &hoistedOp) {
513   LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
514                     << " loops\n");
515   HoistingAnalysis analysis(opToHoist, numLoops);
516   if (!analysis.isValid()) {
517     LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
518     return failure();
519   }
520 
521   scf::ForOp outer = analysis.outermostEnclosingForOp;
522   ImplicitLocOpBuilder b(outer->getLoc(), outer);
523 
524   auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
525   if (failed(maybeDynamicTensorSizes))
526     return failure();
527   SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;
528 
  // The actual number of loops to hoist over, which may be smaller than
  // `numLoops`.
530   int nPackedLoops = analysis.packingLoops.size();
531 
532   Location loc = opToHoist->getLoc();
533   RankedTensorType paddedTensorType = opToHoist.getResultType();
534   int paddedRank = paddedTensorType.getRank();
535 
536   // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
537   // padding.
538   SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
539   // TODO: go grab dims when necessary, for now PadTensorOp returns a static
540   // tensor.
541   llvm::append_range(packedShape, paddedTensorType.getShape());
542   auto packedTensorType =
543       RankedTensorType::get(packedShape, paddedTensorType.getElementType());
544   Value packedTensor = b.create<linalg::InitTensorOp>(
545       loc, dynamicTensorSizes, packedTensorType.getShape(),
546       packedTensorType.getElementType());
547 
548   // Clone the operations involved in the backward slice, iteratively stepping
549   // into the loops that we encounter.
550   // The implementation proceeds in a stack-like fashion:
551   //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
552   //      deeper in the stack.
  //   2. Create an InsertSliceOp at the top of the stack.
  //   3. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
556   SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
557   clonedLoopIvs.reserve(nPackedLoops);
558   leadingPackedTensorIndexings.reserve(nPackedLoops);
559   BlockAndValueMapping bvm;
560   // Insert `opToHoist` into the backwardSlice so we clone it too.
561   analysis.backwardSlice.insert(opToHoist);
  // Stack step 1: iteratively clone loops and push `packedTensor`.
563   for (Operation *op : analysis.backwardSlice) {
564     // Specifically sit out in the extract_slice(packedTensor) case: this is the
565     // piece we seek to replace.
566     if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
567       if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
568         continue;
569     auto effects = dyn_cast<MemoryEffectOpInterface>(op);
570     bool hasNoEffects = !effects || effects.hasNoEffect();
571     if (hasNoEffects &&
572         (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
573       b.clone(*op, bvm);
574       continue;
575     }
576     // TODO: support more cases as they appear.
577     auto forOp = dyn_cast<scf::ForOp>(op);
578     assert(forOp && "Expected scf::ForOp when hoisting pad ops");
579     // Unused loop, just skip it.
580     if (!analysis.packingLoops.contains(forOp))
581       continue;
582 
583     auto clonedForOp =
584         b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
585                              bvm.lookupOrDefault(forOp.upperBound()),
586                              bvm.lookupOrDefault(forOp.step()), packedTensor);
587     // Map the induction var, region args and results to the `clonedForOp`.
588     bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
589     bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
590     bvm.map(forOp.getResults(), clonedForOp.getResults());
591     assert(clonedForOp->getNumRegions() == 1);
592     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
593 
594     b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
595     Value loopIndependentIterationCount =
596         buildLoopIterationCount(b, outer, clonedForOp);
597     // Assert the loop-independent iteration count can be computed.
598     if (!loopIndependentIterationCount)
599       llvm_unreachable("loop independence prerequisite not met");
600     leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
601     packedTensor = clonedForOp.getRegionIterArgs().front();
602   }
603 
  // Stack step 2: create an InsertSliceOp at the top of the stack.
  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
606   SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
607                                     leadingPackedTensorIndexings.end());
608   offsets.append(paddedRank, b.getIndexAttr(0));
609   // sizes = [1 .. 1, paddedShape].
610   SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
611   for (int64_t sz : paddedTensorType.getShape()) {
612     // TODO: go grab dims when necessary, for now PadTensorOp returns a static
613     // tensor.
614     assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
615     sizes.push_back(b.getIndexAttr(sz));
616   }
617   // strides = [1 .. 1].
618   SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
619                                     b.getIndexAttr(1));
620 
621   Value inserted =
622       b.create<tensor::InsertSliceOp>(loc, bvm.lookup(opToHoist.result()),
623                                       packedTensor, offsets, sizes, strides);
624 
  // Stack step 3: iteratively pop the stack and propagate the yield.
626   Value valueToYield = inserted;
627   for (Value iv : llvm::reverse(clonedLoopIvs)) {
628     auto forOp = scf::getForInductionVarOwner(iv);
629     b.setInsertionPointToEnd(&forOp.getRegion().front());
630     b.create<scf::YieldOp>(loc, valueToYield);
631     valueToYield = forOp.getResult(0);
632   }
633 
634   // Now the packed tensor is ready, replace the original padding op by a
635   // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
636   b.setInsertionPoint(opToHoist);
637   SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
638       llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
639         return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
640       }));
641   // Assert all loop iteration counts can be computed.
642   if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
643     llvm_unreachable("loop independence prerequisite not met");
644   // offsets = [originalLoopIvs, 0 .. 0].
645   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
646   offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape] (defined above).
  // strides = [1 .. 1] (defined above).
649   packedTensor =
650       scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
651   Value newResult = b.create<tensor::ExtractSliceOp>(
652       loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides);
653 
654   // Make the newly cloned `opToHoist` available to the caller.
655   hoistedOp = cast<PadTensorOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
656   return newResult;
657 }
658