//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;
/// Analysis class to support PadTensorOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. The pad op has a use that is not an input of a LinalgOp.
///   2. The pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non-index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of the pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
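///
/// As a rough illustration of the hoisting this analysis enables (hypothetical
/// IR, not taken from an actual test), hoisting by one loop rewrites
/// ```
/// scf.for %i
///   %slice = tensor.extract_slice %source [%i] [%size]
///   %padded = linalg.pad_tensor %slice ... : tensor<?xf32> to tensor<4xf32>
///   ... use of %padded as a LinalgOp input ...
/// ```
/// into a form where the padding happens in a packing loop placed above the
/// original loop, and the original loop body merely reads back a slice of the
/// packed tensor:
/// ```
/// %packed = scf.for %i ... iter_args(%p = %init) -> tensor<?x4xf32> {
///   %slice = tensor.extract_slice %source [%i] [%size]
///   %padded = linalg.pad_tensor %slice ... : tensor<?xf32> to tensor<4xf32>
///   %inserted = tensor.insert_slice %padded into %p [...] [1, 4] [1, 1]
///   scf.yield %inserted
/// }
/// scf.for %i
///   %padded = tensor.extract_slice %packed [...] [1, 4] [1, 1]
///   ... use of %padded as a LinalgOp input ...
/// ```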
struct HoistingAnalysis {
  HoistingAnalysis(PadTensorOp padTensorOp, int numLoops);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops.
  SmallVector<Value> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The outermost loop, determined by `numLoops`, above which `padTensorOp`
  /// will be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padTensorOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOp loops immediately enclosing `padTensorOp` such that:
  ///  1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///  2. their induction variables are used, directly or indirectly, in the
  ///     computation of `padTensorOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  SmallVector<scf::ForOp> packingLoops;

private:
  /// Drop any non-index dependencies of `padTensorOp` and `sliceOp` from
  /// `backwardSlice`. The method follows the use-def chains of the index
  /// operands consumed by `padTensorOp` and `sliceOp` and drops the operations
  /// not part of this index computation. Afterwards, the filtered
  /// `backwardSlice` contains only the loops whose induction variable is used,
  /// directly or indirectly, to index the padded tensor. The method returns
  /// failure if the filtered backward slice contains an unexpected operation.
  ///
  /// Example:
  /// ```
  /// %source = linalg.fill(%cst, %arg0)
  /// scf.for %i
  ///   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  ///   scf.for %j (%arg2 = %unrelated)
  ///     scf.for %k                             // not used to index %source!
  ///       %ubi = affine.min #map(%i)
  ///       %ubj = affine.min #map(%j)
  ///       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  ///       %padded_slice = linalg.pad_tensor %slice
  /// ```
  /// dropNonIndexDependencies(%padded_slice, %slice)
  /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
  LogicalResult dropNonIndexDependencies(PadTensorOp padTensorOp,
                                         tensor::ExtractSliceOp sliceOp);

  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

/// Return true if all uses of `padTensorOp` are input tensors of some
/// LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) {
  for (OpOperand &use : padTensorOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most `nLevels` of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
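/// For instance (hypothetical IR, for illustration only), with `nLevels` >= 3
/// only the `%j` and `%i` loops are collected below, because the scf.parallel
/// stops the walk:
/// ```
/// scf.for %outer
///   scf.parallel (%p)
///     scf.for %i
///       scf.for %j
///         %padded = linalg.pad_tensor ...
/// ```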
static void
getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padTensorOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) {
  valid = false;

  // Bail on any use that isn't an input of a Linalg op.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp))
    return;

  // Get at most `numLoops` of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padTensorOp, numLoops, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get the `sliceOp` that defines the source tensor of `padTensorOp` and
  // check its source is defined outside of the outermost loop. This check
  // ensures the padded data is available for packing before entering the
  // outermost enclosing loop.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // // %source is available for packing here!
  // scf.for %i
  //   scf.for %j
  //     scf.for %k
  //       %slice = tensor.extract_slice %source [%i, %j]
  //       %padded_slice = linalg.pad_tensor %slice
  // ```
  auto sliceOp = padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp) {
    LLVM_DEBUG(DBGS() << "Cannot find the extract slice op -> skip\n");
    return;
  }
  if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.source())) {
    LLVM_DEBUG(DBGS() << "Source not defined outside of loops -> skip\n");
    return;
  }

  // Check that the region of `padTensorOp` depends on a constant only. Adding
  // hoisting support for arbitrary padding regions would require cloning all
  // dependencies captured by the padding region.
  Value paddingValue = padTensorOp.getConstantPaddingValue();
  if (!paddingValue ||
      !isa_and_nonnull<arith::ConstantOp>(paddingValue.getDefiningOp())) {
    LLVM_DEBUG(DBGS() << "Cannot find constant padding value -> skip\n");
    return;
  }

  // Get all the ops in the backward slice starting from `padTensorOp` that are
  // dominated by the outermost enclosing loop.
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(padTensorOp.getOperation(), &backwardSlice,
                   [&](Operation *op) {
                     return domInfo.dominates(outermostEnclosingForOp, op);
                   });
  if (backwardSlice.empty())
    return;
  // Add `padTensorOp` itself to the backward slice.
  backwardSlice.insert(padTensorOp.getOperation());

  // Remove all ops in the backward slice that are not used to index the padded
  // tensor. In particular, keep `padTensorOp`, `sliceOp`, and the loop and
  // affine operations used for the index computation.
  if (failed(dropNonIndexDependencies(padTensorOp, sliceOp)))
    return;

  // Add only the loops that are part of the filtered `backwardSlice` to the
  // packing loops. All other loops are not used to index the padded data and
  // consequently access the same data in every loop iteration. Adding them to
  // the packing loops would increase the cache footprint of the packed data
  // by storing the same data multiple times.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops))
    if (backwardSlice.contains(forOp))
      packingLoops.push_back(forOp);
  if (packingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "Cannot find a packing loop -> skip\n");
    return;
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

LogicalResult
HoistingAnalysis::dropNonIndexDependencies(PadTensorOp padTensorOp,
                                           tensor::ExtractSliceOp sliceOp) {
  // Set of all values used for index computation.
  SetVector<Value> indexEdges;

  // Add all index operands of `operation` to `indexEdges`. An index operand is
  // an operand of type index.
  auto addIndexOperandsToIndexEdges = [&](Operation *operation) {
    for (Value operand : operation->getOperands())
      if (operand.getType().isIndex())
        indexEdges.insert(operand);
  };

  // Check if any operation result is contained in `indexEdges`.
  auto hasIndexResult = [&](Operation *operation) {
    return llvm::any_of(operation->getResults(), [&](Value result) {
      return indexEdges.contains(result);
    });
  };

  // Starting from `padTensorOp` and `sliceOp` walk the use-def edges of index
  // type in `backwardSlice`. Add the index operands of an operation to
  // `indexEdges` and remove all operations from `backwardSlice` that are not
  // part of the index computation.
  //
  // Example:
  // ```
  // %source = linalg.fill(%cst, %arg0)
  // scf.for %i
  //   %unrelated = linalg.fill(%cst, %arg1)    // not used to index %source!
  //   scf.for %j (%arg2 = %unrelated)
  //     scf.for %k                             // not used to index %source!
  //       %ubi = affine.min #map(%i)
  //       %ubj = affine.min #map(%j)
  //       %slice = tensor.extract_slice %source [%i, %j] [%ubi, %ubj]
  //       %padded_slice = linalg.pad_tensor %slice
  // ```
  // After iterating `backwardSlice` we obtain:
  // indexEdges = [%i, %j, %ubi, %ubj]
  // backwardSlice = backwardSlice / [linalg.fill(%cst, %arg1), scf.for %k]
  SetVector<Operation *> operationsToRemove;
  for (Operation *op : llvm::reverse(backwardSlice)) {
    // Add the index operands of `padTensorOp` and `sliceOp` to start the
    // exploration of the index computation.
    if (op == padTensorOp || op == sliceOp) {
      addIndexOperandsToIndexEdges(op);
      continue;
    }
    // Add the index operands of the loop if its induction variable is
    // used for index computation.
    if (auto forOp = dyn_cast<scf::ForOp>(op)) {
      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
        addIndexOperandsToIndexEdges(op);
        continue;
      }
    }
    // Add the index operands of all other operations if at least one result is
    // used for index computation.
    if (hasIndexResult(op)) {
      addIndexOperandsToIndexEdges(op);
      // Check that all operands of the remaining operations have index type.
      if (llvm::any_of(op->getOperandTypes(),
                       [](Type type) { return !type.isIndex(); })) {
        LLVM_DEBUG(DBGS() << "Unsupported op with non-index type operands: "
                          << *op << " -> skip\n");
        return failure();
      }
      // Check that the remaining operations have no regions or memory effects.
      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
      if (hasMemoryEffect || op->getNumRegions() != 0) {
        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
                          << *op << " -> skip\n");
        return failure();
      }
      continue;
    }
    // Remove all other operations not used by the index computation. An
    // exception is constant operations that may be used by `padTensorOp`.
    if (!isa<arith::ConstantOp>(op))
      operationsToRemove.insert(op);
  }
  backwardSlice.set_subtract(operationsToRemove);
  return success();
}

SmallVector<Value>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  SmallVector<Value> dynamicTensorSizes;

  // Upper bound the packing loop lengths to size the packed tensor. Taking
  // upper bounds can make the sizes of the packed tensor independent of the
  // enclosing loops. This independence is a prerequisite for reusing the same
  // buffer for all enclosing loop iterations and hoisting its allocation out of
  // the enclosing loops.
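  //
  // For example (hypothetical bounds, for illustration only), a packing loop
  //   scf.for %i = %c0 to %n step %c4
  // whose upper bound %n is bounded above by the constant 128 by
  // `getUpperBoundForIndex` contributes (128 - 0).ceilDiv(4) = 32 as the
  // corresponding dynamic size of the packed tensor.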
  for (auto forOp : packingLoops) {
    // Compute an upper bound `ubVal` for the upper bound of `forOp`.
    AffineMap boundMap;
    SmallVector<Value> boundOperands;
    getUpperBoundForIndex(forOp.upperBound(), boundMap, boundOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(boundMap, boundOperands);
    // Compute the maximal packing loop length as (ub - lb).ceilDiv(step) and
    // store the result to `dynamicTensorSizes`.
    // TODO: instead of using the lower bound of `forOp` directly, implement a
    // lower bound computation similar to the upper bound computation.
    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{forOp.lowerBound(), ubVal, forOp.step()});
    dynamicTensorSizes.push_back(res);
  }

  return dynamicTensorSizes;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) ||
         v.getDefiningOp<arith::ConstantOp>();
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
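/// For example (hypothetical loop), for `scf.for %i = %c2 to %n step %c4` this
/// builds roughly `affine.apply (d0 - d1) ceildiv s0 (%i, %c2)[%c4]`, yielding
/// iteration 0 for %i = 2, iteration 1 for %i = 6, and so on.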
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.step()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
        stepVal = forOp.step();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
                                                     int numLoops,
                                                     PadTensorOp &hoistedOp) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  SmallVector<Value> dynamicTensorSizes = analysis.getPackedTensorSizes(b);

  // Update the actual number of loops, which may be smaller than `numLoops`.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now PadTensorOp returns a static
  // tensor.
  llvm::append_range(packedShape, paddedTensorType.getShape());
  auto packedTensorType =
      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create an InsertSliceOp at the top of the stack.
  //   3. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
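  //
  // Sketch of the result for a single packing loop (hypothetical names, types
  // elided for brevity):
  //   %packed = scf.for %i ... iter_args(%arg = %packedTensor) {
  //     ... cloned index computation, extract_slice and pad_tensor ...
  //     %inserted = tensor.insert_slice %padded into %arg
  //                     [%iter, 0, ..., 0] [1, <padded shape>] [1, ..., 1]
  //     scf.yield %inserted
  //   }
  // where %iter is the loop-independent iteration count built below.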
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically skip the extract_slice(packedTensor) case: this is the
    // piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    // Clone all operations except loops.
    auto forOp = dyn_cast<scf::ForOp>(op);
    if (!forOp) {
      b.clone(*op, bvm);
      continue;
    }
    // Create a packing loop that takes `packedTensor` as iteration argument.
    auto clonedForOp =
        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                             bvm.lookupOrDefault(forOp.upperBound()),
                             bvm.lookupOrDefault(forOp.step()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Stack step 2. create an InsertSliceOp at the top of the stack.
  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : paddedTensorType.getShape()) {
    // TODO: go grab dims when necessary, for now PadTensorOp returns a static
    // tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  Value inserted =
      b.create<tensor::InsertSliceOp>(loc, bvm.lookup(opToHoist.result()),
                                      packedTensor, offsets, sizes, strides);

  // Stack step 3. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now that the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [loopIterationCounts, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
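  //
  // For a single packing loop and a hypothetical tensor<4x8xf32> padded
  // result, the replacement is an op of the form
  //   %res = tensor.extract_slice %packed[%iter, 0, 0] [1, 4, 8] [1, 1, 1]
  //        : tensor<?x4x8xf32> to tensor<4x8xf32>
  // where %iter is the iteration count of the original loop, rebuilt at the
  // position of `opToHoist`.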
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [loopIterationCounts, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides);

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp = cast<PadTensorOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}