//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support PadTensorOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. There is no immediately enclosing scf::ForOp.
///   3. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with a region.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
/// Other cases succeed and will trigger hoisting of the pad op.
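///
/// A rough pseudo-IR sketch of the intended effect (types, loop bounds and
/// slice indexings are illustrative only). Assume the pad depends on %j but
/// not on %i; hoisting by 2 loops turns:
///
///   scf.for %i {
///     scf.for %j {
///       %s = tensor.extract_slice ...[%j]... : tensor<?x?xf32>
///       %p = linalg.pad_tensor %s ... : tensor<?x?xf32> to tensor<4x8xf32>
///       "compute"(%p)
///     }
///   }
///
/// into a prologue that packs all padded tiles once, followed by the original
/// loops reading 1x4x8 slices of the packed tensor:
///
///   %packed = scf.for %j iter_args(%t = %init) -> tensor<?x4x8xf32> {
///     %s = tensor.extract_slice ...[%j]... : tensor<?x?xf32>
///     %p = linalg.pad_tensor %s ... : tensor<?x?xf32> to tensor<4x8xf32>
///     %u = tensor.insert_slice %p into %t[...][1, 4, 8][1, 1, 1]
///     scf.yield %u : tensor<?x4x8xf32>
///   }
///   scf.for %i {
///     scf.for %j {
///       %p = tensor.extract_slice %packed[...][1, 4, 8][1, 1, 1]
///       "compute"(%p)
///     }
///   }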
struct HoistingAnalysis {
  HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops and
  /// `backwardSlice`.
  FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The padTensorOp that needs to be hoisted.
  PadTensorOp padTensorOp;

  /// The maximum number of immediately enclosing scf::ForOp to hoist over.
  int nLevels;

  /// The outermost loop, determined by `nLevels`, above which `padTensorOp`
  /// will be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padTensorOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps, nested under `outermostEnclosingForOp` (inclusive),
  /// whose induction variables are used, directly or indirectly, in the
  /// computation of `padTensorOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  /// Conceptually a `SmallSetVector<scf::ForOp>`.
  SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>>
      packingLoops;

private:
  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

/// Return true if all uses of `padTensorOp` are input tensors of some
/// LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) {
  for (OpOperand &use : padTensorOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most `nLevels` of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled at
/// the moment.
/// Control-flow and other containing ops with regions are not modeled at the
/// moment.
static void
getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padTensorOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
    : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;

  // Bail on any use that isn't an input of a Linalg op.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp))
    return;

  // Get at most nLevels of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get all the ops in the backward slice starting from `padTensorOp` that
  // are dominated by the outermost enclosing loop.
  // Bail on any op with a region that is neither an scf::ForOp nor a LinalgOp.
  bool analysisFailure = false;
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(
      padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) {
        if (!domInfo.dominates(outermostEnclosingForOp, op))
          return false;
        if (op != padTensorOp && op->getNumRegions() > 0 &&
            !isa<scf::ForOp, LinalgOp>(op)) {
          analysisFailure = true;
          LLVM_DEBUG(DBGS()
                     << "Unsupported op with region: " << *op << " -> skip\n");
          return false;
        }
        return true;
      });

  if (analysisFailure || backwardSlice.empty())
    return;

  // Backward slice is a topologically sorted list of ops starting at
  // `outermostEnclosingForOp`.
  assert(outermostEnclosingForOp == backwardSlice.front());

  // Keep only the loops whose induction variable is used, directly or
  // indirectly, to compute the padded result. As a first approximation, an IV
  // is considered used if it has a user in `backwardSlice`.
  // The loops that are filtered out are dimensions of reuse that we can
  // exploit to reduce the amount of copying and memory.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
    for (Operation *user : forOp.getInductionVar().getUsers()) {
      if (backwardSlice.contains(user)) {
        packingLoops.insert(forOp);
        break;
      }
    }
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}

/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound, up to the `outerLimit` loop, and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backward slice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their results are projected out of the
/// constraint set.
/// The following operations are supported:
///   - scf::ForOp are simply skipped.
///   - AffineApplyOp are composed to replace the result by an equality.
///   - AffineMinOp are composed by adding each entry as an upper bound.
/// If any other operation is met, return failure.
// TODO: extend on a per-need basis.
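//
// For instance (illustrative sketch; names and maps are made up), given a
// loop whose upper bound is:
//
//   %ub = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%i)[%n]
//   scf.for %j = %c0 to %ub step %c1 { ... }
//
// composing the affine.min adds the inequalities `ub <= 4` and
// `ub <= %n - %i` on the id associated with %ub; that id is later projected
// out so only bounds on the packing loop IVs remain.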
static LogicalResult
foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
                                  scf::ForOp outerLimit,
                                  ArrayRef<scf::ForOp> loops) {
  SetVector<Value> toProjectOut;
  for (scf::ForOp loop : loops) {
    auto ub = loop.upperBound();
    if (isDefinedOutsideOrConstant(outerLimit, ub))
      continue;

    // Compute a backward slice up to, but not including, `outerLimit`.
    SetVector<Operation *> backwardSlice;
    getBackwardSlice(ub, &backwardSlice, [&](Operation *op) {
      return outerLimit->isProperAncestor(op);
    });
    backwardSlice.insert(ub.getDefiningOp());

    // Iterate over all ops in the slice and compose them into the constraints.
    for (Operation *op : llvm::reverse(backwardSlice)) {
      if (!isa<scf::ForOp, AffineApplyOp, AffineMinOp>(op))
        return failure();
      if (isa<scf::ForOp>(op))
        continue;
      // Ensure there is an id for `v` in the constraint set: if one already
      // exists it must be a dimension id (otherwise report failure); if none
      // exists, append `v` as a new dimension id.
      auto ensureIdFailed = [&](Value v) {
        if (constraints.containsId(v)) {
          unsigned pos;
          constraints.findId(v, &pos);
          return pos >= constraints.getNumDimIds();
        }
        constraints.appendDimId(v);
        return false;
      };

      // Ensure all ids exist and add results for later projection.
      if (llvm::any_of(op->getResults(), ensureIdFailed) ||
          llvm::any_of(op->getOperands(), ensureIdFailed))
        return failure();

      // All supported ops have 1 result.
      // TODO: extend when needed.
      toProjectOut.insert(op->getResult(0));

      // Compose supported ops.
      if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
        AffineValueMap avm(affineApplyOp.getAffineMap(),
                           affineApplyOp.getOperands(),
                           affineApplyOp.getResult());
        if (failed(constraints.composeMap(&avm)))
          return failure();
        continue;
      }
      auto affineMinOp = cast<AffineMinOp>(op);
      unsigned pos;
      bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
      (void)foundMinOp;
      assert(foundMinOp);
      AffineMap alignedMap = constraints.computeAlignedMap(
          affineMinOp.getAffineMap(), affineMinOp.getOperands());
      if (failed(
              constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
        return failure();
    }
  }
  for (Value v : toProjectOut)
    constraints.projectOut(v);
  return success();
}

// Footprint of the packedTensor, computed from the packingLoops and
// `backwardSlice`.
FailureOr<SmallVector<Value>>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  // Create the base affine constraints for the `packingLoops`.
  auto constraints = FlatAffineValueConstraints::getHyperrectangular(
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.upperBound(); })));

  // Iteratively try to fold the upper bounds into the constraint set.
  if (failed(foldUpperBoundsIntoConstraintsSet(
          constraints, outermostEnclosingForOp, packingLoops.getArrayRef())))
    return failure();

  int nPackedLoops = packingLoops.size();
  SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
  // Compute the bounds of the first `nPackedLoops` positions, assuming the
  // others are fixed.
  constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
                             outermostEnclosingForOp->getContext(), &lbs, &ubs);

  SmallVector<Value> allValues;
  constraints.getAllValues(&allValues);
  SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
                                      allValues.end());

  // For each packing loop, compute the extent as (ub - lb).ceilDiv(step).
  // The insertion point is just before the outermost enclosing loop that we
  // hoist above.
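  // For example (illustrative numbers): a loop with lb = 0, ub = 11 and
  // step = 4 contributes a packed dimension of size ceilDiv(11 - 0, 4) = 3.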
  assert(nPackedLoops == static_cast<int64_t>(lbs.size()) &&
         "expected matching lb sizes");
  assert(nPackedLoops == static_cast<int64_t>(ubs.size()) &&
         "expected matching ub sizes");
  SmallVector<Value> dynamicTensorSizes;
  for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
    scf::ForOp loop = std::get<0>(it);
    AffineMap lbMap = std::get<1>(it);
    AffineMap ubMap = std::get<2>(it);
    SmallVector<Value> lbOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&lbMap, &lbOperands);
    Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);

    SmallVector<Value> ubOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&ubMap, &ubOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);

    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});

    dynamicTensorSizes.push_back(res);
  }
  return dynamicTensorSizes;
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop contained in the
/// range [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
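///
/// For instance (illustrative), for `scf.for %i = %c0 to %n step %c4` this
/// builds `affine.apply affine_map<(d0, d1)[s0] -> ((d0 - d1) ceildiv s0)>
/// (%i, %c0)[%c4]`, i.e. the normalized iteration number `%i ceildiv 4`.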
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.step()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
        stepVal = forOp.step();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

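// Minimal usage sketch of the entry point below (hypothetical caller, e.g. a
// test pass or rewrite pattern; variable names are illustrative only):
//
//   PadTensorOp hoistedOp;
//   FailureOr<Value> newResult =
//       hoistPaddingOnTensors(padOp, /*numLoops=*/2, hoistedOp);
//   if (succeeded(newResult)) {
//     // `*newResult` is the 1x..x1 slice of the packed tensor that replaces
//     // the original pad; `hoistedOp` refers to the cloned, hoisted pad op.
//     padOp.result().replaceAllUsesWith(*newResult);
//     padOp->erase();
//   }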
FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
                                                     int numLoops,
                                                     PadTensorOp &hoistedOp) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
  if (failed(maybeDynamicTensorSizes))
    return failure();
  SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;

  // The actual number of packing loops may be smaller than `numLoops`.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now PadTensorOp returns a static
  // tensor.
  llvm::append_range(packedShape, paddedTensorType.getShape());
  auto packedTensorType =
      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create an InsertSliceOp at the top of the stack.
  //   3. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
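  //
  // The end state is roughly (illustrative sketch with a single packing loop;
  // names, types and slice indexings are made up):
  //
  //   %packed = scf.for %k = %lb to %ub step %s iter_args(%t = %init) {
  //     ... cloned index/slice computations ...
  //     %pad  = linalg.pad_tensor ...
  //     %iter = affine.apply (%k - %lb) ceildiv %s
  //     %ins  = tensor.insert_slice %pad into %t[%iter, 0, 0][1, ...][1, ...]
  //     scf.yield %ins
  //   }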
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Insert `opToHoist` into the backwardSlice so we clone it too.
  analysis.backwardSlice.insert(opToHoist);
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically skip the extract_slice(packedTensor) case: this is the
    // piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
    bool hasNoEffects = !effects || effects.hasNoEffect();
    if (hasNoEffects &&
        (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
      b.clone(*op, bvm);
      continue;
    }
    // TODO: support more cases as they appear.
    auto forOp = dyn_cast<scf::ForOp>(op);
    assert(forOp && "Expected scf::ForOp when hoisting pad ops");
    // Unused loop, just skip it.
    if (!analysis.packingLoops.contains(forOp))
      continue;

    auto clonedForOp =
        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                             bvm.lookupOrDefault(forOp.upperBound()),
                             bvm.lookupOrDefault(forOp.step()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Stack step 2. create InsertSliceOp at the top of the stack.
  // offsets = [leadingPackedTensorIndexings, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : paddedTensorType.getShape()) {
    // TODO: go grab dims when necessary, for now PadTensorOp returns a static
    // tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  Value inserted =
      b.create<tensor::InsertSliceOp>(loc, bvm.lookup(opToHoist.result()),
                                      packedTensor, offsets, sizes, strides);

  // Stack step 3. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now that the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [loopIterationCounts, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides);

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp = cast<PadTensorOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}