//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support PadTensorOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. There is no immediately enclosing scf::ForOp.
///   3. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with a region.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      is empty.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistingAnalysis {
  HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops and
  /// `backwardSlice`.
  FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The padTensorOp that needs to be hoisted.
  PadTensorOp padTensorOp;

  /// The maximum number of immediately enclosing scf::ForOp to hoist over.
  int nLevels;

  /// The outermost loop, determined by `nLevels`, above which `padTensorOp`
  /// will be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padTensorOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOp immediately enclosing `padTensorOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive) and
  ///   2. their induction variable is used, directly or indirectly, in the
  ///      computation of `padTensorOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  // SmallSetVector<scf::ForOp> packingLoops;
  SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>>
      packingLoops;

private:
  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};
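// As an illustration (schematic IR with invented names and shapes, not
// verbatim output), hoisting by one loop conceptually rewrites:
//
//   scf.for %i = %c0 to %N step %c4 {
//     %s = tensor.extract_slice %t[%i] ...   // depends only on %i
//     %p = linalg.pad_tensor %s ...          // re-padded on every iteration
//     ... linalg op consuming %p as an input tensor ...
//   }
//
// into a packing loop that pads each tile once into a packed tensor, followed
// by the original loop reading 1x..x1 slices back out of it:
//
//   %packed = scf.for %i ... iter_args(%arg = %init) {
//     %s = tensor.extract_slice %t[%i] ...
//     %p = linalg.pad_tensor %s ...
//     %r = tensor.insert_slice %p into %arg[%count, 0] ...
//     scf.yield %r
//   }
//   scf.for %i = %c0 to %N step %c4 {
//     %p = tensor.extract_slice %packed[%count, 0] ...
//     ... linalg op consuming %p ...
//   }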
/// Return true if all uses of `padTensorOp` are an input tensor of some
/// LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) {
  for (OpOperand &use : padTensorOp.result().getUses()) {
    auto linalgUser = dyn_cast<LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padTensorOp->getParentOfType<FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padTensorOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
    : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) {
  AsmState state(padTensorOp->getParentOfType<FuncOp>());
  (void)state;
  // Bail on any use that isn't an input of a Linalg op.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp))
    return;

  // Get at most nLevels of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get all the ops in the backward slice starting from `padTensorOp` that
  // are dominated by the outermost enclosing loop.
  // Bail on any op with a region that is neither an scf::ForOp nor a
  // LinalgOp.
  bool analysisFailure = false;
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(
      padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) {
        if (!domInfo.dominates(outermostEnclosingForOp, op))
          return false;
        if (op != padTensorOp && op->getNumRegions() > 0 &&
            !isa<scf::ForOp, LinalgOp>(op)) {
          analysisFailure = true;
          LLVM_DEBUG(DBGS()
                     << "Unsupported op with region: " << *op << " -> skip\n");
          return false;
        }
        return true;
      });

  if (analysisFailure || backwardSlice.empty())
    return;

  // The backward slice is a topologically sorted list of ops starting at
  // `outermostEnclosingForOp`.
  assert(outermostEnclosingForOp == backwardSlice.front());

  // Filter out the loops whose induction variable is not used to compute the
  // padded result. As a first approximation, just look for IVs that have no
  // use in the backwardSlice.
  // Those filtered-out loops are the dimensions of reuse that we can exploit
  // to reduce the amount of copy / memory traffic.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
    for (Operation *user : forOp.getInductionVar().getUsers()) {
      if (backwardSlice.contains(user)) {
        packingLoops.insert(forOp);
        break;
      }
    }
  }
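  // For example (schematic IR with invented names): given
  //   scf.for %i ... { scf.for %j ... { ... pad indexed by %i only ... } }
  // the backward slice of the pad uses %i but not %j, so only the %i loop
  // becomes a packing loop; %j is a pure reuse dimension and does not enlarge
  // the packed tensor.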
  // The analysis is valid and hoisting can occur.
  valid = true;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}

/// For each loop in `loops`, determine the ops involved in the construction
/// of its upper bound, up to the `outerLimit` loop, and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backwardSlice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their result is projected out of the
/// constraint set.
/// The following operations are supported:
///   - scf::ForOp are simply skipped.
///   - AffineApplyOp are composed to replace the result by an equality.
///   - AffineMinOp are composed by adding each entry as an upper bound.
/// If any other operation is met, return failure.
// TODO: extend on a per-need basis.
static LogicalResult
foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
                                  scf::ForOp outerLimit,
                                  ArrayRef<scf::ForOp> loops) {
  SetVector<Value> toProjectOut;
  for (scf::ForOp loop : loops) {
    auto ub = loop.upperBound();
    if (isDefinedOutsideOrConstant(outerLimit, ub))
      continue;

    // Compute a backward slice up to, but not including, `outerLimit`.
    SetVector<Operation *> backwardSlice;
    getBackwardSlice(ub, &backwardSlice, [&](Operation *op) {
      return outerLimit->isProperAncestor(op);
    });
    backwardSlice.insert(ub.getDefiningOp());

    // Iterate over all ops in the slice and compose them in the constraints.
    for (Operation *op : llvm::reverse(backwardSlice)) {
      if (!isa<scf::ForOp, AffineApplyOp, AffineMinOp>(op))
        return failure();
      if (isa<scf::ForOp>(op))
        continue;
      // Ensure there is an id for `v`; if `v` already has one, it must be a
      // dim id.
      auto ensureIdFailed = [&](Value v) {
        if (constraints.containsId(v)) {
          unsigned pos;
          constraints.findId(v, &pos);
          return pos >= constraints.getNumDimIds();
        }
        constraints.appendDimId(v);
        return false;
      };

      // Ensure all ids exist and add results for later projection.
      if (llvm::any_of(op->getResults(), ensureIdFailed) ||
          llvm::any_of(op->getOperands(), ensureIdFailed))
        return failure();

      // All supported ops have 1 result.
      // TODO: extend when needed.
      toProjectOut.insert(op->getResult(0));

      // Compose supported ops.
      if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
        AffineValueMap avm(affineApplyOp.getAffineMap(),
                           affineApplyOp.getOperands(),
                           affineApplyOp.getResult());
        if (failed(constraints.composeMap(&avm)))
          return failure();
        continue;
      }
      auto affineMinOp = cast<AffineMinOp>(op);
      unsigned pos;
      bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
      (void)foundMinOp;
      assert(foundMinOp);
      AffineMap alignedMap = constraints.computeAlignedMap(
          affineMinOp.getAffineMap(), affineMinOp.getOperands());
      if (failed(
              constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
        return failure();
    }
  }
  for (Value v : toProjectOut)
    constraints.projectOut(v);
  return success();
}

// Footprint of the packedTensor, computed from the packingLoops and
// `backwardSlice`.
FailureOr<SmallVector<Value>>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  // Create the base affine constraints for the packedLoops.
  auto constraints = FlatAffineValueConstraints::getHyperrectangular(
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.upperBound(); })));

  // Iteratively try to fold the upper bounds into the constraints set.
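  // For instance (a schematic example), a tiled loop whose upper bound is
  //   %ub = affine.min affine_map<(d0)[s0] -> (d0 + 4, s0)>(%i)[%n]
  // contributes the upper bounds `ub <= i + 4` and `ub <= n` to the set,
  // after which %ub itself is projected out, so that the computed extents
  // only reference values available above `outermostEnclosingForOp`.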
  if (failed(foldUpperBoundsIntoConstraintsSet(
          constraints, outermostEnclosingForOp, packingLoops.getArrayRef())))
    return failure();

  int nPackedLoops = packingLoops.size();
  SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
  // Compute the bounds of the first positions, assuming the others are fixed.
  constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
                             outermostEnclosingForOp->getContext(), &lbs,
                             &ubs);

  SmallVector<Value> allValues;
  constraints.getAllValues(&allValues);
  SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
                                      allValues.end());

  // For each packingLoop, create the extent by (ub - lb).ceilDiv(step).
  // The insertion point is just before the outermost loop that we hoist
  // above.
  assert(nPackedLoops == static_cast<int>(lbs.size()) &&
         "expected matching lb sizes");
  assert(nPackedLoops == static_cast<int>(ubs.size()) &&
         "expected matching ub sizes");
  SmallVector<Value> dynamicTensorSizes;
  for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
    scf::ForOp loop = std::get<0>(it);
    AffineMap lbMap = std::get<1>(it);
    AffineMap ubMap = std::get<2>(it);
    SmallVector<Value> lbOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&lbMap, &lbOperands);
    Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);
    SmallVector<Value> ubOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&ubMap, &ubOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);
    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step), ValueRange{lbVal, ubVal, loop.step()});
    dynamicTensorSizes.push_back(res);
  }
  return dynamicTensorSizes;
}

/// Return true if `v` is a value that is only transitively defined by ops of
/// a type in `OpTypeList`.
template <typename... OpTypeList>
static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) {
  // Compute a backward slice up to, but not including, `outerLimit`.
  SetVector<Operation *> backwardSlice;
  getBackwardSlice(v, &backwardSlice, [&](Operation *op) {
    return outerLimit->isProperAncestor(op);
  });
  // Traverse the backward slice and ensure we can perform the computation to
  // hoist.
  for (Operation *op : backwardSlice) {
    if (isa<OpTypeList...>(op))
      continue;
    LLVM_DEBUG(DBGS() << "Abort: inadmissible op in slice " << *op << "\n");
    return false;
  }
  return true;
}
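// For example (a hypothetical call, not made in this file):
//   backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineApplyOp>(outer, v)
// returns true iff every op that transitively defines `v` under `outer` is
// either an scf.for or an affine.apply.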
/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.step()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
        stepVal = forOp.step();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
                                                  int nLoops) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(padTensorOp) << " by " << nLoops
                    << " loops\n");
  HoistingAnalysis analysis(padTensorOp, nLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
  if (failed(maybeDynamicTensorSizes))
    return failure();
  SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;

  // Update the actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = padTensorOp->getLoc();
  RankedTensorType paddedTensorType = padTensorOp.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now PadTensorOp returns a static
  // tensor.
  llvm::append_range(packedShape, paddedTensorType.getShape());
  auto packedTensorType =
      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the
  //      `packedTensor` deeper in the stack.
  //   2. Create an InsertSliceOp at the top of the stack.
  //   3. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Insert `padTensorOp` into the backwardSlice so we clone it too.
  analysis.backwardSlice.insert(padTensorOp);
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(packedTensor) case: this is
    // the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
    bool hasNoEffects = !effects || effects.hasNoEffect();
    if (hasNoEffects &&
        (op->getNumRegions() == 0 || isa<PadTensorOp>(op))) {
      b.clone(*op, bvm);
      continue;
    }
    // TODO: support more cases as they appear.
    auto forOp = dyn_cast<scf::ForOp>(op);
    assert(forOp && "Expected scf::ForOp when hoisting pad ops");
    // Unused loop, just skip it.
    if (!analysis.packingLoops.contains(forOp))
      continue;
    auto clonedForOp =
        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                             bvm.lookupOrDefault(forOp.upperBound()),
                             bvm.lookupOrDefault(forOp.step()), packedTensor);
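    // At this point the cloned loop threads the packed tensor through its
    // iter_args, e.g. (schematically, with an invented type):
    //   %r = scf.for %i = %lb to %ub step %s
    //        iter_args(%p = %packedTensor) -> (tensor<?x4x8xf32>) { ... }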
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Stack step 2. create InsertSliceOp at the top of the stack.
  // offsets = [clonedLoopIvs, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : paddedTensorType.getShape()) {
    // TODO: go grab dims when necessary, for now PadTensorOp returns a static
    // tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  Value inserted = b.create<tensor::InsertSliceOp>(
      loc, bvm.lookup(padTensorOp.result()), packedTensor, offsets, sizes,
      strides);

  // Stack step 3. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now that the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(padTensorOp);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");

  // offsets = [originalLoopIvs, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  padTensorOp.replaceAllUsesWith(
      b.create<tensor::ExtractSliceOp>(loc, padTensorOp.getResultType(),
                                       packedTensor, offsets, sizes, strides)
          ->getResult(0));

  Operation *toErase = padTensorOp;

  // Make the newly cloned `padTensorOp` available to the caller.
  padTensorOp =
      cast<PadTensorOp>(bvm.lookup(padTensorOp.result()).getDefiningOp());

  toErase->erase();

  return success();
}
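// A minimal caller sketch (hypothetical; in-tree drivers live in test passes
// and pattern-based transforms): walk a function and try to hoist every pad
// op by up to 2 loops, ignoring the pads for which the analysis fails.
//
//   funcOp.walk([](PadTensorOp padOp) {
//     PadTensorOp hoistedOp = padOp;
//     (void)hoistPaddingOnTensors(hoistedOp, /*nLoops=*/2);
//   });
//
// On success, `hoistedOp` points at the newly created pad op above the loops;
// on failure, the pad op is left in place.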