//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Analysis/AffineStructures.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"

using llvm::dbgs;

#define DEBUG_TYPE "hoist-padding"

#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")

using namespace mlir;
using namespace mlir::linalg;

/// Analysis class to support PadTensorOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. The pad op has a use that is not an input of a LinalgOp.
///   2. There is no immediately enclosing scf::ForOp.
///   3. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with a region.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistingAnalysis {
  HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops and
  /// `backwardSlice`.
  FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The padTensorOp that needs to be hoisted.
  PadTensorOp padTensorOp;

  /// The maximum number of immediately enclosing scf::ForOp to hoist over.
  int nLevels;

  /// The outermost loop, determined by `nLevels`, above which `padTensorOp`
  /// will be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padTensorOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps immediately enclosing `padTensorOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///   2. their induction variable is used, directly or indirectly, in the
  ///      computation of `padTensorOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  // SmallSetVector<scf::ForOp> packingLoops;
  SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>>
      packingLoops;

private:
  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

/// Return true if all uses of `padTensorOp` are an input tensor of some
/// LinalgOp.
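/// For example, a pad_tensor result that also feeds a tensor.insert_slice (or
/// any other non-LinalgOp user) does not satisfy this property and prevents
/// hoisting.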
static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) {
  for (OpOperand &use : padTensorOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padTensorOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
    : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;

  // Bail on any use that isn't an input of a Linalg op.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp))
    return;

  // Get at most nLevels of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get all the ops in the backward slice starting from `padTensorOp` that are
  // dominated by the outermost enclosing loop.
  // Bail on any op with a region that is neither an scf::ForOp nor a LinalgOp.
  bool analysisFailure = false;
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(
      padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) {
        if (!domInfo.dominates(outermostEnclosingForOp, op))
          return false;
        if (op != padTensorOp && op->getNumRegions() > 0 &&
            !isa<scf::ForOp, LinalgOp>(op)) {
          analysisFailure = true;
          LLVM_DEBUG(DBGS()
                     << "Unsupported op with region: " << *op << " -> skip\n");
          return false;
        }
        return true;
      });

  if (analysisFailure || backwardSlice.empty())
    return;

  // Backward slice is a topologically sorted list of ops starting at
  // `outermostEnclosingForOp`.
  assert(outermostEnclosingForOp == backwardSlice.front());

  // Filter out the loops whose induction variable is not used to compute the
  // padded result. As a first approximation, just look for IVs that have no
  // use in the backwardSlice.
  // These are the dimensions of reuse that we can exploit to reduce the amount
  // of copy / memory.
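  // For example (a hypothetical loop nest, not taken from this file): with
  // enclosing loops %i, %j, %k where the padded slice only depends on %i and
  // %k, only those two loops become packing loops; %j is a pure-reuse
  // dimension and does not contribute a dimension to the packed tensor.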
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
    for (Operation *user : forOp.getInductionVar().getUsers()) {
      if (backwardSlice.contains(user)) {
        packingLoops.insert(forOp);
        break;
      }
    }
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}

/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound (up to the `outerLimit` loop) and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backwardSlice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their result is projected out of the
/// constraint set.
/// The following operations are supported:
///   - scf::ForOp are simply skipped.
///   - AffineApplyOp are composed to replace the result by an equality.
///   - AffineMinOp are composed by adding each entry as an upper bound.
/// If any other operation is met, return failure.
// TODO: extend on a per-need basis.
static LogicalResult
foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
                                  scf::ForOp outerLimit,
                                  ArrayRef<scf::ForOp> loops) {
  SetVector<Value> toProjectOut;
  for (scf::ForOp loop : loops) {
    auto ub = loop.upperBound();
    if (isDefinedOutsideOrConstant(outerLimit, ub))
      continue;

    // Compute a backward slice up to, but not including, `outerLimit`.
    SetVector<Operation *> backwardSlice;
    getBackwardSlice(ub, &backwardSlice, [&](Operation *op) {
      return outerLimit->isProperAncestor(op);
    });
    backwardSlice.insert(ub.getDefiningOp());

    // Iterate over all ops in the slice and compose them in the constraints.
    for (Operation *op : llvm::reverse(backwardSlice)) {
      if (!isa<scf::ForOp, AffineApplyOp, AffineMinOp>(op))
        return failure();
      if (isa<scf::ForOp>(op))
        continue;
      // Ensure an id exists for `v` as a dimension in the constraint system;
      // report failure if `v` is already tracked but not as a dimension.
      auto ensureIdFailed = [&](Value v) {
        if (constraints.containsId(v)) {
          unsigned pos;
          constraints.findId(v, &pos);
          return pos >= constraints.getNumDimIds();
        }
        constraints.appendDimId(v);
        return false;
      };

      // Ensure all ids exist and add results for later projection.
      if (llvm::any_of(op->getResults(), ensureIdFailed) ||
          llvm::any_of(op->getOperands(), ensureIdFailed))
        return failure();

      // All supported ops have 1 result.
      // TODO: extend when needed.
      toProjectOut.insert(op->getResult(0));

      // Compose supported ops.
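      // For example (a hypothetical bound, not taken from this file): for
      // %ub = affine.min affine_map<(d0) -> (40, d0 * 8)>(%j), each result of
      // the min map is added as an upper bound on the id associated with %ub;
      // that id is later projected out, leaving constraints expressed only on
      // the surviving values such as %j.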
      if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
        AffineValueMap avm(affineApplyOp.getAffineMap(),
                           affineApplyOp.getOperands(),
                           affineApplyOp.getResult());
        if (failed(constraints.composeMap(&avm)))
          return failure();
        continue;
      }
      auto affineMinOp = cast<AffineMinOp>(op);
      unsigned pos;
      bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
      (void)foundMinOp;
      assert(foundMinOp);
      AffineMap alignedMap = constraints.computeAlignedMap(
          affineMinOp.getAffineMap(), affineMinOp.getOperands());
      if (failed(
              constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
        return failure();
    }
  }
  for (Value v : toProjectOut)
    constraints.projectOut(v);
  return success();
}

// Footprint of the packedTensor, computed from the packingLoops and
// `backwardSlice`.
FailureOr<SmallVector<Value>>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  // Create the base affine constraints for the packedLoops.
  auto constraints = FlatAffineValueConstraints::getHyperrectangular(
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.upperBound(); })));

  // Iteratively try to fold the upper bounds into the constraints set.
  if (failed(foldUpperBoundsIntoConstraintsSet(
          constraints, outermostEnclosingForOp, packingLoops.getArrayRef())))
    return failure();

  int nPackedLoops = packingLoops.size();
  SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
  // Compute the bounds of the first `nPackedLoops` positions, assuming the
  // others are fixed.
  constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
                             outermostEnclosingForOp->getContext(), &lbs, &ubs);

  SmallVector<Value> allValues;
  constraints.getAllValues(&allValues);
  SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
                                      allValues.end());

  // For each packingLoop, create the extent by (ub - lb).ceilDiv(step).
  // The insertion point sits just before the outermost enclosing loop that we
  // hoist above.
  assert(nPackedLoops == static_cast<int64_t>(lbs.size()) &&
         "expected matching lb sizes");
  assert(nPackedLoops == static_cast<int64_t>(ubs.size()) &&
         "expected matching ub sizes");
  SmallVector<Value> dynamicTensorSizes;
  for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
    scf::ForOp loop = std::get<0>(it);
    AffineMap lbMap = std::get<1>(it);
    AffineMap ubMap = std::get<2>(it);
    SmallVector<Value> lbOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&lbMap, &lbOperands);
    Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);

    SmallVector<Value> ubOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&ubMap, &ubOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);

    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});

    dynamicTensorSizes.push_back(res);
  }
  return dynamicTensorSizes;
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.step()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
        stepVal = forOp.step();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}
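
// A rough sketch, in pseudo-MLIR, of the rewrite performed below (illustrative
// only; the loop structure, names and types are assumptions, not output
// produced by this file). Hoisting a pad above its single enclosing loop turns
//
//   scf.for %i {
//     %slice = tensor.extract_slice %t[..%i..] ... : ... to tensor<?xf32>
//     %pad = linalg.pad_tensor %slice ... : tensor<?xf32> to tensor<4xf32>
//     "compute"(%pad)
//   }
//
// into a packing loop that performs all the padding ahead of time, followed by
// the original loop reading 1x4 slices back out of the packed tensor:
//
//   %packed = scf.for %i iter_args(%p = %init) -> tensor<?x4xf32> {
//     %slice = tensor.extract_slice %t[..%i..] ... : ... to tensor<?xf32>
//     %pad = linalg.pad_tensor %slice ... : tensor<?xf32> to tensor<4xf32>
//     %ins = tensor.insert_slice %pad into %p[%it, 0][1, 4][1, 1] ...
//     scf.yield %ins : tensor<?x4xf32>
//   }
//   scf.for %i {
//     %pad = tensor.extract_slice %packed[%it, 0][1, 4][1, 1] ...
//     "compute"(%pad)
//   }
//
// where %it is the iteration count of %i, i.e. (%i - lb).ceilDiv(step).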
FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
                                                     int numLoops,
                                                     PadTensorOp &hoistedOp) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
  if (failed(maybeDynamicTensorSizes))
    return failure();
  SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;

  // Update actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now PadTensorOp returns a static
  // tensor.
  llvm::append_range(packedShape, paddedTensorType.getShape());
  auto packedTensorType =
      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());
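  // For example (hypothetical shapes): with two packing loops and a pad
  // producing tensor<4x8xf32>, `packedTensorType` is tensor<?x?x4x8xf32>, with
  // the two leading dynamic extents supplied by `dynamicTensorSizes`.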

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create an InsertSliceOp at the top of the stack.
  //   3. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Insert `opToHoist` into the backwardSlice so we clone it too.
  analysis.backwardSlice.insert(opToHoist);
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(packedTensor) case: this is
    // the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
    bool hasNoEffects = !effects || effects.hasNoEffect();
    if (hasNoEffects &&
        (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
      b.clone(*op, bvm);
      continue;
    }
    // TODO: support more cases as they appear.
    auto forOp = dyn_cast<scf::ForOp>(op);
    assert(forOp && "Expected scf::ForOp when hoisting pad ops");
    // Unused loop, just skip it.
    if (!analysis.packingLoops.contains(forOp))
      continue;

    auto clonedForOp =
        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                             bvm.lookupOrDefault(forOp.upperBound()),
                             bvm.lookupOrDefault(forOp.step()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Stack step 2. create InsertSliceOp at the top of the stack.
  // offsets = [clonedLoopIvs, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : paddedTensorType.getShape()) {
    // TODO: go grab dims when necessary, for now PadTensorOp returns a static
    // tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  Value inserted =
      b.create<tensor::InsertSliceOp>(loc, bvm.lookup(opToHoist.result()),
                                      packedTensor, offsets, sizes, strides);

  // Stack step 3. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now that the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [originalLoopIvs, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides);

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp = cast<PadTensorOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}