1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "mlir/Dialect/SCF/Utils/Utils.h"
14 #include "mlir/Analysis/SliceAnalysis.h"
15 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
16 #include "mlir/Dialect/Func/IR/FuncOps.h"
17 #include "mlir/Dialect/SCF/IR/SCF.h"
18 #include "mlir/IR/BlockAndValueMapping.h"
19 #include "mlir/IR/BuiltinOps.h"
20 #include "mlir/IR/PatternMatch.h"
21 #include "mlir/Support/MathExtras.h"
22 #include "mlir/Transforms/RegionUtils.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SetVector.h"
25 #include "llvm/ADT/SmallPtrSet.h"
26 #include "llvm/ADT/SmallVector.h"
27
28 using namespace mlir;
29
namespace {
// This structure is to pass and return sets of loop parameters without
// confusing the order.
struct LoopParams {
  Value lowerBound; // Inclusive lower bound.
  Value upperBound; // Exclusive upper bound.
  Value step;       // Loop step; users of this struct assume it is positive.
};
} // namespace
39
/// Replace `loop` with a new scf.for that additionally carries
/// `newIterOperands` as iter_args. The extra per-iteration yielded values are
/// produced by `newYieldValuesFn`, which receives the new block arguments.
/// The original loop is turned into a no-op (its body is moved out and a
/// pass-through yield is installed); the caller is expected to erase it.
/// Returns the newly created loop.
scf::ForOp
mlir::replaceLoopWithNewYields(OpBuilder &builder, scf::ForOp loop,
                               ValueRange newIterOperands,
                               const NewYieldValueFn &newYieldValuesFn) {
  // Create a new loop before the existing one, with the extra operands.
  OpBuilder::InsertionGuard g(builder);
  builder.setInsertionPoint(loop);
  auto operands = llvm::to_vector(loop.getIterOperands());
  operands.append(newIterOperands.begin(), newIterOperands.end());
  // The empty body-builder callback creates the loop without a terminator;
  // the original body (including its scf.yield) is spliced in below.
  scf::ForOp newLoop = builder.create<scf::ForOp>(
      loop.getLoc(), loop.getLowerBound(), loop.getUpperBound(), loop.getStep(),
      operands, [](OpBuilder &, Location, Value, ValueRange) {});

  Block *loopBody = loop.getBody();
  Block *newLoopBody = newLoop.getBody();

  // Move the body of the original loop to the new loop.
  newLoopBody->getOperations().splice(newLoopBody->end(),
                                      loopBody->getOperations());

  // Generate the new yield values to use by using the callback and append the
  // yield values to the scf.yield operation.
  auto yield = cast<scf::YieldOp>(newLoopBody->getTerminator());
  ArrayRef<BlockArgument> newBBArgs =
      newLoopBody->getArguments().take_back(newIterOperands.size());
  {
    OpBuilder::InsertionGuard g(builder);
    builder.setInsertionPoint(yield);
    SmallVector<Value> newYieldedValues =
        newYieldValuesFn(builder, loop.getLoc(), newBBArgs);
    assert(newIterOperands.size() == newYieldedValues.size() &&
           "expected as many new yield values as new iter operands");
    yield.getResultsMutable().append(newYieldedValues);
  }

  // Remap the BlockArguments from the original loop to the new loop
  // BlockArguments (induction variable and pre-existing iter_args).
  ArrayRef<BlockArgument> bbArgs = loopBody->getArguments();
  for (auto it :
       llvm::zip(bbArgs, newLoopBody->getArguments().take_front(bbArgs.size())))
    std::get<0>(it).replaceAllUsesWith(std::get<1>(it));

  // Replace all uses of `newIterOperands` with the corresponding basic block
  // arguments, but only for uses nested inside the new loop; uses outside
  // (including the new loop's own iter operands) must keep the original
  // values.
  for (auto it : llvm::zip(newIterOperands, newBBArgs)) {
    std::get<0>(it).replaceUsesWithIf(std::get<1>(it), [&](OpOperand &use) {
      Operation *user = use.getOwner();
      return newLoop->isProperAncestor(user);
    });
  }

  // Replace all uses of the original loop with corresponding values from the
  // new loop.
  loop.replaceAllUsesWith(
      newLoop.getResults().take_front(loop.getNumResults()));

  // Add a fake yield to the original loop body that just returns the
  // BlockArguments corresponding to the iter_args. This makes it a no-op loop.
  // The loop is dead. The caller is expected to erase it.
  builder.setInsertionPointToEnd(loopBody);
  builder.create<scf::YieldOp>(loop->getLoc(), loop.getRegionIterArgs());

  return newLoop;
}
104
replaceLoopNestWithNewYields(OpBuilder & builder,ArrayRef<scf::ForOp> loopNest,ValueRange newIterOperands,NewYieldValueFn newYieldValueFn)105 SmallVector<scf::ForOp> mlir::replaceLoopNestWithNewYields(
106 OpBuilder &builder, ArrayRef<scf::ForOp> loopNest,
107 ValueRange newIterOperands, NewYieldValueFn newYieldValueFn) {
108 if (loopNest.empty())
109 return {};
110 SmallVector<scf::ForOp> newLoopNest(loopNest.size());
111
112 newLoopNest.back() = replaceLoopWithNewYields(
113 builder, loopNest.back(), newIterOperands, newYieldValueFn);
114
115 for (unsigned loopDepth :
116 llvm::reverse(llvm::seq<unsigned>(0, loopNest.size() - 1))) {
117 NewYieldValueFn fn = [&](OpBuilder &innerBuilder, Location loc,
118 ArrayRef<BlockArgument> innerNewBBArgs) {
119 SmallVector<Value> newYields(
120 newLoopNest[loopDepth + 1]->getResults().take_back(
121 newIterOperands.size()));
122 return newYields;
123 };
124 newLoopNest[loopDepth] = replaceLoopWithNewYields(
125 builder, loopNest[loopDepth], newIterOperands, fn);
126 }
127 return newLoopNest;
128 }
129
130 /// Outline a region with a single block into a new FuncOp.
131 /// Assumes the FuncOp result types is the type of the yielded operands of the
132 /// single block. This constraint makes it easy to determine the result.
133 /// This method also clones the `arith::ConstantIndexOp` at the start of
134 /// `outlinedFuncBody` to alloc simple canonicalizations. If `callOp` is
135 /// provided, it will be set to point to the operation that calls the outlined
136 /// function.
137 // TODO: support more than single-block regions.
138 // TODO: more flexible constant handling.
FailureOr<func::FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter,
                                                       Location loc,
                                                       Region &region,
                                                       StringRef funcName,
                                                       func::CallOp *callOp) {
  assert(!funcName.empty() && "funcName cannot be empty");
  // Only single-block regions are supported (see TODO above).
  if (!region.hasOneBlock())
    return failure();

  Block *originalBlock = &region.front();
  Operation *originalTerminator = originalBlock->getTerminator();

  // Outline before current function.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(region.getParentOfType<func::FuncOp>());

  // Values defined above the region become extra trailing arguments of the
  // outlined function.
  SetVector<Value> captures;
  getUsedValuesDefinedAbove(region, captures);

  ValueRange outlinedValues(captures.getArrayRef());
  SmallVector<Type> outlinedFuncArgTypes;
  SmallVector<Location> outlinedFuncArgLocs;
  // Region's arguments are exactly the first block's arguments as per
  // Region::getArguments().
  // Func's arguments are cat(regions's arguments, captures arguments).
  for (BlockArgument arg : region.getArguments()) {
    outlinedFuncArgTypes.push_back(arg.getType());
    outlinedFuncArgLocs.push_back(arg.getLoc());
  }
  for (Value value : outlinedValues) {
    outlinedFuncArgTypes.push_back(value.getType());
    outlinedFuncArgLocs.push_back(value.getLoc());
  }
  // The outlined function returns whatever the block's terminator yields, as
  // documented above.
  FunctionType outlinedFuncType =
      FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes,
                        originalTerminator->getOperandTypes());
  auto outlinedFunc =
      rewriter.create<func::FuncOp>(loc, funcName, outlinedFuncType);
  Block *outlinedFuncBody = outlinedFunc.addEntryBlock();

  // Merge blocks while replacing the original block operands.
  // Warning: `mergeBlocks` erases the original block, reconstruct it later.
  int64_t numOriginalBlockArguments = originalBlock->getNumArguments();
  auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.mergeBlocks(
        originalBlock, outlinedFuncBody,
        outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
    // Explicitly set up a new ReturnOp terminator.
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(),
                                    originalTerminator->getOperands());
  }

  // Reconstruct the block that was deleted and add a
  // terminator(call_results).
  Block *newBlock = rewriter.createBlock(
      &region, region.begin(),
      TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
      ArrayRef<Location>(outlinedFuncArgLocs)
          .take_front(numOriginalBlockArguments));
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(newBlock);
    // Call operands: the reconstructed block arguments followed by the
    // captured values, matching the outlined function's signature.
    SmallVector<Value> callValues;
    llvm::append_range(callValues, newBlock->getArguments());
    llvm::append_range(callValues, outlinedValues);
    auto call = rewriter.create<func::CallOp>(loc, outlinedFunc, callValues);
    if (callOp)
      *callOp = call;

    // `originalTerminator` was moved to `outlinedFuncBody` and is still valid.
    // Clone `originalTerminator` to take the callOp results then erase it from
    // `outlinedFuncBody`.
    BlockAndValueMapping bvm;
    bvm.map(originalTerminator->getOperands(), call->getResults());
    rewriter.clone(*originalTerminator, bvm);
    rewriter.eraseOp(originalTerminator);
  }

  // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`.
  // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`.
  for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
                                               outlinedValues.size()))) {
    Value orig = std::get<0>(it);
    Value repl = std::get<1>(it);
    {
      OpBuilder::InsertionGuard g(rewriter);
      rewriter.setInsertionPointToStart(outlinedFuncBody);
      // Captured index constants are re-materialized inside the outlined
      // function (instead of flowing in through an argument) to allow simple
      // canonicalizations.
      if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) {
        BlockAndValueMapping bvm;
        repl = rewriter.clone(*cst, bvm)->getResult(0);
      }
    }
    orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) {
      return outlinedFunc->isProperAncestor(opOperand.getOwner());
    });
  }

  return outlinedFunc;
}
242
outlineIfOp(RewriterBase & b,scf::IfOp ifOp,func::FuncOp * thenFn,StringRef thenFnName,func::FuncOp * elseFn,StringRef elseFnName)243 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp,
244 func::FuncOp *thenFn, StringRef thenFnName,
245 func::FuncOp *elseFn, StringRef elseFnName) {
246 IRRewriter rewriter(b);
247 Location loc = ifOp.getLoc();
248 FailureOr<func::FuncOp> outlinedFuncOpOrFailure;
249 if (thenFn && !ifOp.getThenRegion().empty()) {
250 outlinedFuncOpOrFailure = outlineSingleBlockRegion(
251 rewriter, loc, ifOp.getThenRegion(), thenFnName);
252 if (failed(outlinedFuncOpOrFailure))
253 return failure();
254 *thenFn = *outlinedFuncOpOrFailure;
255 }
256 if (elseFn && !ifOp.getElseRegion().empty()) {
257 outlinedFuncOpOrFailure = outlineSingleBlockRegion(
258 rewriter, loc, ifOp.getElseRegion(), elseFnName);
259 if (failed(outlinedFuncOpOrFailure))
260 return failure();
261 *elseFn = *outlinedFuncOpOrFailure;
262 }
263 return success();
264 }
265
getInnermostParallelLoops(Operation * rootOp,SmallVectorImpl<scf::ParallelOp> & result)266 bool mlir::getInnermostParallelLoops(Operation *rootOp,
267 SmallVectorImpl<scf::ParallelOp> &result) {
268 assert(rootOp != nullptr && "Root operation must not be a nullptr.");
269 bool rootEnclosesPloops = false;
270 for (Region ®ion : rootOp->getRegions()) {
271 for (Block &block : region.getBlocks()) {
272 for (Operation &op : block) {
273 bool enclosesPloops = getInnermostParallelLoops(&op, result);
274 rootEnclosesPloops |= enclosesPloops;
275 if (auto ploop = dyn_cast<scf::ParallelOp>(op)) {
276 rootEnclosesPloops = true;
277
278 // Collect parallel loop if it is an innermost one.
279 if (!enclosesPloops)
280 result.push_back(ploop);
281 }
282 }
283 }
284 }
285 return rootEnclosesPloops;
286 }
287
288 // Build the IR that performs ceil division of a positive value by a constant:
289 // ceildiv(a, B) = divis(a + (B-1), B)
290 // where divis is rounding-to-zero division.
ceilDivPositive(OpBuilder & builder,Location loc,Value dividend,int64_t divisor)291 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
292 int64_t divisor) {
293 assert(divisor > 0 && "expected positive divisor");
294 assert(dividend.getType().isIndex() && "expected index-typed value");
295
296 Value divisorMinusOneCst =
297 builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
298 Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
299 Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
300 return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
301 }
302
303 // Build the IR that performs ceil division of a positive value by another
304 // positive value:
305 // ceildiv(a, b) = divis(a + (b - 1), b)
306 // where divis is rounding-to-zero division.
ceilDivPositive(OpBuilder & builder,Location loc,Value dividend,Value divisor)307 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
308 Value divisor) {
309 assert(dividend.getType().isIndex() && "expected index-typed value");
310
311 Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
312 Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
313 Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
314 return builder.create<arith::DivSIOp>(loc, sum, divisor);
315 }
316
317 /// Helper to replace uses of loop carried values (iter_args) and loop
318 /// yield values while promoting single iteration scf.for ops.
replaceIterArgsAndYieldResults(scf::ForOp forOp)319 static void replaceIterArgsAndYieldResults(scf::ForOp forOp) {
320 // Replace uses of iter arguments with iter operands (initial values).
321 auto iterOperands = forOp.getIterOperands();
322 auto iterArgs = forOp.getRegionIterArgs();
323 for (auto e : llvm::zip(iterOperands, iterArgs))
324 std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
325
326 // Replace uses of loop results with the values yielded by the loop.
327 auto outerResults = forOp.getResults();
328 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
329 for (auto e : llvm::zip(outerResults, innerResults))
330 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
331 }
332
333 /// Promotes the loop body of a forOp to its containing block if the forOp
334 /// it can be determined that the loop has a single iteration.
promoteIfSingleIteration(scf::ForOp forOp)335 LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
336 auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
337 auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
338 auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
339 if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
340 ubCstOp.value() < 0 || stepCstOp.value() < 0)
341 return failure();
342 int64_t tripCount =
343 mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
344 if (tripCount != 1)
345 return failure();
346 auto iv = forOp.getInductionVar();
347 iv.replaceAllUsesWith(lbCstOp);
348
349 replaceIterArgsAndYieldResults(forOp);
350
351 // Move the loop body operations, except for its terminator, to the loop's
352 // containing block.
353 auto *parentBlock = forOp->getBlock();
354 forOp.getBody()->getTerminator()->erase();
355 parentBlock->getOperations().splice(Block::iterator(forOp),
356 forOp.getBody()->getOperations());
357 forOp.erase();
358 return success();
359 }
360
361 /// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with
362 /// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
363 /// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
364 /// unrolled iteration using annotateFn.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body of
  // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  // Default to a no-op annotation callback.
  if (!annotateFn)
    annotateFn = [](unsigned, Operation *, OpBuilder) {};

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
  // `lastYielded` tracks the values the previous (most recently emitted) copy
  // yields; they feed the iter_args of the next copy.
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    BlockAndValueMapping operandMap;

    // Prepare operand map: this copy's iter_args are the previous copy's
    // yielded values.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp'. The [begin, srcBlockEnd] range is
    // fixed up front, so ops appended by this very loop are not re-cloned.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update yielded values: look up what this copy produced for each
    // originally yielded value.
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
  }

  // Make sure we annotate the Ops in the original body. We do this last so that
  // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
416
417 /// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
LogicalResult mlir::loopUnrollByFactor(
    scf::ForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
  assert(unrollFactor > 0 && "expected positive unroll factor");

  // Return if the loop body is empty.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
  OpBuilder boundsBuilder(forOp);
  auto loc = forOp.getLoc();
  auto step = forOp.getStep();
  Value upperBoundUnrolled;
  Value stepUnrolled;
  bool generateEpilogueLoop = true;

  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
  if (lbCstOp && ubCstOp && stepCstOp) {
    // Constant loop bounds computation.
    int64_t lbCst = lbCstOp.value();
    int64_t ubCst = ubCstOp.value();
    int64_t stepCst = stepCstOp.value();
    // NOTE(review): the assert message says "positive" but the condition only
    // enforces non-negativity; a zero step would slip through to the ceilDiv
    // below — confirm callers guarantee step > 0.
    assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
           "expected positive loop bounds and step");
    int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);

    // An unroll factor of 1 is a no-op unless the loop can be promoted.
    if (unrollFactor == 1) {
      if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
        return failure();
      return success();
    }

    // The unrolled loop covers the largest multiple of `unrollFactor`
    // iterations; the remainder goes to the epilogue loop.
    int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
    assert(upperBoundUnrolledCst <= ubCst);
    int64_t stepUnrolledCst = stepCst * unrollFactor;

    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
    if (generateEpilogueLoop)
      upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
          loc, upperBoundUnrolledCst);
    else
      upperBoundUnrolled = ubCstOp;

    // Create constant for 'stepUnrolled'.
    stepUnrolled = stepCst == stepUnrolledCst
                       ? step
                       : boundsBuilder.create<arith::ConstantIndexOp>(
                             loc, stepUnrolledCst);
  } else {
    // Dynamic loop bounds computation.
    // TODO: Add dynamic asserts for negative lb/ub/step, or
    // consider using ceilDiv from AffineApplyExpander.
    auto lowerBound = forOp.getLowerBound();
    auto upperBound = forOp.getUpperBound();
    Value diff =
        boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
    Value unrollFactorCst =
        boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
    Value tripCountRem =
        boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
    Value tripCountEvenMultiple =
        boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
    upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
        loc, lowerBound,
        boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
    // Scale 'step' by 'unrollFactor'.
    stepUnrolled =
        boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
  }

  // Create epilogue clean up loop starting at 'upperBoundUnrolled'.
  if (generateEpilogueLoop) {
    OpBuilder epilogueBuilder(forOp->getContext());
    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
                                      std::next(Block::iterator(forOp)));
    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
    epilogueForOp.setLowerBound(upperBoundUnrolled);

    // Update uses of loop results: the epilogue loop now produces the final
    // values.
    auto results = forOp.getResults();
    auto epilogueResults = epilogueForOp.getResults();

    for (auto e : llvm::zip(results, epilogueResults)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    }
    // Chain the loops: the unrolled loop's results become the epilogue's
    // iter operands (the operands after the control operands).
    epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(),
                               epilogueForOp.getNumIterOperands(), results);
    (void)promoteIfSingleIteration(epilogueForOp);
  }

  // Create unrolled loop.
  forOp.setUpperBound(upperBoundUnrolled);
  forOp.setStep(stepUnrolled);

  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + step * i;
        auto stride = b.create<arith::MulIOp>(
            loc, step, b.create<arith::ConstantIndexOp>(loc, i));
        return b.create<arith::AddIOp>(loc, iv, stride);
      },
      annotateFn, iterArgs, yieldedValues);
  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}
537
538 /// Return the new lower bound, upper bound, and step in that order. Insert any
539 /// additional bounds calculations before the given builder and any additional
540 /// conversion back to the original loop induction value inside the given Block.
normalizeLoop(OpBuilder & boundsBuilder,OpBuilder & insideLoopBuilder,Location loc,Value lowerBound,Value upperBound,Value step,Value inductionVar)541 static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
542 OpBuilder &insideLoopBuilder, Location loc,
543 Value lowerBound, Value upperBound, Value step,
544 Value inductionVar) {
545 // Check if the loop is already known to have a constant zero lower bound or
546 // a constant one step.
547 bool isZeroBased = false;
548 if (auto ubCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
549 isZeroBased = ubCst.value() == 0;
550
551 bool isStepOne = false;
552 if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
553 isStepOne = stepCst.value() == 1;
554
555 // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
556 // assuming the step is strictly positive. Update the bounds and the step
557 // of the loop to go from 0 to the number of iterations, if necessary.
558 // TODO: introduce support for negative steps or emit dynamic asserts
559 // on step positivity, whatever gets implemented first.
560 if (isZeroBased && isStepOne)
561 return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
562 /*step=*/step};
563
564 Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
565 Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
566
567 Value newLowerBound =
568 isZeroBased ? lowerBound
569 : boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
570 Value newStep =
571 isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
572
573 // Insert code computing the value of the original loop induction variable
574 // from the "normalized" one.
575 Value scaled =
576 isStepOne
577 ? inductionVar
578 : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
579 Value shifted =
580 isZeroBased
581 ? scaled
582 : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
583
584 SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
585 shifted.getDefiningOp()};
586 inductionVar.replaceAllUsesExcept(shifted, preserve);
587 return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
588 /*step=*/newStep};
589 }
590
591 /// Transform a loop with a strictly positive step
592 /// for %i = %lb to %ub step %s
593 /// into a 0-based loop with step 1
594 /// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
595 /// %i = %ii * %s + %lb
596 /// Insert the induction variable remapping in the body of `inner`, which is
597 /// expected to be either `loop` or another loop perfectly nested under `loop`.
598 /// Insert the definition of new bounds immediate before `outer`, which is
599 /// expected to be either `loop` or its parent in the loop nest.
normalizeLoop(scf::ForOp loop,scf::ForOp outer,scf::ForOp inner)600 static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
601 OpBuilder builder(outer);
602 OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
603 auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
604 loop.getLowerBound(), loop.getUpperBound(),
605 loop.getStep(), loop.getInductionVar());
606
607 loop.setLowerBound(loopPieces.lowerBound);
608 loop.setUpperBound(loopPieces.upperBound);
609 loop.setStep(loopPieces.step);
610 }
611
/// Collapse the perfectly nested loop nest `loops` (outermost first) into the
/// single outermost loop, whose trip count becomes the product of all trip
/// counts. No-op for nests of fewer than two loops.
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
  if (loops.size() < 2)
    return;

  scf::ForOp innermost = loops.back();
  scf::ForOp outermost = loops.front();

  // 1. Make sure all loops iterate from 0 to upperBound with step 1. This
  // allows the following code to assume upperBound is the number of iterations.
  for (auto loop : loops)
    normalizeLoop(loop, outermost, innermost);

  // 2. Emit code computing the upper bound of the coalesced loop as product
  // of the number of iterations of all loops.
  OpBuilder builder(outermost);
  Location loc = outermost.getLoc();
  Value upperBound = outermost.getUpperBound();
  for (auto loop : loops.drop_front())
    upperBound =
        builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
  outermost.setUpperBound(upperBound);

  builder.setInsertionPointToStart(outermost.getBody());

  // 3. Remap induction variables. For each original loop, the value of the
  // induction variable can be obtained by dividing the induction variable of
  // the linearized loop by the total number of iterations of the loops nested
  // in it modulo the number of iterations in this loop (remove the values
  // related to the outer loops):
  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  // Compute these iteratively from the innermost loop by creating a "running
  // quotient" of division by the range.
  Value previous = outermost.getInductionVar();
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    // Walk loops innermost (idx == size-1) to outermost (idx == 0).
    unsigned idx = loops.size() - i - 1;
    if (i != 0)
      previous = builder.create<arith::DivSIOp>(loc, previous,
                                                loops[idx + 1].getUpperBound());

    // Outermost loop (last step) keeps the full quotient; inner loops take
    // the remainder modulo their own range.
    Value iv = (i == e - 1) ? previous
                            : builder.create<arith::RemSIOp>(
                                  loc, previous, loops[idx].getUpperBound());
    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
                               loops.back().getRegion());
  }

  // 4. Move the operations from the innermost just above the second-outermost
  // loop, delete the extra terminator and the second-outermost loop.
  scf::ForOp second = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(second.getOperation()),
      innermost.getBody()->getOperations());
  second.erase();
}
667
/// Collapse the dimensions of `loops` according to `combinedDimensions`: each
/// entry lists the original dimensions that become a single dimension of the
/// new scf.parallel op. The original loop is replaced and erased.
void mlir::collapseParallelLoops(
    scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
  OpBuilder outsideBuilder(loops);
  Location loc = loops.getLoc();

  // Presort combined dimensions.
  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
  for (auto &dims : sortedDimensions)
    llvm::sort(dims);

  // Normalize ParallelOp's iteration pattern so every dimension runs from 0
  // with step 1; this lets upper bounds below act as iteration counts.
  SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
      normalizedUpperBounds;
  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
    OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
    auto resultBounds =
        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
                      loops.getLowerBound()[i], loops.getUpperBound()[i],
                      loops.getStep()[i], loops.getBody()->getArgument(i));

    normalizedLowerBounds.push_back(resultBounds.lowerBound);
    normalizedUpperBounds.push_back(resultBounds.upperBound);
    normalizedSteps.push_back(resultBounds.step);
  }

  // Combine iteration spaces: each new dimension's extent is the product of
  // the extents of the dimensions it merges.
  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
  auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
  auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
  for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
    Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
    for (auto idx : sortedDimensions[i]) {
      newUpperBound = outsideBuilder.create<arith::MulIOp>(
          loc, newUpperBound, normalizedUpperBounds[idx]);
    }
    lowerBounds.push_back(cst0);
    steps.push_back(cst1);
    upperBounds.push_back(newUpperBound);
  }

  // Create new ParallelLoop with conversions to the original induction values.
  // The loop below uses divisions to get the relevant range of values in the
  // new induction value that represent each range of the original induction
  // value. The remainders then determine based on that range, which iteration
  // of the original induction value this represents. This is a normalized value
  // that is un-normalized already by the previous logic.
  auto newPloop = outsideBuilder.create<scf::ParallelOp>(
      loc, lowerBounds, upperBounds, steps,
      [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
        for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
          Value previous = ploopIVs[i];
          unsigned numberCombinedDimensions = combinedDimensions[i].size();
          // Iterate over all except the last induction value.
          for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
            unsigned idx = combinedDimensions[i][j];

            // Determine the current induction value's current loop iteration
            Value iv = insideBuilder.create<arith::RemSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
            replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
                                       loops.getRegion());

            // Remove the effect of the current induction value to prepare for
            // the next value.
            previous = insideBuilder.create<arith::DivSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
          }

          // The final induction value is just the remaining value.
          unsigned idx = combinedDimensions[i][0];
          replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
                                     previous, loops.getRegion());
        }
      });

  // Replace the old loop with the new loop: drop the old terminator, splice
  // the old body just before the new body's terminator, then erase the old op.
  loops.getBody()->back().erase();
  newPloop.getBody()->getOperations().splice(
      Block::iterator(newPloop.getBody()->back()),
      loops.getBody()->getOperations());
  loops.erase();
}
750
751 // Hoist the ops within `outer` that appear before `inner`.
752 // Such ops include the ops that have been introduced by parametric tiling.
753 // Ops that come from triangular loops (i.e. that belong to the program slice
754 // rooted at `outer`) and ops that have side effects cannot be hoisted.
755 // Return failure when any op fails to hoist.
hoistOpsBetween(scf::ForOp outer,scf::ForOp inner)756 static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
757 SetVector<Operation *> forwardSlice;
758 getForwardSlice(
759 outer.getInductionVar(), &forwardSlice,
760 [&inner](Operation *op) { return op != inner.getOperation(); });
761 LogicalResult status = success();
762 SmallVector<Operation *, 8> toHoist;
763 for (auto &op : outer.getBody()->without_terminator()) {
764 // Stop when encountering the inner loop.
765 if (&op == inner.getOperation())
766 break;
767 // Skip over non-hoistable ops.
768 if (forwardSlice.count(&op) > 0) {
769 status = failure();
770 continue;
771 }
772 // Skip intermediate scf::ForOp, these are not considered a failure.
773 if (isa<scf::ForOp>(op))
774 continue;
775 // Skip other ops with regions.
776 if (op.getNumRegions() > 0) {
777 status = failure();
778 continue;
779 }
780 // Skip if op has side effects.
781 // TODO: loads to immutable memory regions are ok.
782 if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
783 status = failure();
784 continue;
785 }
786 toHoist.push_back(&op);
787 }
788 auto *outerForOp = outer.getOperation();
789 for (auto *op : toHoist)
790 op->moveBefore(outerForOp);
791 return status;
792 }
793
794 // Traverse the interTile and intraTile loops and try to hoist ops such that
795 // bands of perfectly nested loops are isolated.
796 // Return failure if either perfect interTile or perfect intraTile bands cannot
797 // be formed.
tryIsolateBands(const TileLoops & tileLoops)798 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
799 LogicalResult status = success();
800 const Loops &interTile = tileLoops.first;
801 const Loops &intraTile = tileLoops.second;
802 auto size = interTile.size();
803 assert(size == intraTile.size());
804 if (size <= 1)
805 return success();
806 for (unsigned s = 1; s < size; ++s)
807 status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
808 : failure();
809 for (unsigned s = 1; s < size; ++s)
810 status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
811 : failure();
812 return status;
813 }
814
815 /// Collect perfectly nested loops starting from `rootForOps`. Loops are
816 /// perfectly nested if each loop is the first and only non-terminator operation
817 /// in the parent loop. Collect at most `maxLoops` loops and append them to
818 /// `forOps`.
819 template <typename T>
getPerfectlyNestedLoopsImpl(SmallVectorImpl<T> & forOps,T rootForOp,unsigned maxLoops=std::numeric_limits<unsigned>::max ())820 static void getPerfectlyNestedLoopsImpl(
821 SmallVectorImpl<T> &forOps, T rootForOp,
822 unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
823 for (unsigned i = 0; i < maxLoops; ++i) {
824 forOps.push_back(rootForOp);
825 Block &body = rootForOp.getRegion().front();
826 if (body.begin() != std::prev(body.end(), 2))
827 return;
828
829 rootForOp = dyn_cast<T>(&body.front());
830 if (!rootForOp)
831 return;
832 }
833 }
834
/// Strip-mines `forOp` by `factor`: the step of `forOp` is multiplied by
/// `factor`, and inside each loop in `targets` a new inner loop is created
/// that iterates over one chunk of the original iteration space with the
/// original step. The body of each target is moved into its new inner loop.
/// Returns the newly created inner loops, one per target.
static Loops stripmineSink(scf::ForOp forOp, Value factor,
                           ArrayRef<scf::ForOp> targets) {
  auto originalStep = forOp.getStep();
  auto iv = forOp.getInductionVar();

  // The outer loop now advances by factor * originalStep per iteration.
  OpBuilder b(forOp);
  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));

  Loops innerLoops;
  for (auto t : targets) {
    // Save information for splicing ops out of t when done
    auto begin = t.getBody()->begin();
    auto nOps = t.getBody()->getOperations().size();

    // Insert newForOp before the terminator of `t`.
    auto b = OpBuilder::atBlockTerminator((t.getBody()));
    // Upper bound of the inner loop is min(iv + newStep, upperBound):
    // select the outer upper bound when it is smaller than the stepped value,
    // so the last chunk does not overrun the original iteration space.
    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
    Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
                                         forOp.getUpperBound(), stepped);
    Value ub = b.create<arith::SelectOp>(t.getLoc(), less,
                                         forOp.getUpperBound(), stepped);

    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
    // The last op (the terminator of `t`) stays in place.
    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
    // Within the moved body, the outer induction variable is replaced by the
    // inner loop's induction variable.
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());

    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}
870
871 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
872 // Returns the new for operation, nested immediately under `target`.
873 template <typename SizeType>
stripmineSink(scf::ForOp forOp,SizeType factor,scf::ForOp target)874 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
875 scf::ForOp target) {
876 // TODO: Use cheap structural assertions that targets are nested under
877 // forOp and that targets are not nested under each other when DominanceInfo
878 // exposes the capability. It seems overkill to construct a whole function
879 // dominance tree at this point.
880 auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
881 assert(res.size() == 1 && "Expected 1 inner forOp");
882 return res[0];
883 }
884
tile(ArrayRef<scf::ForOp> forOps,ArrayRef<Value> sizes,ArrayRef<scf::ForOp> targets)885 SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
886 ArrayRef<Value> sizes,
887 ArrayRef<scf::ForOp> targets) {
888 SmallVector<SmallVector<scf::ForOp, 8>, 8> res;
889 SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
890 for (auto it : llvm::zip(forOps, sizes)) {
891 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
892 res.push_back(step);
893 currentTargets = step;
894 }
895 return res;
896 }
897
tile(ArrayRef<scf::ForOp> forOps,ArrayRef<Value> sizes,scf::ForOp target)898 Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
899 scf::ForOp target) {
900 SmallVector<scf::ForOp, 8> res;
901 for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
902 assert(loops.size() == 1);
903 res.push_back(loops[0]);
904 }
905 return res;
906 }
907
tilePerfectlyNested(scf::ForOp rootForOp,ArrayRef<Value> sizes)908 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
909 // Collect perfectly nested loops. If more size values provided than nested
910 // loops available, truncate `sizes`.
911 SmallVector<scf::ForOp, 4> forOps;
912 forOps.reserve(sizes.size());
913 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
914 if (forOps.size() < sizes.size())
915 sizes = sizes.take_front(forOps.size());
916
917 return ::tile(forOps, sizes, forOps.back());
918 }
919
/// Collects the perfect loop nest rooted at `root` (including `root` itself)
/// into `nestedLoops`, delegating to getPerfectlyNestedLoopsImpl with no
/// depth limit.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
                                   scf::ForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
924
extractFixedOuterLoops(scf::ForOp rootForOp,ArrayRef<int64_t> sizes)925 TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
926 ArrayRef<int64_t> sizes) {
927 // Collect perfectly nested loops. If more size values provided than nested
928 // loops available, truncate `sizes`.
929 SmallVector<scf::ForOp, 4> forOps;
930 forOps.reserve(sizes.size());
931 getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
932 if (forOps.size() < sizes.size())
933 sizes = sizes.take_front(forOps.size());
934
935 // Compute the tile sizes such that i-th outer loop executes size[i]
936 // iterations. Given that the loop current executes
937 // numIterations = ceildiv((upperBound - lowerBound), step)
938 // iterations, we need to tile with size ceildiv(numIterations, size[i]).
939 SmallVector<Value, 4> tileSizes;
940 tileSizes.reserve(sizes.size());
941 for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
942 assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
943
944 auto forOp = forOps[i];
945 OpBuilder builder(forOp);
946 auto loc = forOp.getLoc();
947 Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
948 forOp.getLowerBound());
949 Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
950 Value iterationsPerBlock =
951 ceilDivPositive(builder, loc, numIterations, sizes[i]);
952 tileSizes.push_back(iterationsPerBlock);
953 }
954
955 // Call parametric tiling with the given sizes.
956 auto intraTile = tile(forOps, tileSizes, forOps.back());
957 TileLoops tileLoops = std::make_pair(forOps, intraTile);
958
959 // TODO: for now we just ignore the result of band isolation.
960 // In the future, mapping decisions may be impacted by the ability to
961 // isolate perfectly nested bands.
962 (void)tryIsolateBands(tileLoops);
963
964 return tileLoops;
965 }
966