1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/SCF/Utils/Utils.h"
14 #include "mlir/Analysis/SliceAnalysis.h"
15 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
16 #include "mlir/Dialect/Func/IR/FuncOps.h"
17 #include "mlir/Dialect/SCF/SCF.h"
18 #include "mlir/IR/BlockAndValueMapping.h"
19 #include "mlir/IR/BuiltinOps.h"
20 #include "mlir/IR/PatternMatch.h"
21 #include "mlir/Support/MathExtras.h"
22 #include "mlir/Transforms/RegionUtils.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SetVector.h"
25 #include "llvm/ADT/SmallPtrSet.h"
26 
27 using namespace mlir;
28 
namespace {
// This structure is to pass and return sets of loop parameters without
// confusing the order.
struct LoopParams {
  Value lowerBound; // Inclusive lower bound of the iteration space.
  Value upperBound; // Exclusive upper bound of the iteration space.
  Value step;       // Loop step; normalization code assumes it is positive.
};
} // namespace
38 
/// Clone `loop` with `newIterOperands` appended to its iter operands and
/// `newYieldedValues` appended to its yielded values, producing a loop with
/// `newYieldedValues.size()` extra results. The new loop is created just
/// before `loop`. If `replaceLoopResults` is true, all uses of `loop`'s
/// results are redirected to the leading results of the new loop. The
/// original loop is left in place; erasing it is the caller's responsibility.
scf::ForOp mlir::cloneWithNewYields(OpBuilder &b, scf::ForOp loop,
                                    ValueRange newIterOperands,
                                    ValueRange newYieldedValues,
                                    bool replaceLoopResults) {
  assert(newIterOperands.size() == newYieldedValues.size() &&
         "newIterOperands must be of the same size as newYieldedValues");

  // Create a new loop before the existing one, with the extra operands.
  OpBuilder::InsertionGuard g(b);
  b.setInsertionPoint(loop);
  auto operands = llvm::to_vector<4>(loop.getIterOperands());
  operands.append(newIterOperands.begin(), newIterOperands.end());
  scf::ForOp newLoop =
      b.create<scf::ForOp>(loop.getLoc(), loop.getLowerBound(),
                           loop.getUpperBound(), loop.getStep(), operands);

  auto &loopBody = *loop.getBody();
  auto &newLoopBody = *newLoop.getBody();
  // Clone / erase the yield inside the original loop to both:
  //   1. augment its operands with the newYieldedValues.
  //   2. automatically apply the BlockAndValueMapping on its operand
  auto yield = cast<scf::YieldOp>(loopBody.getTerminator());
  b.setInsertionPoint(yield);
  auto yieldOperands = llvm::to_vector<4>(yield.getOperands());
  yieldOperands.append(newYieldedValues.begin(), newYieldedValues.end());
  auto newYield = b.create<scf::YieldOp>(yield.getLoc(), yieldOperands);

  // Clone the loop body with remaps.
  BlockAndValueMapping bvm;
  // a. remap the induction variable.
  bvm.map(loop.getInductionVar(), newLoop.getInductionVar());
  // b. remap the BB args.
  bvm.map(loopBody.getArguments(),
          newLoopBody.getArguments().take_front(loopBody.getNumArguments()));
  // c. remap the iter args.
  bvm.map(newIterOperands,
          newLoop.getRegionIterArgs().take_back(newIterOperands.size()));
  b.setInsertionPointToStart(&newLoopBody);
  // Skip the original yield terminator which does not have enough operands.
  // Note: `newYield` created above is a regular (non-terminator) op in
  // `loopBody` at this point, so it is cloned here and becomes the
  // terminator of the new loop body.
  for (auto &o : loopBody.without_terminator())
    b.clone(o, bvm);

  // Replace `loop`'s results if requested.
  if (replaceLoopResults) {
    for (auto it : llvm::zip(loop.getResults(), newLoop.getResults().take_front(
                                                    loop.getNumResults())))
      std::get<0>(it).replaceAllUsesWith(std::get<1>(it));
  }

  // The temporary `newYield` is only needed as a cloning source; remove it
  // from the original loop now that it has been cloned.
  // TODO: this is unsafe in the context of a PatternRewrite.
  newYield.erase();

  return newLoop;
}
93 
94 /// Outline a region with a single block into a new FuncOp.
/// Assumes the FuncOp result types are the types of the yielded operands of
/// the single block. This constraint makes it easy to determine the result.
/// This method also clones the `arith::ConstantIndexOp` at the start of
/// `outlinedFuncBody` to allow simple canonicalizations.
99 // TODO: support more than single-block regions.
100 // TODO: more flexible constant handling.
FailureOr<FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter,
                                                 Location loc, Region &region,
                                                 StringRef funcName) {
  assert(!funcName.empty() && "funcName cannot be empty");
  // Only single-block regions are supported for now.
  if (!region.hasOneBlock())
    return failure();

  Block *originalBlock = &region.front();
  Operation *originalTerminator = originalBlock->getTerminator();

  // Outline before current function.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(region.getParentOfType<FuncOp>());

  // Values defined above the region must be passed in as extra arguments.
  SetVector<Value> captures;
  getUsedValuesDefinedAbove(region, captures);

  ValueRange outlinedValues(captures.getArrayRef());
  SmallVector<Type> outlinedFuncArgTypes;
  SmallVector<Location> outlinedFuncArgLocs;
  // Region's arguments are exactly the first block's arguments as per
  // Region::getArguments().
  // Func's arguments are cat(regions's arguments, captures arguments).
  for (BlockArgument arg : region.getArguments()) {
    outlinedFuncArgTypes.push_back(arg.getType());
    outlinedFuncArgLocs.push_back(arg.getLoc());
  }
  for (Value value : outlinedValues) {
    outlinedFuncArgTypes.push_back(value.getType());
    outlinedFuncArgLocs.push_back(value.getLoc());
  }
  // The function returns whatever the region's terminator yields.
  FunctionType outlinedFuncType =
      FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes,
                        originalTerminator->getOperandTypes());
  auto outlinedFunc = rewriter.create<FuncOp>(loc, funcName, outlinedFuncType);
  Block *outlinedFuncBody = outlinedFunc.addEntryBlock();

  // Merge blocks while replacing the original block operands.
  // Warning: `mergeBlocks` erases the original block, reconstruct it later.
  int64_t numOriginalBlockArguments = originalBlock->getNumArguments();
  auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.mergeBlocks(
        originalBlock, outlinedFuncBody,
        outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
    // Explicitly set up a new ReturnOp terminator.
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(),
                                    originalTerminator->getOperands());
  }

  // Reconstruct the block that was deleted and add a
  // terminator(call_results).
  Block *newBlock = rewriter.createBlock(
      &region, region.begin(),
      TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
      ArrayRef<Location>(outlinedFuncArgLocs)
          .take_front(numOriginalBlockArguments));
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(newBlock);
    // The call receives the reconstructed block arguments followed by the
    // captured values, mirroring the outlined function's signature.
    SmallVector<Value> callValues;
    llvm::append_range(callValues, newBlock->getArguments());
    llvm::append_range(callValues, outlinedValues);
    Operation *call =
        rewriter.create<func::CallOp>(loc, outlinedFunc, callValues);

    // `originalTerminator` was moved to `outlinedFuncBody` and is still valid.
    // Clone `originalTerminator` to take the callOp results then erase it from
    // `outlinedFuncBody`.
    BlockAndValueMapping bvm;
    bvm.map(originalTerminator->getOperands(), call->getResults());
    rewriter.clone(*originalTerminator, bvm);
    rewriter.eraseOp(originalTerminator);
  }

  // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`.
  // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`.
  for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
                                               outlinedValues.size()))) {
    Value orig = std::get<0>(it);
    Value repl = std::get<1>(it);
    {
      OpBuilder::InsertionGuard g(rewriter);
      rewriter.setInsertionPointToStart(outlinedFuncBody);
      // Constants are rematerialized inside the function instead of being
      // passed through the corresponding block argument.
      if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) {
        BlockAndValueMapping bvm;
        repl = rewriter.clone(*cst, bvm)->getResult(0);
      }
    }
    orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) {
      return outlinedFunc->isProperAncestor(opOperand.getOwner());
    });
  }

  return outlinedFunc;
}
200 
201 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp, FuncOp *thenFn,
202                                 StringRef thenFnName, FuncOp *elseFn,
203                                 StringRef elseFnName) {
204   IRRewriter rewriter(b);
205   Location loc = ifOp.getLoc();
206   FailureOr<FuncOp> outlinedFuncOpOrFailure;
207   if (thenFn && !ifOp.getThenRegion().empty()) {
208     outlinedFuncOpOrFailure = outlineSingleBlockRegion(
209         rewriter, loc, ifOp.getThenRegion(), thenFnName);
210     if (failed(outlinedFuncOpOrFailure))
211       return failure();
212     *thenFn = *outlinedFuncOpOrFailure;
213   }
214   if (elseFn && !ifOp.getElseRegion().empty()) {
215     outlinedFuncOpOrFailure = outlineSingleBlockRegion(
216         rewriter, loc, ifOp.getElseRegion(), elseFnName);
217     if (failed(outlinedFuncOpOrFailure))
218       return failure();
219     *elseFn = *outlinedFuncOpOrFailure;
220   }
221   return success();
222 }
223 
224 bool mlir::getInnermostParallelLoops(Operation *rootOp,
225                                      SmallVectorImpl<scf::ParallelOp> &result) {
226   assert(rootOp != nullptr && "Root operation must not be a nullptr.");
227   bool rootEnclosesPloops = false;
228   for (Region &region : rootOp->getRegions()) {
229     for (Block &block : region.getBlocks()) {
230       for (Operation &op : block) {
231         bool enclosesPloops = getInnermostParallelLoops(&op, result);
232         rootEnclosesPloops |= enclosesPloops;
233         if (auto ploop = dyn_cast<scf::ParallelOp>(op)) {
234           rootEnclosesPloops = true;
235 
236           // Collect parallel loop if it is an innermost one.
237           if (!enclosesPloops)
238             result.push_back(ploop);
239         }
240       }
241     }
242   }
243   return rootEnclosesPloops;
244 }
245 
246 // Build the IR that performs ceil division of a positive value by a constant:
247 //    ceildiv(a, B) = divis(a + (B-1), B)
248 // where divis is rounding-to-zero division.
249 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
250                              int64_t divisor) {
251   assert(divisor > 0 && "expected positive divisor");
252   assert(dividend.getType().isIndex() && "expected index-typed value");
253 
254   Value divisorMinusOneCst =
255       builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
256   Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
257   Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
258   return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
259 }
260 
261 // Build the IR that performs ceil division of a positive value by another
262 // positive value:
263 //    ceildiv(a, b) = divis(a + (b - 1), b)
264 // where divis is rounding-to-zero division.
265 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
266                              Value divisor) {
267   assert(dividend.getType().isIndex() && "expected index-typed value");
268 
269   Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
270   Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
271   Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
272   return builder.create<arith::DivSIOp>(loc, sum, divisor);
273 }
274 
275 /// Helper to replace uses of loop carried values (iter_args) and loop
276 /// yield values while promoting single iteration scf.for ops.
277 static void replaceIterArgsAndYieldResults(scf::ForOp forOp) {
278   // Replace uses of iter arguments with iter operands (initial values).
279   auto iterOperands = forOp.getIterOperands();
280   auto iterArgs = forOp.getRegionIterArgs();
281   for (auto e : llvm::zip(iterOperands, iterArgs))
282     std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
283 
284   // Replace uses of loop results with the values yielded by the loop.
285   auto outerResults = forOp.getResults();
286   auto innerResults = forOp.getBody()->getTerminator()->getOperands();
287   for (auto e : llvm::zip(outerResults, innerResults))
288     std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
289 }
290 
/// Promotes the loop body of a forOp to its containing block if it can be
/// determined that the loop has a single iteration.
LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
  // The trip count is only computable when all of lb/ub/step are non-negative
  // constants.
  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
  if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
      ubCstOp.value() < 0 || stepCstOp.value() < 0)
    return failure();
  int64_t tripCount =
      mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
  if (tripCount != 1)
    return failure();
  // The single iteration executes with the induction variable equal to the
  // lower bound.
  auto iv = forOp.getInductionVar();
  iv.replaceAllUsesWith(lbCstOp);

  replaceIterArgsAndYieldResults(forOp);

  // Move the loop body operations, except for its terminator, to the loop's
  // containing block.
  auto *parentBlock = forOp->getBlock();
  forOp.getBody()->getTerminator()->erase();
  parentBlock->getOperations().splice(Block::iterator(forOp),
                                      forOp.getBody()->getOperations());
  forOp.erase();
  return success();
}
318 
319 /// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with
320 /// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
321 /// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
322 /// unrolled iteration using annotateFn.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body of
  // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  // Default to a no-op annotation callback.
  if (!annotateFn)
    annotateFn = [](unsigned, Operation *, OpBuilder) {};

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
  // `lastYielded` tracks the values yielded by the most recently emitted copy;
  // they feed the iter args of the next copy.
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    BlockAndValueMapping operandMap;

    // Prepare operand map: iter args of this copy come from the previous
    // copy's yielded values.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp' up to and including `srcBlockEnd`.
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update yielded values. (Note: this `i` shadows the unroll counter.)
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
  }

  // Make sure we annotate the Ops in the original body. We do this last so that
  // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
374 
375 /// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
LogicalResult mlir::loopUnrollByFactor(
    scf::ForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
  assert(unrollFactor > 0 && "expected positive unroll factor");

  // Return if the loop body is empty.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
  OpBuilder boundsBuilder(forOp);
  auto loc = forOp.getLoc();
  auto step = forOp.getStep();
  Value upperBoundUnrolled;
  Value stepUnrolled;
  bool generateEpilogueLoop = true;

  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
  if (lbCstOp && ubCstOp && stepCstOp) {
    // Constant loop bounds computation.
    int64_t lbCst = lbCstOp.value();
    int64_t ubCst = ubCstOp.value();
    int64_t stepCst = stepCstOp.value();
    assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
           "expected positive loop bounds and step");
    int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);

    // Unroll factor 1 only requires promotion of single-iteration loops.
    if (unrollFactor == 1) {
      if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
        return failure();
      return success();
    }

    int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
    assert(upperBoundUnrolledCst <= ubCst);
    int64_t stepUnrolledCst = stepCst * unrollFactor;

    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
    // An epilogue is needed only when the trip count is not a multiple of the
    // unroll factor.
    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
    if (generateEpilogueLoop)
      upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
          loc, upperBoundUnrolledCst);
    else
      upperBoundUnrolled = ubCstOp;

    // Create constant for 'stepUnrolled', reusing the original step when the
    // value is unchanged.
    stepUnrolled = stepCst == stepUnrolledCst
                       ? step
                       : boundsBuilder.create<arith::ConstantIndexOp>(
                             loc, stepUnrolledCst);
  } else {
    // Dynamic loop bounds computation.
    // TODO: Add dynamic asserts for negative lb/ub/step, or
    // consider using ceilDiv from AffineApplyExpander.
    auto lowerBound = forOp.getLowerBound();
    auto upperBound = forOp.getUpperBound();
    Value diff =
        boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
    Value unrollFactorCst =
        boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
    Value tripCountRem =
        boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
    Value tripCountEvenMultiple =
        boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
    upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
        loc, lowerBound,
        boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
    // Scale 'step' by 'unrollFactor'.
    stepUnrolled =
        boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
  }

  // Create epilogue clean up loop starting at 'upperBoundUnrolled'.
  if (generateEpilogueLoop) {
    OpBuilder epilogueBuilder(forOp->getContext());
    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
                                      std::next(Block::iterator(forOp)));
    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
    epilogueForOp.setLowerBound(upperBoundUnrolled);

    // Update uses of loop results: external users now consume the epilogue's
    // results, and the epilogue's iter operands are fed by the unrolled
    // loop's results.
    auto results = forOp.getResults();
    auto epilogueResults = epilogueForOp.getResults();
    auto epilogueIterOperands = epilogueForOp.getIterOperands();

    for (auto e : llvm::zip(results, epilogueResults, epilogueIterOperands)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
      epilogueForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
    }
    (void)promoteIfSingleIteration(epilogueForOp);
  }

  // Create unrolled loop.
  forOp.setUpperBound(upperBoundUnrolled);
  forOp.setStep(stepUnrolled);

  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + step * i;
        auto stride = b.create<arith::MulIOp>(
            loc, step, b.create<arith::ConstantIndexOp>(loc, i));
        return b.create<arith::AddIOp>(loc, iv, stride);
      },
      annotateFn, iterArgs, yieldedValues);
  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}
495 
496 /// Return the new lower bound, upper bound, and step in that order. Insert any
497 /// additional bounds calculations before the given builder and any additional
498 /// conversion back to the original loop induction value inside the given Block.
499 static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
500                                 OpBuilder &insideLoopBuilder, Location loc,
501                                 Value lowerBound, Value upperBound, Value step,
502                                 Value inductionVar) {
503   // Check if the loop is already known to have a constant zero lower bound or
504   // a constant one step.
505   bool isZeroBased = false;
506   if (auto ubCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
507     isZeroBased = ubCst.value() == 0;
508 
509   bool isStepOne = false;
510   if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
511     isStepOne = stepCst.value() == 1;
512 
513   // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
514   // assuming the step is strictly positive.  Update the bounds and the step
515   // of the loop to go from 0 to the number of iterations, if necessary.
516   // TODO: introduce support for negative steps or emit dynamic asserts
517   // on step positivity, whatever gets implemented first.
518   if (isZeroBased && isStepOne)
519     return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
520             /*step=*/step};
521 
522   Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
523   Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
524 
525   Value newLowerBound =
526       isZeroBased ? lowerBound
527                   : boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
528   Value newStep =
529       isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
530 
531   // Insert code computing the value of the original loop induction variable
532   // from the "normalized" one.
533   Value scaled =
534       isStepOne
535           ? inductionVar
536           : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
537   Value shifted =
538       isZeroBased
539           ? scaled
540           : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
541 
542   SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
543                                        shifted.getDefiningOp()};
544   inductionVar.replaceAllUsesExcept(shifted, preserve);
545   return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
546           /*step=*/newStep};
547 }
548 
549 /// Transform a loop with a strictly positive step
550 ///   for %i = %lb to %ub step %s
551 /// into a 0-based loop with step 1
552 ///   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
553 ///     %i = %ii * %s + %lb
554 /// Insert the induction variable remapping in the body of `inner`, which is
555 /// expected to be either `loop` or another loop perfectly nested under `loop`.
/// Insert the definition of new bounds immediately before `outer`, which is
557 /// expected to be either `loop` or its parent in the loop nest.
558 static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
559   OpBuilder builder(outer);
560   OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
561   auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
562                                   loop.getLowerBound(), loop.getUpperBound(),
563                                   loop.getStep(), loop.getInductionVar());
564 
565   loop.setLowerBound(loopPieces.lowerBound);
566   loop.setUpperBound(loopPieces.upperBound);
567   loop.setStep(loopPieces.step);
568 }
569 
/// Coalesce a perfect nest of `loops` (outermost first) into the single
/// outermost loop, which afterwards iterates over the product of the loops'
/// normalized iteration counts.
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
  if (loops.size() < 2)
    return;

  scf::ForOp innermost = loops.back();
  scf::ForOp outermost = loops.front();

  // 1. Make sure all loops iterate from 0 to upperBound with step 1.  This
  // allows the following code to assume upperBound is the number of iterations.
  for (auto loop : loops)
    normalizeLoop(loop, outermost, innermost);

  // 2. Emit code computing the upper bound of the coalesced loop as product
  // of the number of iterations of all loops.
  OpBuilder builder(outermost);
  Location loc = outermost.getLoc();
  Value upperBound = outermost.getUpperBound();
  for (auto loop : loops.drop_front())
    upperBound =
        builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
  outermost.setUpperBound(upperBound);

  builder.setInsertionPointToStart(outermost.getBody());

  // 3. Remap induction variables. For each original loop, the value of the
  // induction variable can be obtained by dividing the induction variable of
  // the linearized loop by the total number of iterations of the loops nested
  // in it modulo the number of iterations in this loop (remove the values
  // related to the outer loops):
  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  // Compute these iteratively from the innermost loop by creating a "running
  // quotient" of division by the range.
  Value previous = outermost.getInductionVar();
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    unsigned idx = loops.size() - i - 1;
    if (i != 0)
      previous = builder.create<arith::DivSIOp>(loc, previous,
                                                loops[idx + 1].getUpperBound());

    // The outermost loop (last in this iteration order) takes the remaining
    // quotient directly; all others reduce it modulo their range.
    Value iv = (i == e - 1) ? previous
                            : builder.create<arith::RemSIOp>(
                                  loc, previous, loops[idx].getUpperBound());
    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
                               loops.back().getRegion());
  }

  // 4. Move the operations from the innermost just above the second-outermost
  // loop, delete the extra terminator and the second-outermost loop.
  scf::ForOp second = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(second.getOperation()),
      innermost.getBody()->getOperations());
  second.erase();
}
625 
/// Collapse the dimensions of `loops` into a new scf.parallel op with one
/// dimension per group in `combinedDimensions`; the original op is replaced
/// and erased.
void mlir::collapseParallelLoops(
    scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
  OpBuilder outsideBuilder(loops);
  Location loc = loops.getLoc();

  // Presort combined dimensions.
  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
  for (auto &dims : sortedDimensions)
    std::sort(dims.begin(), dims.end());

  // Normalize ParallelOp's iteration pattern so every dimension runs from 0
  // to its iteration count with step 1.
  SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
      normalizedUpperBounds;
  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
    OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
    auto resultBounds =
        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
                      loops.getLowerBound()[i], loops.getUpperBound()[i],
                      loops.getStep()[i], loops.getBody()->getArgument(i));

    normalizedLowerBounds.push_back(resultBounds.lowerBound);
    normalizedUpperBounds.push_back(resultBounds.upperBound);
    normalizedSteps.push_back(resultBounds.step);
  }

  // Combine iteration spaces: each combined dimension spans the product of
  // the normalized upper bounds of its members.
  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
  auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
  auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
  for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
    Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
    for (auto idx : sortedDimensions[i]) {
      newUpperBound = outsideBuilder.create<arith::MulIOp>(
          loc, newUpperBound, normalizedUpperBounds[idx]);
    }
    lowerBounds.push_back(cst0);
    steps.push_back(cst1);
    upperBounds.push_back(newUpperBound);
  }

  // Create new ParallelLoop with conversions to the original induction values.
  // The loop below uses divisions to get the relevant range of values in the
  // new induction value that represent each range of the original induction
  // value. The remainders then determine based on that range, which iteration
  // of the original induction value this represents. This is a normalized value
  // that is un-normalized already by the previous logic.
  auto newPloop = outsideBuilder.create<scf::ParallelOp>(
      loc, lowerBounds, upperBounds, steps,
      [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
        for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
          Value previous = ploopIVs[i];
          unsigned numberCombinedDimensions = combinedDimensions[i].size();
          // Iterate over all except the last induction value.
          for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
            unsigned idx = combinedDimensions[i][j];

            // Determine the current induction value's current loop iteration
            Value iv = insideBuilder.create<arith::RemSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
            replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
                                       loops.getRegion());

            // Remove the effect of the current induction value to prepare for
            // the next value.
            previous = insideBuilder.create<arith::DivSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
          }

          // The final induction value is just the remaining value.
          unsigned idx = combinedDimensions[i][0];
          replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
                                     previous, loops.getRegion());
        }
      });

  // Replace the old loop with the new loop: move the old body (minus its
  // terminator) into the new body, then erase the old op.
  loops.getBody()->back().erase();
  newPloop.getBody()->getOperations().splice(
      Block::iterator(newPloop.getBody()->back()),
      loops.getBody()->getOperations());
  loops.erase();
}
708 
// Hoist the ops within `outer` that appear before `inner`.
// Such ops include the ops that have been introduced by parametric tiling.
// Ops that come from triangular loops (i.e. that belong to the program slice
// rooted at `outer`) and ops that have side effects cannot be hoisted.
// Return failure when any op fails to hoist.
static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
  SetVector<Operation *> forwardSlice;
  // Compute the transitive forward slice of `outer`'s induction variable,
  // stopping the traversal at `inner`. Any op in this slice depends on the
  // outer induction variable and therefore cannot be moved above `outer`.
  getForwardSlice(
      outer.getInductionVar(), &forwardSlice,
      [&inner](Operation *op) { return op != inner.getOperation(); });
  LogicalResult status = success();
  SmallVector<Operation *, 8> toHoist;
  // Walk the ops of `outer`'s body (terminator excluded) that precede `inner`,
  // collecting the hoistable ones. Non-hoistable ops mark the overall status
  // as failure but do not stop the scan: we still hoist whatever we can.
  for (auto &op : outer.getBody()->without_terminator()) {
    // Stop when encountering the inner loop.
    if (&op == inner.getOperation())
      break;
    // Skip over non-hoistable ops: they depend on `outer`'s induction var.
    if (forwardSlice.count(&op) > 0) {
      status = failure();
      continue;
    }
    // Skip intermediate scf::ForOp, these are not considered a failure.
    if (isa<scf::ForOp>(op))
      continue;
    // Skip other ops with regions: hoisting them would require inspecting
    // their bodies, which is not attempted here.
    if (op.getNumRegions() > 0) {
      status = failure();
      continue;
    }
    // Skip if op has side effects.
    // TODO: loads to immutable memory regions are ok.
    if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
      status = failure();
      continue;
    }
    toHoist.push_back(&op);
  }
  auto *outerForOp = outer.getOperation();
  // Move in original source order so dependencies among the hoisted ops
  // themselves are preserved.
  for (auto *op : toHoist)
    op->moveBefore(outerForOp);
  return status;
}
751 
752 // Traverse the interTile and intraTile loops and try to hoist ops such that
753 // bands of perfectly nested loops are isolated.
754 // Return failure if either perfect interTile or perfect intraTile bands cannot
755 // be formed.
756 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
757   LogicalResult status = success();
758   const Loops &interTile = tileLoops.first;
759   const Loops &intraTile = tileLoops.second;
760   auto size = interTile.size();
761   assert(size == intraTile.size());
762   if (size <= 1)
763     return success();
764   for (unsigned s = 1; s < size; ++s)
765     status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
766                                : failure();
767   for (unsigned s = 1; s < size; ++s)
768     status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
769                                : failure();
770   return status;
771 }
772 
/// Collect perfectly nested loops starting from `rootForOps`.  Loops are
/// perfectly nested if each loop is the first and only non-terminator operation
/// in the parent loop.  Collect at most `maxLoops` loops and append them to
/// `forOps`.
template <typename T>
static void getPerfectlyNestedLoopsImpl(
    SmallVectorImpl<T> &forOps, T rootForOp,
    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
  for (unsigned i = 0; i < maxLoops; ++i) {
    forOps.push_back(rootForOp);
    Block &body = rootForOp.getRegion().front();
    // A perfectly nested body holds exactly two ops: the nested loop and the
    // block terminator, in which case begin() == prev(end(), 2). Any other
    // shape ends the nest here.
    if (body.begin() != std::prev(body.end(), 2))
      return;

    // Stop as well if the single non-terminator op is not a loop of type `T`.
    rootForOp = dyn_cast<T>(&body.front());
    if (!rootForOp)
      return;
  }
}
792 
// Stripmines `forOp` by `factor`: the step of `forOp` is multiplied by
// `factor`, and inside each loop of `targets` a new inner loop is created
// that covers one strip (original step, upper bound clamped to `forOp`'s
// upper bound). The existing body ops of each target are spliced into its
// new inner loop. Returns the newly created inner loops, one per target.
static Loops stripmineSink(scf::ForOp forOp, Value factor,
                           ArrayRef<scf::ForOp> targets) {
  auto originalStep = forOp.getStep();
  auto iv = forOp.getInductionVar();

  // Scale the outer step so each iteration of `forOp` spans a whole strip.
  OpBuilder b(forOp);
  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));

  Loops innerLoops;
  for (auto t : targets) {
    // Save information for splicing ops out of t when done
    auto begin = t.getBody()->begin();
    auto nOps = t.getBody()->getOperations().size();

    // Insert newForOp before the terminator of `t`.
    auto b = OpBuilder::atBlockTerminator((t.getBody()));
    // Inner upper bound is min(forOp.upperBound, iv + scaled step), computed
    // with signed compare + select, so the last partial strip is handled.
    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
    Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
                                         forOp.getUpperBound(), stepped);
    Value ub = b.create<arith::SelectOp>(t.getLoc(), less,
                                         forOp.getUpperBound(), stepped);

    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
    // The last op (the terminator of `t`) stays in place.
    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
    // Uses of the outer IV inside the new loop now refer to the inner IV.
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());

    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}
828 
829 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
830 // Returns the new for operation, nested immediately under `target`.
831 template <typename SizeType>
832 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
833                                 scf::ForOp target) {
834   // TODO: Use cheap structural assertions that targets are nested under
835   // forOp and that targets are not nested under each other when DominanceInfo
836   // exposes the capability. It seems overkill to construct a whole function
837   // dominance tree at this point.
838   auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
839   assert(res.size() == 1 && "Expected 1 inner forOp");
840   return res[0];
841 }
842 
843 SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
844                                  ArrayRef<Value> sizes,
845                                  ArrayRef<scf::ForOp> targets) {
846   SmallVector<SmallVector<scf::ForOp, 8>, 8> res;
847   SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
848   for (auto it : llvm::zip(forOps, sizes)) {
849     auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
850     res.push_back(step);
851     currentTargets = step;
852   }
853   return res;
854 }
855 
856 Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
857                  scf::ForOp target) {
858   SmallVector<scf::ForOp, 8> res;
859   for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
860     assert(loops.size() == 1);
861     res.push_back(loops[0]);
862   }
863   return res;
864 }
865 
866 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
867   // Collect perfectly nested loops.  If more size values provided than nested
868   // loops available, truncate `sizes`.
869   SmallVector<scf::ForOp, 4> forOps;
870   forOps.reserve(sizes.size());
871   getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
872   if (forOps.size() < sizes.size())
873     sizes = sizes.take_front(forOps.size());
874 
875   return ::tile(forOps, sizes, forOps.back());
876 }
877 
/// Collects the perfectly nested band rooted at `root` (including `root`
/// itself) into `nestedLoops`, outermost loop first, with no depth limit.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
                                   scf::ForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
882 
// Tiles the perfectly nested band rooted at `rootForOp` so that the i-th
// outer (inter-tile) loop executes `sizes[i]` iterations, computing the
// per-loop tile size from the loop's range at runtime. Returns the pair of
// (outer inter-tile band, inner intra-tile band); band isolation is attempted
// but its result is currently ignored.
TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
                                       ArrayRef<int64_t> sizes) {
  // Collect perfectly nested loops.  If more size values provided than nested
  // loops available, truncate `sizes`.
  SmallVector<scf::ForOp, 4> forOps;
  forOps.reserve(sizes.size());
  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
  if (forOps.size() < sizes.size())
    sizes = sizes.take_front(forOps.size());

  // Compute the tile sizes such that i-th outer loop executes size[i]
  // iterations.  Given that the loop current executes
  //   numIterations = ceildiv((upperBound - lowerBound), step)
  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
  SmallVector<Value, 4> tileSizes;
  tileSizes.reserve(sizes.size());
  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");

    auto forOp = forOps[i];
    // Emit the tile-size computation right before the loop it applies to.
    OpBuilder builder(forOp);
    auto loc = forOp.getLoc();
    Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
                                               forOp.getLowerBound());
    Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
    Value iterationsPerBlock =
        ceilDivPositive(builder, loc, numIterations, sizes[i]);
    tileSizes.push_back(iterationsPerBlock);
  }

  // Call parametric tiling with the given sizes.
  auto intraTile = tile(forOps, tileSizes, forOps.back());
  // `forOps` are now the inter-tile loops; `intraTile` the intra-tile loops.
  TileLoops tileLoops = std::make_pair(forOps, intraTile);

  // TODO: for now we just ignore the result of band isolation.
  // In the future, mapping decisions may be impacted by the ability to
  // isolate perfectly nested bands.
  (void)tryIsolateBands(tileLoops);

  return tileLoops;
}
924