1 //===- Utils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "mlir/Dialect/SCF/Utils/Utils.h"
14 #include "mlir/Analysis/SliceAnalysis.h"
15 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
16 #include "mlir/Dialect/Func/IR/FuncOps.h"
17 #include "mlir/Dialect/SCF/IR/SCF.h"
18 #include "mlir/IR/BlockAndValueMapping.h"
19 #include "mlir/IR/BuiltinOps.h"
20 #include "mlir/IR/PatternMatch.h"
21 #include "mlir/Support/MathExtras.h"
22 #include "mlir/Transforms/RegionUtils.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SetVector.h"
25 #include "llvm/ADT/SmallPtrSet.h"
26 #include "llvm/ADT/SmallVector.h"
27 
28 using namespace mlir;
29 
namespace {
// This structure is to pass and return sets of loop parameters without
// confusing the order.
struct LoopParams {
  Value lowerBound; // Loop lower bound.
  Value upperBound; // Loop upper bound.
  Value step;       // Loop step (users below assume it is strictly positive).
};
} // namespace
39 
/// Replace `loop` with a new scf.for that additionally carries
/// `newIterOperands` as iter_args, computing the corresponding extra yielded
/// values via `newYieldValuesFn`. Returns the new loop. The original loop is
/// turned into a no-op; the caller is expected to erase it.
scf::ForOp
mlir::replaceLoopWithNewYields(OpBuilder &builder, scf::ForOp loop,
                               ValueRange newIterOperands,
                               const NewYieldValueFn &newYieldValuesFn) {
  // Create a new loop before the existing one, with the extra operands.
  OpBuilder::InsertionGuard g(builder);
  builder.setInsertionPoint(loop);
  auto operands = llvm::to_vector(loop.getIterOperands());
  operands.append(newIterOperands.begin(), newIterOperands.end());
  // Pass an empty body-builder callback: the original body is spliced into the
  // new loop below, so no terminator must be auto-generated here.
  scf::ForOp newLoop = builder.create<scf::ForOp>(
      loop.getLoc(), loop.getLowerBound(), loop.getUpperBound(), loop.getStep(),
      operands, [](OpBuilder &, Location, Value, ValueRange) {});

  Block *loopBody = loop.getBody();
  Block *newLoopBody = newLoop.getBody();

  // Move the body of the original loop to the new loop.
  newLoopBody->getOperations().splice(newLoopBody->end(),
                                      loopBody->getOperations());

  // Generate the new yield values to use by using the callback and append the
  // yield values to the scf.yield operation.
  auto yield = cast<scf::YieldOp>(newLoopBody->getTerminator());
  // The extra iter_args are the trailing block arguments of the new body.
  ArrayRef<BlockArgument> newBBArgs =
      newLoopBody->getArguments().take_back(newIterOperands.size());
  {
    OpBuilder::InsertionGuard g(builder);
    builder.setInsertionPoint(yield);
    SmallVector<Value> newYieldedValues =
        newYieldValuesFn(builder, loop.getLoc(), newBBArgs);
    assert(newIterOperands.size() == newYieldedValues.size() &&
           "expected as many new yield values as new iter operands");
    yield.getResultsMutable().append(newYieldedValues);
  }

  // Remap the BlockArguments from the original loop to the new loop
  // BlockArguments.
  ArrayRef<BlockArgument> bbArgs = loopBody->getArguments();
  for (auto it :
       llvm::zip(bbArgs, newLoopBody->getArguments().take_front(bbArgs.size())))
    std::get<0>(it).replaceAllUsesWith(std::get<1>(it));

  // Replace all uses of `newIterOperands` with the corresponding basic block
  // arguments — but only for uses nested inside the new loop: uses outside it
  // (including the new loop's own iter operands) must keep the original value.
  for (auto it : llvm::zip(newIterOperands, newBBArgs)) {
    std::get<0>(it).replaceUsesWithIf(std::get<1>(it), [&](OpOperand &use) {
      Operation *user = use.getOwner();
      return newLoop->isProperAncestor(user);
    });
  }

  // Replace all uses of the original loop with corresponding values from the
  // new loop.
  loop.replaceAllUsesWith(
      newLoop.getResults().take_front(loop.getNumResults()));

  // Add a fake yield to the original loop body that just returns the
  // BlockArguments corresponding to the iter_args. This makes it a no-op loop.
  // The loop is dead. The caller is expected to erase it.
  builder.setInsertionPointToEnd(loopBody);
  builder.create<scf::YieldOp>(loop->getLoc(), loop.getRegionIterArgs());

  return newLoop;
}
104 
replaceLoopNestWithNewYields(OpBuilder & builder,ArrayRef<scf::ForOp> loopNest,ValueRange newIterOperands,NewYieldValueFn newYieldValueFn)105 SmallVector<scf::ForOp> mlir::replaceLoopNestWithNewYields(
106     OpBuilder &builder, ArrayRef<scf::ForOp> loopNest,
107     ValueRange newIterOperands, NewYieldValueFn newYieldValueFn) {
108   if (loopNest.empty())
109     return {};
110   SmallVector<scf::ForOp> newLoopNest(loopNest.size());
111 
112   newLoopNest.back() = replaceLoopWithNewYields(
113       builder, loopNest.back(), newIterOperands, newYieldValueFn);
114 
115   for (unsigned loopDepth :
116        llvm::reverse(llvm::seq<unsigned>(0, loopNest.size() - 1))) {
117     NewYieldValueFn fn = [&](OpBuilder &innerBuilder, Location loc,
118                              ArrayRef<BlockArgument> innerNewBBArgs) {
119       SmallVector<Value> newYields(
120           newLoopNest[loopDepth + 1]->getResults().take_back(
121               newIterOperands.size()));
122       return newYields;
123     };
124     newLoopNest[loopDepth] = replaceLoopWithNewYields(
125         builder, loopNest[loopDepth], newIterOperands, fn);
126   }
127   return newLoopNest;
128 }
129 
130 /// Outline a region with a single block into a new FuncOp.
131 /// Assumes the FuncOp result types is the type of the yielded operands of the
132 /// single block. This constraint makes it easy to determine the result.
133 /// This method also clones the `arith::ConstantIndexOp` at the start of
/// `outlinedFuncBody` to allow simple canonicalizations. If `callOp` is
135 /// provided, it will be set to point to the operation that calls the outlined
136 /// function.
137 // TODO: support more than single-block regions.
138 // TODO: more flexible constant handling.
FailureOr<func::FuncOp> mlir::outlineSingleBlockRegion(RewriterBase &rewriter,
                                                       Location loc,
                                                       Region &region,
                                                       StringRef funcName,
                                                       func::CallOp *callOp) {
  assert(!funcName.empty() && "funcName cannot be empty");
  // Only single-block regions are supported (see TODO in the header comment).
  if (!region.hasOneBlock())
    return failure();

  Block *originalBlock = &region.front();
  Operation *originalTerminator = originalBlock->getTerminator();

  // Outline before current function.
  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPoint(region.getParentOfType<func::FuncOp>());

  // Values defined above the region become trailing arguments of the outlined
  // function.
  SetVector<Value> captures;
  getUsedValuesDefinedAbove(region, captures);

  ValueRange outlinedValues(captures.getArrayRef());
  SmallVector<Type> outlinedFuncArgTypes;
  SmallVector<Location> outlinedFuncArgLocs;
  // Region's arguments are exactly the first block's arguments as per
  // Region::getArguments().
  // Func's arguments are cat(regions's arguments, captures arguments).
  for (BlockArgument arg : region.getArguments()) {
    outlinedFuncArgTypes.push_back(arg.getType());
    outlinedFuncArgLocs.push_back(arg.getLoc());
  }
  for (Value value : outlinedValues) {
    outlinedFuncArgTypes.push_back(value.getType());
    outlinedFuncArgLocs.push_back(value.getLoc());
  }
  // The outlined function returns whatever the region's terminator yields.
  FunctionType outlinedFuncType =
      FunctionType::get(rewriter.getContext(), outlinedFuncArgTypes,
                        originalTerminator->getOperandTypes());
  auto outlinedFunc =
      rewriter.create<func::FuncOp>(loc, funcName, outlinedFuncType);
  Block *outlinedFuncBody = outlinedFunc.addEntryBlock();

  // Merge blocks while replacing the original block operands.
  // Warning: `mergeBlocks` erases the original block, reconstruct it later.
  int64_t numOriginalBlockArguments = originalBlock->getNumArguments();
  auto outlinedFuncBlockArgs = outlinedFuncBody->getArguments();
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.mergeBlocks(
        originalBlock, outlinedFuncBody,
        outlinedFuncBlockArgs.take_front(numOriginalBlockArguments));
    // Explicitly set up a new ReturnOp terminator.
    rewriter.setInsertionPointToEnd(outlinedFuncBody);
    rewriter.create<func::ReturnOp>(loc, originalTerminator->getResultTypes(),
                                    originalTerminator->getOperands());
  }

  // Reconstruct the block that was deleted and add a
  // terminator(call_results). The new block keeps only the original region
  // arguments (captures are forwarded as extra call operands instead).
  Block *newBlock = rewriter.createBlock(
      &region, region.begin(),
      TypeRange{outlinedFuncArgTypes}.take_front(numOriginalBlockArguments),
      ArrayRef<Location>(outlinedFuncArgLocs)
          .take_front(numOriginalBlockArguments));
  {
    OpBuilder::InsertionGuard g(rewriter);
    rewriter.setInsertionPointToEnd(newBlock);
    SmallVector<Value> callValues;
    llvm::append_range(callValues, newBlock->getArguments());
    llvm::append_range(callValues, outlinedValues);
    auto call = rewriter.create<func::CallOp>(loc, outlinedFunc, callValues);
    if (callOp)
      *callOp = call;

    // `originalTerminator` was moved to `outlinedFuncBody` and is still valid.
    // Clone `originalTerminator` to take the callOp results then erase it from
    // `outlinedFuncBody`.
    BlockAndValueMapping bvm;
    bvm.map(originalTerminator->getOperands(), call->getResults());
    rewriter.clone(*originalTerminator, bvm);
    rewriter.eraseOp(originalTerminator);
  }

  // Lastly, explicit RAUW outlinedValues, only for uses within `outlinedFunc`.
  // Clone the `arith::ConstantIndexOp` at the start of `outlinedFuncBody`.
  for (auto it : llvm::zip(outlinedValues, outlinedFuncBlockArgs.take_back(
                                               outlinedValues.size()))) {
    Value orig = std::get<0>(it);
    Value repl = std::get<1>(it);
    {
      OpBuilder::InsertionGuard g(rewriter);
      rewriter.setInsertionPointToStart(outlinedFuncBody);
      // Index constants are re-materialized inside the outlined function
      // (instead of being used through a block argument) so that simple
      // canonicalizations can still fold them.
      if (Operation *cst = orig.getDefiningOp<arith::ConstantIndexOp>()) {
        BlockAndValueMapping bvm;
        repl = rewriter.clone(*cst, bvm)->getResult(0);
      }
    }
    orig.replaceUsesWithIf(repl, [&](OpOperand &opOperand) {
      return outlinedFunc->isProperAncestor(opOperand.getOwner());
    });
  }

  return outlinedFunc;
}
242 
outlineIfOp(RewriterBase & b,scf::IfOp ifOp,func::FuncOp * thenFn,StringRef thenFnName,func::FuncOp * elseFn,StringRef elseFnName)243 LogicalResult mlir::outlineIfOp(RewriterBase &b, scf::IfOp ifOp,
244                                 func::FuncOp *thenFn, StringRef thenFnName,
245                                 func::FuncOp *elseFn, StringRef elseFnName) {
246   IRRewriter rewriter(b);
247   Location loc = ifOp.getLoc();
248   FailureOr<func::FuncOp> outlinedFuncOpOrFailure;
249   if (thenFn && !ifOp.getThenRegion().empty()) {
250     outlinedFuncOpOrFailure = outlineSingleBlockRegion(
251         rewriter, loc, ifOp.getThenRegion(), thenFnName);
252     if (failed(outlinedFuncOpOrFailure))
253       return failure();
254     *thenFn = *outlinedFuncOpOrFailure;
255   }
256   if (elseFn && !ifOp.getElseRegion().empty()) {
257     outlinedFuncOpOrFailure = outlineSingleBlockRegion(
258         rewriter, loc, ifOp.getElseRegion(), elseFnName);
259     if (failed(outlinedFuncOpOrFailure))
260       return failure();
261     *elseFn = *outlinedFuncOpOrFailure;
262   }
263   return success();
264 }
265 
getInnermostParallelLoops(Operation * rootOp,SmallVectorImpl<scf::ParallelOp> & result)266 bool mlir::getInnermostParallelLoops(Operation *rootOp,
267                                      SmallVectorImpl<scf::ParallelOp> &result) {
268   assert(rootOp != nullptr && "Root operation must not be a nullptr.");
269   bool rootEnclosesPloops = false;
270   for (Region &region : rootOp->getRegions()) {
271     for (Block &block : region.getBlocks()) {
272       for (Operation &op : block) {
273         bool enclosesPloops = getInnermostParallelLoops(&op, result);
274         rootEnclosesPloops |= enclosesPloops;
275         if (auto ploop = dyn_cast<scf::ParallelOp>(op)) {
276           rootEnclosesPloops = true;
277 
278           // Collect parallel loop if it is an innermost one.
279           if (!enclosesPloops)
280             result.push_back(ploop);
281         }
282       }
283     }
284   }
285   return rootEnclosesPloops;
286 }
287 
288 // Build the IR that performs ceil division of a positive value by a constant:
289 //    ceildiv(a, B) = divis(a + (B-1), B)
290 // where divis is rounding-to-zero division.
ceilDivPositive(OpBuilder & builder,Location loc,Value dividend,int64_t divisor)291 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
292                              int64_t divisor) {
293   assert(divisor > 0 && "expected positive divisor");
294   assert(dividend.getType().isIndex() && "expected index-typed value");
295 
296   Value divisorMinusOneCst =
297       builder.create<arith::ConstantIndexOp>(loc, divisor - 1);
298   Value divisorCst = builder.create<arith::ConstantIndexOp>(loc, divisor);
299   Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOneCst);
300   return builder.create<arith::DivSIOp>(loc, sum, divisorCst);
301 }
302 
303 // Build the IR that performs ceil division of a positive value by another
304 // positive value:
305 //    ceildiv(a, b) = divis(a + (b - 1), b)
306 // where divis is rounding-to-zero division.
ceilDivPositive(OpBuilder & builder,Location loc,Value dividend,Value divisor)307 static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
308                              Value divisor) {
309   assert(dividend.getType().isIndex() && "expected index-typed value");
310 
311   Value cstOne = builder.create<arith::ConstantIndexOp>(loc, 1);
312   Value divisorMinusOne = builder.create<arith::SubIOp>(loc, divisor, cstOne);
313   Value sum = builder.create<arith::AddIOp>(loc, dividend, divisorMinusOne);
314   return builder.create<arith::DivSIOp>(loc, sum, divisor);
315 }
316 
317 /// Helper to replace uses of loop carried values (iter_args) and loop
318 /// yield values while promoting single iteration scf.for ops.
replaceIterArgsAndYieldResults(scf::ForOp forOp)319 static void replaceIterArgsAndYieldResults(scf::ForOp forOp) {
320   // Replace uses of iter arguments with iter operands (initial values).
321   auto iterOperands = forOp.getIterOperands();
322   auto iterArgs = forOp.getRegionIterArgs();
323   for (auto e : llvm::zip(iterOperands, iterArgs))
324     std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
325 
326   // Replace uses of loop results with the values yielded by the loop.
327   auto outerResults = forOp.getResults();
328   auto innerResults = forOp.getBody()->getTerminator()->getOperands();
329   for (auto e : llvm::zip(outerResults, innerResults))
330     std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
331 }
332 
/// Promotes the loop body of a forOp to its containing block if it can be
/// determined that the loop has a single iteration.
promoteIfSingleIteration(scf::ForOp forOp)335 LogicalResult mlir::promoteIfSingleIteration(scf::ForOp forOp) {
336   auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
337   auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
338   auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
339   if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.value() < 0 ||
340       ubCstOp.value() < 0 || stepCstOp.value() < 0)
341     return failure();
342   int64_t tripCount =
343       mlir::ceilDiv(ubCstOp.value() - lbCstOp.value(), stepCstOp.value());
344   if (tripCount != 1)
345     return failure();
346   auto iv = forOp.getInductionVar();
347   iv.replaceAllUsesWith(lbCstOp);
348 
349   replaceIterArgsAndYieldResults(forOp);
350 
351   // Move the loop body operations, except for its terminator, to the loop's
352   // containing block.
353   auto *parentBlock = forOp->getBlock();
354   forOp.getBody()->getTerminator()->erase();
355   parentBlock->getOperations().splice(Block::iterator(forOp),
356                                       forOp.getBody()->getOperations());
357   forOp.erase();
358   return success();
359 }
360 
361 /// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with
362 /// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
363 /// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
364 /// unrolled iteration using annotateFn.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) {
  // Builder to insert unrolled bodies just before the terminator of the body of
  // 'forOp'.
  auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);

  // Default to a no-op annotation callback so it can be invoked
  // unconditionally below.
  if (!annotateFn)
    annotateFn = [](unsigned, Operation *, OpBuilder) {};

  // Keep a pointer to the last non-terminator operation in the original block
  // so that we know what to clone (since we are doing this in-place).
  Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);

  // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
  SmallVector<Value, 4> lastYielded(yieldedValues);

  for (unsigned i = 1; i < unrollFactor; i++) {
    BlockAndValueMapping operandMap;

    // Prepare operand map: the iter_args of unrolled copy `i` are the values
    // yielded by copy `i - 1`.
    operandMap.map(iterArgs, lastYielded);

    // If the induction variable is used, create a remapping to the value for
    // this unrolled instance.
    if (!forOpIV.use_empty()) {
      Value ivUnroll = ivRemapFn(i, forOpIV, builder);
      operandMap.map(forOpIV, ivUnroll);
    }

    // Clone the original body of 'forOp' (up to and including `srcBlockEnd`,
    // which was captured before any clones were appended).
    for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
      Operation *clonedOp = builder.clone(*it, operandMap);
      annotateFn(i, clonedOp, builder);
    }

    // Update yielded values. (Note: this `i` intentionally shadows the outer
    // unroll counter; the two loops are independent.)
    for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
      lastYielded[i] = operandMap.lookup(yieldedValues[i]);
  }

  // Make sure we annotate the Ops in the original body. We do this last so that
  // any annotations are not copied into the cloned Ops above.
  for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
    annotateFn(0, &*it, builder);

  // Update operands of the yield statement.
  loopBodyBlock->getTerminator()->setOperands(lastYielded);
}
416 
417 /// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled.
LogicalResult mlir::loopUnrollByFactor(
    scf::ForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
  assert(unrollFactor > 0 && "expected positive unroll factor");

  // Return if the loop body is empty.
  if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
    return success();

  // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate
  // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases.
  OpBuilder boundsBuilder(forOp);
  auto loc = forOp.getLoc();
  auto step = forOp.getStep();
  Value upperBoundUnrolled;
  Value stepUnrolled;
  bool generateEpilogueLoop = true;

  auto lbCstOp = forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
  auto ubCstOp = forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
  auto stepCstOp = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
  if (lbCstOp && ubCstOp && stepCstOp) {
    // Constant loop bounds computation.
    int64_t lbCst = lbCstOp.value();
    int64_t ubCst = ubCstOp.value();
    int64_t stepCst = stepCstOp.value();
    // NOTE(review): the condition checks non-negativity while the message says
    // "positive"; a zero step would divide by zero in the ceilDiv below —
    // confirm intended precondition.
    assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 &&
           "expected positive loop bounds and step");
    int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst);

    // With an unroll factor of 1 nothing is unrolled, but a provably
    // single-iteration loop is still promoted into its parent block.
    if (unrollFactor == 1) {
      if (tripCount == 1 && failed(promoteIfSingleIteration(forOp)))
        return failure();
      return success();
    }

    int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor);
    int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst;
    assert(upperBoundUnrolledCst <= ubCst);
    int64_t stepUnrolledCst = stepCst * unrollFactor;

    // Create constant for 'upperBoundUnrolled' and set epilogue loop flag.
    // An epilogue is only needed when the trip count is not a multiple of the
    // unroll factor.
    generateEpilogueLoop = upperBoundUnrolledCst < ubCst;
    if (generateEpilogueLoop)
      upperBoundUnrolled = boundsBuilder.create<arith::ConstantIndexOp>(
          loc, upperBoundUnrolledCst);
    else
      upperBoundUnrolled = ubCstOp;

    // Create constant for 'stepUnrolled', reusing the original step value if
    // it is unchanged.
    stepUnrolled = stepCst == stepUnrolledCst
                       ? step
                       : boundsBuilder.create<arith::ConstantIndexOp>(
                             loc, stepUnrolledCst);
  } else {
    // Dynamic loop bounds computation.
    // TODO: Add dynamic asserts for negative lb/ub/step, or
    // consider using ceilDiv from AffineApplyExpander.
    auto lowerBound = forOp.getLowerBound();
    auto upperBound = forOp.getUpperBound();
    Value diff =
        boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
    Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step);
    Value unrollFactorCst =
        boundsBuilder.create<arith::ConstantIndexOp>(loc, unrollFactor);
    Value tripCountRem =
        boundsBuilder.create<arith::RemSIOp>(loc, tripCount, unrollFactorCst);
    // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor)
    Value tripCountEvenMultiple =
        boundsBuilder.create<arith::SubIOp>(loc, tripCount, tripCountRem);
    // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step
    upperBoundUnrolled = boundsBuilder.create<arith::AddIOp>(
        loc, lowerBound,
        boundsBuilder.create<arith::MulIOp>(loc, tripCountEvenMultiple, step));
    // Scale 'step' by 'unrollFactor'.
    stepUnrolled =
        boundsBuilder.create<arith::MulIOp>(loc, step, unrollFactorCst);
  }

  // Create epilogue clean up loop starting at 'upperBoundUnrolled'. It is a
  // clone of the original loop with only the lower bound changed.
  if (generateEpilogueLoop) {
    OpBuilder epilogueBuilder(forOp->getContext());
    epilogueBuilder.setInsertionPoint(forOp->getBlock(),
                                      std::next(Block::iterator(forOp)));
    auto epilogueForOp = cast<scf::ForOp>(epilogueBuilder.clone(*forOp));
    epilogueForOp.setLowerBound(upperBoundUnrolled);

    // Update uses of loop results: the epilogue now produces the final values.
    auto results = forOp.getResults();
    auto epilogueResults = epilogueForOp.getResults();

    for (auto e : llvm::zip(results, epilogueResults)) {
      std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
    }
    // The epilogue's iter operands are the results of the unrolled loop.
    epilogueForOp->setOperands(epilogueForOp.getNumControlOperands(),
                               epilogueForOp.getNumIterOperands(), results);
    (void)promoteIfSingleIteration(epilogueForOp);
  }

  // Create unrolled loop.
  forOp.setUpperBound(upperBoundUnrolled);
  forOp.setStep(stepUnrolled);

  auto iterArgs = ValueRange(forOp.getRegionIterArgs());
  auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();

  generateUnrolledLoop(
      forOp.getBody(), forOp.getInductionVar(), unrollFactor,
      [&](unsigned i, Value iv, OpBuilder b) {
        // iv' = iv + step * i;
        auto stride = b.create<arith::MulIOp>(
            loc, step, b.create<arith::ConstantIndexOp>(loc, i));
        return b.create<arith::AddIOp>(loc, iv, stride);
      },
      annotateFn, iterArgs, yieldedValues);
  // Promote the loop body up if this has turned into a single iteration loop.
  (void)promoteIfSingleIteration(forOp);
  return success();
}
537 
538 /// Return the new lower bound, upper bound, and step in that order. Insert any
539 /// additional bounds calculations before the given builder and any additional
540 /// conversion back to the original loop induction value inside the given Block.
normalizeLoop(OpBuilder & boundsBuilder,OpBuilder & insideLoopBuilder,Location loc,Value lowerBound,Value upperBound,Value step,Value inductionVar)541 static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
542                                 OpBuilder &insideLoopBuilder, Location loc,
543                                 Value lowerBound, Value upperBound, Value step,
544                                 Value inductionVar) {
545   // Check if the loop is already known to have a constant zero lower bound or
546   // a constant one step.
547   bool isZeroBased = false;
548   if (auto ubCst = lowerBound.getDefiningOp<arith::ConstantIndexOp>())
549     isZeroBased = ubCst.value() == 0;
550 
551   bool isStepOne = false;
552   if (auto stepCst = step.getDefiningOp<arith::ConstantIndexOp>())
553     isStepOne = stepCst.value() == 1;
554 
555   // Compute the number of iterations the loop executes: ceildiv(ub - lb, step)
556   // assuming the step is strictly positive.  Update the bounds and the step
557   // of the loop to go from 0 to the number of iterations, if necessary.
558   // TODO: introduce support for negative steps or emit dynamic asserts
559   // on step positivity, whatever gets implemented first.
560   if (isZeroBased && isStepOne)
561     return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
562             /*step=*/step};
563 
564   Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
565   Value newUpperBound = ceilDivPositive(boundsBuilder, loc, diff, step);
566 
567   Value newLowerBound =
568       isZeroBased ? lowerBound
569                   : boundsBuilder.create<arith::ConstantIndexOp>(loc, 0);
570   Value newStep =
571       isStepOne ? step : boundsBuilder.create<arith::ConstantIndexOp>(loc, 1);
572 
573   // Insert code computing the value of the original loop induction variable
574   // from the "normalized" one.
575   Value scaled =
576       isStepOne
577           ? inductionVar
578           : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
579   Value shifted =
580       isZeroBased
581           ? scaled
582           : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
583 
584   SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
585                                        shifted.getDefiningOp()};
586   inductionVar.replaceAllUsesExcept(shifted, preserve);
587   return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
588           /*step=*/newStep};
589 }
590 
591 /// Transform a loop with a strictly positive step
592 ///   for %i = %lb to %ub step %s
593 /// into a 0-based loop with step 1
594 ///   for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
595 ///     %i = %ii * %s + %lb
596 /// Insert the induction variable remapping in the body of `inner`, which is
597 /// expected to be either `loop` or another loop perfectly nested under `loop`.
598 /// Insert the definition of new bounds immediate before `outer`, which is
599 /// expected to be either `loop` or its parent in the loop nest.
normalizeLoop(scf::ForOp loop,scf::ForOp outer,scf::ForOp inner)600 static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
601   OpBuilder builder(outer);
602   OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
603   auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
604                                   loop.getLowerBound(), loop.getUpperBound(),
605                                   loop.getStep(), loop.getInductionVar());
606 
607   loop.setLowerBound(loopPieces.lowerBound);
608   loop.setUpperBound(loopPieces.upperBound);
609   loop.setStep(loopPieces.step);
610 }
611 
/// Coalesce the perfect loop nest `loops` (outermost first) into the single
/// outermost loop, whose range becomes the product of all loop ranges.
void mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
  // Nothing to coalesce for a single loop.
  if (loops.size() < 2)
    return;

  scf::ForOp innermost = loops.back();
  scf::ForOp outermost = loops.front();

  // 1. Make sure all loops iterate from 0 to upperBound with step 1.  This
  // allows the following code to assume upperBound is the number of iterations.
  for (auto loop : loops)
    normalizeLoop(loop, outermost, innermost);

  // 2. Emit code computing the upper bound of the coalesced loop as product
  // of the number of iterations of all loops.
  OpBuilder builder(outermost);
  Location loc = outermost.getLoc();
  Value upperBound = outermost.getUpperBound();
  for (auto loop : loops.drop_front())
    upperBound =
        builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
  outermost.setUpperBound(upperBound);

  builder.setInsertionPointToStart(outermost.getBody());

  // 3. Remap induction variables. For each original loop, the value of the
  // induction variable can be obtained by dividing the induction variable of
  // the linearized loop by the total number of iterations of the loops nested
  // in it modulo the number of iterations in this loop (remove the values
  // related to the outer loops):
  //   iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
  // Compute these iteratively from the innermost loop by creating a "running
  // quotient" of division by the range.
  Value previous = outermost.getInductionVar();
  for (unsigned i = 0, e = loops.size(); i < e; ++i) {
    // `idx` walks from the innermost loop (loops.size() - 1) outwards.
    unsigned idx = loops.size() - i - 1;
    if (i != 0)
      previous = builder.create<arith::DivSIOp>(loc, previous,
                                                loops[idx + 1].getUpperBound());

    // The outermost loop (i == e - 1) needs no modulo: the quotient is final.
    Value iv = (i == e - 1) ? previous
                            : builder.create<arith::RemSIOp>(
                                  loc, previous, loops[idx].getUpperBound());
    replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
                               loops.back().getRegion());
  }

  // 4. Move the operations from the innermost just above the second-outermost
  // loop, delete the extra terminator and the second-outermost loop.
  scf::ForOp second = loops[1];
  innermost.getBody()->back().erase();
  outermost.getBody()->getOperations().splice(
      Block::iterator(second.getOperation()),
      innermost.getBody()->getOperations());
  second.erase();
}
667 
/// Collapses the dimensions of the scf.parallel op `loops` into groups: each
/// inner vector of `combinedDimensions` lists the indices of the original
/// induction variables that are fused into a single dimension of the new
/// scf.parallel op. The original loop is first normalized (lower bound 0,
/// step 1) so that each combined upper bound is simply the product of the
/// member iteration counts; the original induction variables are then
/// reconstructed inside the new body with div/mod arithmetic.
void mlir::collapseParallelLoops(
    scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
  OpBuilder outsideBuilder(loops);
  Location loc = loops.getLoc();

  // Presort combined dimensions.
  auto sortedDimensions = llvm::to_vector<3>(combinedDimensions);
  for (auto &dims : sortedDimensions)
    llvm::sort(dims);

  // Normalize ParallelOp's iteration pattern.
  SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
      normalizedUpperBounds;
  for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
    // Un-normalization code for the induction variable is emitted at the top
    // of the existing body; bound computations are emitted outside the loop.
    OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
    auto resultBounds =
        normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
                      loops.getLowerBound()[i], loops.getUpperBound()[i],
                      loops.getStep()[i], loops.getBody()->getArgument(i));

    normalizedLowerBounds.push_back(resultBounds.lowerBound);
    normalizedUpperBounds.push_back(resultBounds.upperBound);
    normalizedSteps.push_back(resultBounds.step);
  }

  // Combine iteration spaces.
  SmallVector<Value, 3> lowerBounds, upperBounds, steps;
  // After normalization every dimension runs from 0 with step 1, so the
  // combined dimensions share these two constants.
  auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
  auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
  for (unsigned i = 0, e = sortedDimensions.size(); i < e; ++i) {
    // Combined trip count = product of the member dimensions' trip counts.
    Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
    for (auto idx : sortedDimensions[i]) {
      newUpperBound = outsideBuilder.create<arith::MulIOp>(
          loc, newUpperBound, normalizedUpperBounds[idx]);
    }
    lowerBounds.push_back(cst0);
    steps.push_back(cst1);
    upperBounds.push_back(newUpperBound);
  }

  // Create new ParallelLoop with conversions to the original induction values.
  // The loop below uses divisions to get the relevant range of values in the
  // new induction value that represent each range of the original induction
  // value. The remainders then determine based on that range, which iteration
  // of the original induction value this represents. This is a normalized value
  // that is un-normalized already by the previous logic.
  auto newPloop = outsideBuilder.create<scf::ParallelOp>(
      loc, lowerBounds, upperBounds, steps,
      [&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
        for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
          Value previous = ploopIVs[i];
          unsigned numberCombinedDimensions = combinedDimensions[i].size();
          // Iterate over all except the last induction value.
          for (unsigned j = numberCombinedDimensions - 1; j > 0; --j) {
            unsigned idx = combinedDimensions[i][j];

            // Determine the current induction value's current loop iteration
            Value iv = insideBuilder.create<arith::RemSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
            replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx), iv,
                                       loops.getRegion());

            // Remove the effect of the current induction value to prepare for
            // the next value.
            previous = insideBuilder.create<arith::DivSIOp>(
                loc, previous, normalizedUpperBounds[idx]);
          }

          // The final induction value is just the remaining value.
          unsigned idx = combinedDimensions[i][0];
          replaceAllUsesInRegionWith(loops.getBody()->getArgument(idx),
                                     previous, loops.getRegion());
        }
      });

  // Replace the old loop with the new loop.
  // Move the old body (whose IV uses were rewritten above) into the new loop,
  // dropping the old terminator first, then erase the old loop.
  loops.getBody()->back().erase();
  newPloop.getBody()->getOperations().splice(
      Block::iterator(newPloop.getBody()->back()),
      loops.getBody()->getOperations());
  loops.erase();
}
750 
751 // Hoist the ops within `outer` that appear before `inner`.
752 // Such ops include the ops that have been introduced by parametric tiling.
753 // Ops that come from triangular loops (i.e. that belong to the program slice
754 // rooted at `outer`) and ops that have side effects cannot be hoisted.
755 // Return failure when any op fails to hoist.
hoistOpsBetween(scf::ForOp outer,scf::ForOp inner)756 static LogicalResult hoistOpsBetween(scf::ForOp outer, scf::ForOp inner) {
757   SetVector<Operation *> forwardSlice;
758   getForwardSlice(
759       outer.getInductionVar(), &forwardSlice,
760       [&inner](Operation *op) { return op != inner.getOperation(); });
761   LogicalResult status = success();
762   SmallVector<Operation *, 8> toHoist;
763   for (auto &op : outer.getBody()->without_terminator()) {
764     // Stop when encountering the inner loop.
765     if (&op == inner.getOperation())
766       break;
767     // Skip over non-hoistable ops.
768     if (forwardSlice.count(&op) > 0) {
769       status = failure();
770       continue;
771     }
772     // Skip intermediate scf::ForOp, these are not considered a failure.
773     if (isa<scf::ForOp>(op))
774       continue;
775     // Skip other ops with regions.
776     if (op.getNumRegions() > 0) {
777       status = failure();
778       continue;
779     }
780     // Skip if op has side effects.
781     // TODO: loads to immutable memory regions are ok.
782     if (!MemoryEffectOpInterface::hasNoEffect(&op)) {
783       status = failure();
784       continue;
785     }
786     toHoist.push_back(&op);
787   }
788   auto *outerForOp = outer.getOperation();
789   for (auto *op : toHoist)
790     op->moveBefore(outerForOp);
791   return status;
792 }
793 
794 // Traverse the interTile and intraTile loops and try to hoist ops such that
795 // bands of perfectly nested loops are isolated.
796 // Return failure if either perfect interTile or perfect intraTile bands cannot
797 // be formed.
tryIsolateBands(const TileLoops & tileLoops)798 static LogicalResult tryIsolateBands(const TileLoops &tileLoops) {
799   LogicalResult status = success();
800   const Loops &interTile = tileLoops.first;
801   const Loops &intraTile = tileLoops.second;
802   auto size = interTile.size();
803   assert(size == intraTile.size());
804   if (size <= 1)
805     return success();
806   for (unsigned s = 1; s < size; ++s)
807     status = succeeded(status) ? hoistOpsBetween(intraTile[0], intraTile[s])
808                                : failure();
809   for (unsigned s = 1; s < size; ++s)
810     status = succeeded(status) ? hoistOpsBetween(interTile[0], interTile[s])
811                                : failure();
812   return status;
813 }
814 
815 /// Collect perfectly nested loops starting from `rootForOps`.  Loops are
816 /// perfectly nested if each loop is the first and only non-terminator operation
817 /// in the parent loop.  Collect at most `maxLoops` loops and append them to
818 /// `forOps`.
819 template <typename T>
getPerfectlyNestedLoopsImpl(SmallVectorImpl<T> & forOps,T rootForOp,unsigned maxLoops=std::numeric_limits<unsigned>::max ())820 static void getPerfectlyNestedLoopsImpl(
821     SmallVectorImpl<T> &forOps, T rootForOp,
822     unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
823   for (unsigned i = 0; i < maxLoops; ++i) {
824     forOps.push_back(rootForOp);
825     Block &body = rootForOp.getRegion().front();
826     if (body.begin() != std::prev(body.end(), 2))
827       return;
828 
829     rootForOp = dyn_cast<T>(&body.front());
830     if (!rootForOp)
831       return;
832   }
833 }
834 
/// Strip-mines `forOp` by `factor`: multiplies `forOp`'s step by `factor`
/// and, inside each loop of `targets`, creates a new inner loop that iterates
/// with the original step over the enlarged-step chunk, clamped to `forOp`'s
/// upper bound. Each target's body ops (except the terminator) are moved into
/// the new inner loop. Returns the newly created inner loops, one per target.
static Loops stripmineSink(scf::ForOp forOp, Value factor,
                           ArrayRef<scf::ForOp> targets) {
  auto originalStep = forOp.getStep();
  auto iv = forOp.getInductionVar();

  OpBuilder b(forOp);
  // Enlarge the outer step: each outer iteration now covers `factor` chunks.
  forOp.setStep(b.create<arith::MulIOp>(forOp.getLoc(), originalStep, factor));

  Loops innerLoops;
  for (auto t : targets) {
    // Save information for splicing ops out of t when done
    auto begin = t.getBody()->begin();
    auto nOps = t.getBody()->getOperations().size();

    // Insert newForOp before the terminator of `t`.
    auto b = OpBuilder::atBlockTerminator((t.getBody()));
    Value stepped = b.create<arith::AddIOp>(t.getLoc(), iv, forOp.getStep());
    // Clamp the chunk's end to the loop bound:
    //   ub = min(forOp.upperBound, iv + enlargedStep).
    Value less = b.create<arith::CmpIOp>(t.getLoc(), arith::CmpIPredicate::slt,
                                         forOp.getUpperBound(), stepped);
    Value ub = b.create<arith::SelectOp>(t.getLoc(), less,
                                         forOp.getUpperBound(), stepped);

    // Splice [begin, begin + nOps - 1) into `newForOp` and replace uses.
    auto newForOp = b.create<scf::ForOp>(t.getLoc(), iv, ub, originalStep);
    newForOp.getBody()->getOperations().splice(
        newForOp.getBody()->getOperations().begin(),
        t.getBody()->getOperations(), begin, std::next(begin, nOps - 1));
    // Uses of the outer IV inside the moved body now refer to the inner IV.
    replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
                               newForOp.getRegion());

    innerLoops.push_back(newForOp);
  }

  return innerLoops;
}
870 
871 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
872 // Returns the new for operation, nested immediately under `target`.
873 template <typename SizeType>
stripmineSink(scf::ForOp forOp,SizeType factor,scf::ForOp target)874 static scf::ForOp stripmineSink(scf::ForOp forOp, SizeType factor,
875                                 scf::ForOp target) {
876   // TODO: Use cheap structural assertions that targets are nested under
877   // forOp and that targets are not nested under each other when DominanceInfo
878   // exposes the capability. It seems overkill to construct a whole function
879   // dominance tree at this point.
880   auto res = stripmineSink(forOp, factor, ArrayRef<scf::ForOp>(target));
881   assert(res.size() == 1 && "Expected 1 inner forOp");
882   return res[0];
883 }
884 
tile(ArrayRef<scf::ForOp> forOps,ArrayRef<Value> sizes,ArrayRef<scf::ForOp> targets)885 SmallVector<Loops, 8> mlir::tile(ArrayRef<scf::ForOp> forOps,
886                                  ArrayRef<Value> sizes,
887                                  ArrayRef<scf::ForOp> targets) {
888   SmallVector<SmallVector<scf::ForOp, 8>, 8> res;
889   SmallVector<scf::ForOp, 8> currentTargets(targets.begin(), targets.end());
890   for (auto it : llvm::zip(forOps, sizes)) {
891     auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
892     res.push_back(step);
893     currentTargets = step;
894   }
895   return res;
896 }
897 
tile(ArrayRef<scf::ForOp> forOps,ArrayRef<Value> sizes,scf::ForOp target)898 Loops mlir::tile(ArrayRef<scf::ForOp> forOps, ArrayRef<Value> sizes,
899                  scf::ForOp target) {
900   SmallVector<scf::ForOp, 8> res;
901   for (auto loops : tile(forOps, sizes, ArrayRef<scf::ForOp>(target))) {
902     assert(loops.size() == 1);
903     res.push_back(loops[0]);
904   }
905   return res;
906 }
907 
tilePerfectlyNested(scf::ForOp rootForOp,ArrayRef<Value> sizes)908 Loops mlir::tilePerfectlyNested(scf::ForOp rootForOp, ArrayRef<Value> sizes) {
909   // Collect perfectly nested loops.  If more size values provided than nested
910   // loops available, truncate `sizes`.
911   SmallVector<scf::ForOp, 4> forOps;
912   forOps.reserve(sizes.size());
913   getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
914   if (forOps.size() < sizes.size())
915     sizes = sizes.take_front(forOps.size());
916 
917   return ::tile(forOps, sizes, forOps.back());
918 }
919 
/// Appends to `nestedLoops` the perfectly nested loops rooted at `root`,
/// outermost first, with no limit on nest depth. See
/// getPerfectlyNestedLoopsImpl for the nesting criterion.
void mlir::getPerfectlyNestedLoops(SmallVectorImpl<scf::ForOp> &nestedLoops,
                                   scf::ForOp root) {
  getPerfectlyNestedLoopsImpl(nestedLoops, root);
}
924 
/// Tiles the perfect nest rooted at `rootForOp` so that the i-th outer loop
/// executes exactly sizes[i] iterations, by computing a parametric tile size
/// per loop from its current trip count. Returns the (inter-tile, intra-tile)
/// loop bands; band isolation is attempted on a best-effort basis.
TileLoops mlir::extractFixedOuterLoops(scf::ForOp rootForOp,
                                       ArrayRef<int64_t> sizes) {
  // Collect perfectly nested loops.  If more size values provided than nested
  // loops available, truncate `sizes`.
  SmallVector<scf::ForOp, 4> forOps;
  forOps.reserve(sizes.size());
  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
  if (forOps.size() < sizes.size())
    sizes = sizes.take_front(forOps.size());

  // Compute the tile sizes such that i-th outer loop executes size[i]
  // iterations.  Given that the loop current executes
  //   numIterations = ceildiv((upperBound - lowerBound), step)
  // iterations, we need to tile with size ceildiv(numIterations, size[i]).
  SmallVector<Value, 4> tileSizes;
  tileSizes.reserve(sizes.size());
  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");

    auto forOp = forOps[i];
    // Emit the trip-count and tile-size computations just before the loop.
    OpBuilder builder(forOp);
    auto loc = forOp.getLoc();
    Value diff = builder.create<arith::SubIOp>(loc, forOp.getUpperBound(),
                                               forOp.getLowerBound());
    Value numIterations = ceilDivPositive(builder, loc, diff, forOp.getStep());
    Value iterationsPerBlock =
        ceilDivPositive(builder, loc, numIterations, sizes[i]);
    tileSizes.push_back(iterationsPerBlock);
  }

  // Call parametric tiling with the given sizes.
  auto intraTile = tile(forOps, tileSizes, forOps.back());
  TileLoops tileLoops = std::make_pair(forOps, intraTile);

  // TODO: for now we just ignore the result of band isolation.
  // In the future, mapping decisions may be impacted by the ability to
  // isolate perfectly nested bands.
  (void)tryIsolateBands(tileLoops);

  return tileLoops;
}
966