1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "PassDetail.h"
14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
15 #include "mlir/Dialect/SCF/IR/SCF.h"
16 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
17 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
18 #include "mlir/Dialect/SCF/Utils/Utils.h"
19 #include "mlir/IR/BlockAndValueMapping.h"
20 #include "mlir/IR/PatternMatch.h"
21 #include "mlir/Support/MathExtras.h"
22 
23 using namespace mlir;
24 using namespace mlir::scf;
25 
26 namespace {
27 
28 /// Helper to keep internal information during pipelining transformation.
29 struct LoopPipelinerInternal {
30   /// Coarse liverange information for ops used across stages.
31   struct LiverangeInfo {
32     unsigned lastUseStage = 0;
33     unsigned defStage = 0;
34   };
35 
36 protected:
37   ForOp forOp;
38   unsigned maxStage = 0;
39   DenseMap<Operation *, unsigned> stages;
40   std::vector<Operation *> opOrder;
41   int64_t ub;
42   int64_t lb;
43   int64_t step;
44   PipeliningOption::AnnotationlFnType annotateFn = nullptr;
45   bool peelEpilogue;
46   PipeliningOption::PredicateOpFn predicateFn = nullptr;
47 
48   // When peeling the kernel we generate several version of each value for
49   // different stage of the prologue. This map tracks the mapping between
50   // original Values in the loop and the different versions
51   // peeled from the loop.
52   DenseMap<Value, llvm::SmallVector<Value>> valueMapping;
53 
54   /// Assign a value to `valueMapping`, this means `val` represents the version
55   /// `idx` of `key` in the epilogue.
56   void setValueMapping(Value key, Value el, int64_t idx);
57 
58 public:
59   /// Initalize the information for the given `op`, return true if it
60   /// satisfies the pre-condition to apply pipelining.
61   bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
62   /// Emits the prologue, this creates `maxStage - 1` part which will contain
63   /// operations from stages [0; i], where i is the part index.
64   void emitPrologue(PatternRewriter &rewriter);
65   /// Gather liverange information for Values that are used in a different stage
66   /// than its definition.
67   llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
68   scf::ForOp createKernelLoop(
69       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
70       PatternRewriter &rewriter,
71       llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
72   /// Emits the pipelined kernel. This clones loop operations following user
73   /// order and remaps operands defined in a different stage as their use.
74   void createKernel(
75       scf::ForOp newForOp,
76       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
77       const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
78       PatternRewriter &rewriter);
79   /// Emits the epilogue, this creates `maxStage - 1` part which will contain
80   /// operations from stages [i; maxStage], where i is the part index.
81   llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter);
82 };
83 
84 bool LoopPipelinerInternal::initializeLoopInfo(
85     ForOp op, const PipeliningOption &options) {
86   forOp = op;
87   auto upperBoundCst =
88       forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
89   auto lowerBoundCst =
90       forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
91   auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
92   if (!upperBoundCst || !lowerBoundCst || !stepCst)
93     return false;
94   ub = upperBoundCst.value();
95   lb = lowerBoundCst.value();
96   step = stepCst.value();
97   peelEpilogue = options.peelEpilogue;
98   predicateFn = options.predicateFn;
99   if (!peelEpilogue && predicateFn == nullptr)
100     return false;
101   int64_t numIteration = ceilDiv(ub - lb, step);
102   std::vector<std::pair<Operation *, unsigned>> schedule;
103   options.getScheduleFn(forOp, schedule);
104   if (schedule.empty())
105     return false;
106 
107   opOrder.reserve(schedule.size());
108   for (auto &opSchedule : schedule) {
109     maxStage = std::max(maxStage, opSchedule.second);
110     stages[opSchedule.first] = opSchedule.second;
111     opOrder.push_back(opSchedule.first);
112   }
113   if (numIteration <= maxStage)
114     return false;
115 
116   // All operations need to have a stage.
117   if (forOp
118           .walk([this](Operation *op) {
119             if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) &&
120                 stages.find(op) == stages.end())
121               return WalkResult::interrupt();
122             return WalkResult::advance();
123           })
124           .wasInterrupted())
125     return false;
126 
127   // Only support loop carried dependency with a distance of 1. This means the
128   // source of all the scf.yield operands needs to be defined by operations in
129   // the loop.
130   if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
131                    [this](Value operand) {
132                      Operation *def = operand.getDefiningOp();
133                      return !def || stages.find(def) == stages.end();
134                    }))
135     return false;
136   annotateFn = options.annotateFn;
137   return true;
138 }
139 
140 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) {
141   // Initialize the iteration argument to the loop initiale values.
142   for (BlockArgument &arg : forOp.getRegionIterArgs()) {
143     OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
144     setValueMapping(arg, operand.get(), 0);
145   }
146   auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
147   for (int64_t i = 0; i < maxStage; i++) {
148     // special handling for induction variable as the increment is implicit.
149     Value iv =
150         rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step);
151     setValueMapping(forOp.getInductionVar(), iv, i);
152     for (Operation *op : opOrder) {
153       if (stages[op] > i)
154         continue;
155       Operation *newOp = rewriter.clone(*op);
156       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
157         auto it = valueMapping.find(op->getOperand(opIdx));
158         if (it != valueMapping.end())
159           newOp->setOperand(opIdx, it->second[i - stages[op]]);
160       }
161       if (annotateFn)
162         annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i);
163       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
164         setValueMapping(op->getResult(destId), newOp->getResult(destId),
165                         i - stages[op]);
166         // If the value is a loop carried dependency update the loop argument
167         // mapping.
168         for (OpOperand &operand : yield->getOpOperands()) {
169           if (operand.get() != op->getResult(destId))
170             continue;
171           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
172                           newOp->getResult(destId), i - stages[op] + 1);
173         }
174       }
175     }
176   }
177 }
178 
179 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
180 LoopPipelinerInternal::analyzeCrossStageValues() {
181   llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
182   for (Operation *op : opOrder) {
183     unsigned stage = stages[op];
184     for (OpOperand &operand : op->getOpOperands()) {
185       Operation *def = operand.get().getDefiningOp();
186       if (!def)
187         continue;
188       auto defStage = stages.find(def);
189       if (defStage == stages.end() || defStage->second == stage)
190         continue;
191       assert(stage > defStage->second);
192       LiverangeInfo &info = crossStageValues[operand.get()];
193       info.defStage = defStage->second;
194       info.lastUseStage = std::max(info.lastUseStage, stage);
195     }
196   }
197   return crossStageValues;
198 }
199 
200 scf::ForOp LoopPipelinerInternal::createKernelLoop(
201     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
202         &crossStageValues,
203     PatternRewriter &rewriter,
204     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
205   // Creates the list of initial values associated to values used across
206   // stages. The initial values come from the prologue created above.
207   // Keep track of the kernel argument associated to each version of the
208   // values passed to the kernel.
209   llvm::SmallVector<Value> newLoopArg;
210   // For existing loop argument initialize them with the right version from the
211   // prologue.
212   for (const auto &retVal :
213        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
214     Operation *def = retVal.value().getDefiningOp();
215     assert(def && "Only support loop carried dependencies of distance 1");
216     unsigned defStage = stages[def];
217     Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
218                                      [maxStage - defStage];
219     assert(valueVersion);
220     newLoopArg.push_back(valueVersion);
221   }
222   for (auto escape : crossStageValues) {
223     LiverangeInfo &info = escape.second;
224     Value value = escape.first;
225     for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
226          stageIdx++) {
227       Value valueVersion =
228           valueMapping[value][maxStage - info.lastUseStage + stageIdx];
229       assert(valueVersion);
230       newLoopArg.push_back(valueVersion);
231       loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
232                                            stageIdx)] = newLoopArg.size() - 1;
233     }
234   }
235 
236   // Create the new kernel loop. When we peel the epilgue we need to peel
237   // `numStages - 1` iterations. Then we adjust the upper bound to remove those
238   // iterations.
239   Value newUb = forOp.getUpperBound();
240   if (peelEpilogue)
241     newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(),
242                                                     ub - maxStage * step);
243   auto newForOp =
244       rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
245                                   forOp.getStep(), newLoopArg);
246   return newForOp;
247 }
248 
249 void LoopPipelinerInternal::createKernel(
250     scf::ForOp newForOp,
251     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
252         &crossStageValues,
253     const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
254     PatternRewriter &rewriter) {
255   valueMapping.clear();
256 
257   // Create the kernel, we clone instruction based on the order given by
258   // user and remap operands coming from a previous stages.
259   rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
260   BlockAndValueMapping mapping;
261   mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
262   for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) {
263     mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
264   }
265   SmallVector<Value> predicates(maxStage + 1, nullptr);
266   if (!peelEpilogue) {
267     // Create a predicate for each stage except the last stage.
268     for (unsigned i = 0; i < maxStage; i++) {
269       Value c = rewriter.create<arith::ConstantIndexOp>(
270           newForOp.getLoc(), ub - (maxStage - i) * step);
271       Value pred = rewriter.create<arith::CmpIOp>(
272           newForOp.getLoc(), arith::CmpIPredicate::slt,
273           newForOp.getInductionVar(), c);
274       predicates[i] = pred;
275     }
276   }
277   for (Operation *op : opOrder) {
278     int64_t useStage = stages[op];
279     auto *newOp = rewriter.clone(*op, mapping);
280     for (OpOperand &operand : op->getOpOperands()) {
281       // Special case for the induction variable uses. We replace it with a
282       // version incremented based on the stage where it is used.
283       if (operand.get() == forOp.getInductionVar()) {
284         rewriter.setInsertionPoint(newOp);
285         Value offset = rewriter.create<arith::ConstantIndexOp>(
286             forOp.getLoc(), (maxStage - stages[op]) * step);
287         Value iv = rewriter.create<arith::AddIOp>(
288             forOp.getLoc(), newForOp.getInductionVar(), offset);
289         newOp->setOperand(operand.getOperandNumber(), iv);
290         rewriter.setInsertionPointAfter(newOp);
291         continue;
292       }
293       auto arg = operand.get().dyn_cast<BlockArgument>();
294       if (arg && arg.getOwner() == forOp.getBody()) {
295         // If the value is a loop carried value coming from stage N + 1 remap,
296         // it will become a direct use.
297         Value ret = forOp.getBody()->getTerminator()->getOperand(
298             arg.getArgNumber() - 1);
299         Operation *dep = ret.getDefiningOp();
300         if (!dep)
301           continue;
302         auto stageDep = stages.find(dep);
303         if (stageDep == stages.end() || stageDep->second == useStage)
304           continue;
305         assert(stageDep->second == useStage + 1);
306         newOp->setOperand(operand.getOperandNumber(),
307                           mapping.lookupOrDefault(ret));
308         continue;
309       }
310       // For operands defined in a previous stage we need to remap it to use
311       // the correct region argument. We look for the right version of the
312       // Value based on the stage where it is used.
313       Operation *def = operand.get().getDefiningOp();
314       if (!def)
315         continue;
316       auto stageDef = stages.find(def);
317       if (stageDef == stages.end() || stageDef->second == useStage)
318         continue;
319       auto remap = loopArgMap.find(
320           std::make_pair(operand.get(), useStage - stageDef->second));
321       assert(remap != loopArgMap.end());
322       newOp->setOperand(operand.getOperandNumber(),
323                         newForOp.getRegionIterArgs()[remap->second]);
324     }
325     if (predicates[useStage]) {
326       newOp = predicateFn(newOp, predicates[useStage], rewriter);
327       // Remap the results to the new predicated one.
328       for (auto values : llvm::zip(op->getResults(), newOp->getResults()))
329         mapping.map(std::get<0>(values), std::get<1>(values));
330     }
331     rewriter.setInsertionPointAfter(newOp);
332     if (annotateFn)
333       annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0);
334   }
335 
336   // Collect the Values that need to be returned by the forOp. For each
337   // value we need to have `LastUseStage - DefStage` number of versions
338   // returned.
339   // We create a mapping between original values and the associated loop
340   // returned values that will be needed by the epilogue.
341   llvm::SmallVector<Value> yieldOperands;
342   for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) {
343     yieldOperands.push_back(mapping.lookupOrDefault(retVal));
344   }
345   for (auto &it : crossStageValues) {
346     int64_t version = maxStage - it.second.lastUseStage + 1;
347     unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
348     // add the original verstion to yield ops.
349     // If there is a liverange spanning across more than 2 stages we need to add
350     // extra arg.
351     for (unsigned i = 1; i < numVersionReturned; i++) {
352       setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
353                       version++);
354       yieldOperands.push_back(
355           newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
356                                              newForOp.getNumInductionVars()]);
357     }
358     setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
359                     version++);
360     yieldOperands.push_back(mapping.lookupOrDefault(it.first));
361   }
362   // Map the yield operand to the forOp returned value.
363   for (const auto &retVal :
364        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
365     Operation *def = retVal.value().getDefiningOp();
366     assert(def && "Only support loop carried dependencies of distance 1");
367     unsigned defStage = stages[def];
368     setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
369                     newForOp->getResult(retVal.index()),
370                     maxStage - defStage + 1);
371   }
372   rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
373 }
374 
375 llvm::SmallVector<Value>
376 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) {
377   llvm::SmallVector<Value> returnValues(forOp->getNumResults());
378   // Emit different versions of the induction variable. They will be
379   // removed by dead code if not used.
380   for (int64_t i = 0; i < maxStage; i++) {
381     Value newlastIter = rewriter.create<arith::ConstantIndexOp>(
382         forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i));
383     setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
384   }
385   // Emit `maxStage - 1` epilogue part that includes operations fro stages
386   // [i; maxStage].
387   for (int64_t i = 1; i <= maxStage; i++) {
388     for (Operation *op : opOrder) {
389       if (stages[op] < i)
390         continue;
391       Operation *newOp = rewriter.clone(*op);
392       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
393         auto it = valueMapping.find(op->getOperand(opIdx));
394         if (it != valueMapping.end()) {
395           Value v = it->second[maxStage - stages[op] + i];
396           assert(v);
397           newOp->setOperand(opIdx, v);
398         }
399       }
400       if (annotateFn)
401         annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1);
402       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
403         setValueMapping(op->getResult(destId), newOp->getResult(destId),
404                         maxStage - stages[op] + i);
405         // If the value is a loop carried dependency update the loop argument
406         // mapping and keep track of the last version to replace the original
407         // forOp uses.
408         for (OpOperand &operand :
409              forOp.getBody()->getTerminator()->getOpOperands()) {
410           if (operand.get() != op->getResult(destId))
411             continue;
412           unsigned version = maxStage - stages[op] + i + 1;
413           // If the version is greater than maxStage it means it maps to the
414           // original forOp returned value.
415           if (version > maxStage) {
416             returnValues[operand.getOperandNumber()] = newOp->getResult(destId);
417             continue;
418           }
419           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
420                           newOp->getResult(destId), version);
421         }
422       }
423     }
424   }
425   return returnValues;
426 }
427 
428 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
429   auto it = valueMapping.find(key);
430   // If the value is not in the map yet add a vector big enough to store all
431   // versions.
432   if (it == valueMapping.end())
433     it =
434         valueMapping
435             .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
436             .first;
437   it->second[idx] = el;
438 }
439 
440 } // namespace
441 
442 FailureOr<ForOp> ForLoopPipeliningPattern::returningMatchAndRewrite(
443     ForOp forOp, PatternRewriter &rewriter) const {
444 
445   LoopPipelinerInternal pipeliner;
446   if (!pipeliner.initializeLoopInfo(forOp, options))
447     return failure();
448 
449   // 1. Emit prologue.
450   pipeliner.emitPrologue(rewriter);
451 
452   // 2. Track values used across stages. When a value cross stages it will
453   // need to be passed as loop iteration arguments.
454   // We first collect the values that are used in a different stage than where
455   // they are defined.
456   llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
457       crossStageValues = pipeliner.analyzeCrossStageValues();
458 
459   // Mapping between original loop values used cross stage and the block
460   // arguments associated after pipelining. A Value may map to several
461   // arguments if its liverange spans across more than 2 stages.
462   llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
463   // 3. Create the new kernel loop and return the block arguments mapping.
464   ForOp newForOp =
465       pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
466   // Create the kernel block, order ops based on user choice and remap
467   // operands.
468   pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter);
469 
470   llvm::SmallVector<Value> returnValues =
471       newForOp.getResults().take_front(forOp->getNumResults());
472   if (options.peelEpilogue) {
473     // 4. Emit the epilogue after the new forOp.
474     rewriter.setInsertionPointAfter(newForOp);
475     returnValues = pipeliner.emitEpilogue(rewriter);
476   }
477   // 5. Erase the original loop and replace the uses with the epilogue output.
478   if (forOp->getNumResults() > 0)
479     rewriter.replaceOp(forOp, returnValues);
480   else
481     rewriter.eraseOp(forOp);
482 
483   return newForOp;
484 }
485 
486 void mlir::scf::populateSCFLoopPipeliningPatterns(
487     RewritePatternSet &patterns, const PipeliningOption &options) {
488   patterns.add<ForLoopPipeliningPattern>(options, patterns.getContext());
489 }
490