1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "PassDetail.h"
14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
15 #include "mlir/Dialect/SCF/SCF.h"
16 #include "mlir/Dialect/SCF/Transforms.h"
17 #include "mlir/Dialect/SCF/Utils/Utils.h"
18 #include "mlir/IR/BlockAndValueMapping.h"
19 #include "mlir/IR/PatternMatch.h"
20 #include "mlir/Support/MathExtras.h"
21 
22 using namespace mlir;
23 using namespace mlir::scf;
24 
25 namespace {
26 
27 /// Helper to keep internal information during pipelining transformation.
28 struct LoopPipelinerInternal {
29   /// Coarse liverange information for ops used across stages.
30   struct LiverangeInfo {
31     unsigned lastUseStage = 0;
32     unsigned defStage = 0;
33   };
34 
35 protected:
36   ForOp forOp;
37   unsigned maxStage = 0;
38   DenseMap<Operation *, unsigned> stages;
39   std::vector<Operation *> opOrder;
40   int64_t ub;
41   int64_t lb;
42   int64_t step;
43   PipeliningOption::AnnotationlFnType annotateFn = nullptr;
44   bool peelEpilogue;
45   PipeliningOption::PredicateOpFn predicateFn = nullptr;
46 
47   // When peeling the kernel we generate several version of each value for
48   // different stage of the prologue. This map tracks the mapping between
49   // original Values in the loop and the different versions
50   // peeled from the loop.
51   DenseMap<Value, llvm::SmallVector<Value>> valueMapping;
52 
53   /// Assign a value to `valueMapping`, this means `val` represents the version
54   /// `idx` of `key` in the epilogue.
55   void setValueMapping(Value key, Value el, int64_t idx);
56 
57 public:
58   /// Initalize the information for the given `op`, return true if it
59   /// satisfies the pre-condition to apply pipelining.
60   bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
61   /// Emits the prologue, this creates `maxStage - 1` part which will contain
62   /// operations from stages [0; i], where i is the part index.
63   void emitPrologue(PatternRewriter &rewriter);
64   /// Gather liverange information for Values that are used in a different stage
65   /// than its definition.
66   llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
67   scf::ForOp createKernelLoop(
68       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
69       PatternRewriter &rewriter,
70       llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
71   /// Emits the pipelined kernel. This clones loop operations following user
72   /// order and remaps operands defined in a different stage as their use.
73   void createKernel(
74       scf::ForOp newForOp,
75       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
76       const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
77       PatternRewriter &rewriter);
78   /// Emits the epilogue, this creates `maxStage - 1` part which will contain
79   /// operations from stages [i; maxStage], where i is the part index.
80   llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter);
81 };
82 
83 bool LoopPipelinerInternal::initializeLoopInfo(
84     ForOp op, const PipeliningOption &options) {
85   forOp = op;
86   auto upperBoundCst =
87       forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
88   auto lowerBoundCst =
89       forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
90   auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
91   if (!upperBoundCst || !lowerBoundCst || !stepCst)
92     return false;
93   ub = upperBoundCst.value();
94   lb = lowerBoundCst.value();
95   step = stepCst.value();
96   peelEpilogue = options.peelEpilogue;
97   predicateFn = options.predicateFn;
98   if (!peelEpilogue && predicateFn == nullptr)
99     return false;
100   int64_t numIteration = ceilDiv(ub - lb, step);
101   std::vector<std::pair<Operation *, unsigned>> schedule;
102   options.getScheduleFn(forOp, schedule);
103   if (schedule.empty())
104     return false;
105 
106   opOrder.reserve(schedule.size());
107   for (auto &opSchedule : schedule) {
108     maxStage = std::max(maxStage, opSchedule.second);
109     stages[opSchedule.first] = opSchedule.second;
110     opOrder.push_back(opSchedule.first);
111   }
112   if (numIteration <= maxStage)
113     return false;
114 
115   // All operations need to have a stage.
116   if (forOp
117           .walk([this](Operation *op) {
118             if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) &&
119                 stages.find(op) == stages.end())
120               return WalkResult::interrupt();
121             return WalkResult::advance();
122           })
123           .wasInterrupted())
124     return false;
125 
126   // Only support loop carried dependency with a distance of 1. This means the
127   // source of all the scf.yield operands needs to be defined by operations in
128   // the loop.
129   if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
130                    [this](Value operand) {
131                      Operation *def = operand.getDefiningOp();
132                      return !def || stages.find(def) == stages.end();
133                    }))
134     return false;
135   annotateFn = options.annotateFn;
136   return true;
137 }
138 
139 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) {
140   // Initialize the iteration argument to the loop initiale values.
141   for (BlockArgument &arg : forOp.getRegionIterArgs()) {
142     OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
143     setValueMapping(arg, operand.get(), 0);
144   }
145   auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
146   for (int64_t i = 0; i < maxStage; i++) {
147     // special handling for induction variable as the increment is implicit.
148     Value iv =
149         rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step);
150     setValueMapping(forOp.getInductionVar(), iv, i);
151     for (Operation *op : opOrder) {
152       if (stages[op] > i)
153         continue;
154       Operation *newOp = rewriter.clone(*op);
155       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
156         auto it = valueMapping.find(op->getOperand(opIdx));
157         if (it != valueMapping.end())
158           newOp->setOperand(opIdx, it->second[i - stages[op]]);
159       }
160       if (annotateFn)
161         annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i);
162       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
163         setValueMapping(op->getResult(destId), newOp->getResult(destId),
164                         i - stages[op]);
165         // If the value is a loop carried dependency update the loop argument
166         // mapping.
167         for (OpOperand &operand : yield->getOpOperands()) {
168           if (operand.get() != op->getResult(destId))
169             continue;
170           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
171                           newOp->getResult(destId), i - stages[op] + 1);
172         }
173       }
174     }
175   }
176 }
177 
178 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
179 LoopPipelinerInternal::analyzeCrossStageValues() {
180   llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
181   for (Operation *op : opOrder) {
182     unsigned stage = stages[op];
183     for (OpOperand &operand : op->getOpOperands()) {
184       Operation *def = operand.get().getDefiningOp();
185       if (!def)
186         continue;
187       auto defStage = stages.find(def);
188       if (defStage == stages.end() || defStage->second == stage)
189         continue;
190       assert(stage > defStage->second);
191       LiverangeInfo &info = crossStageValues[operand.get()];
192       info.defStage = defStage->second;
193       info.lastUseStage = std::max(info.lastUseStage, stage);
194     }
195   }
196   return crossStageValues;
197 }
198 
199 scf::ForOp LoopPipelinerInternal::createKernelLoop(
200     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
201         &crossStageValues,
202     PatternRewriter &rewriter,
203     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
204   // Creates the list of initial values associated to values used across
205   // stages. The initial values come from the prologue created above.
206   // Keep track of the kernel argument associated to each version of the
207   // values passed to the kernel.
208   llvm::SmallVector<Value> newLoopArg;
209   // For existing loop argument initialize them with the right version from the
210   // prologue.
211   for (const auto &retVal :
212        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
213     Operation *def = retVal.value().getDefiningOp();
214     assert(def && "Only support loop carried dependencies of distance 1");
215     unsigned defStage = stages[def];
216     Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
217                                      [maxStage - defStage];
218     assert(valueVersion);
219     newLoopArg.push_back(valueVersion);
220   }
221   for (auto escape : crossStageValues) {
222     LiverangeInfo &info = escape.second;
223     Value value = escape.first;
224     for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
225          stageIdx++) {
226       Value valueVersion =
227           valueMapping[value][maxStage - info.lastUseStage + stageIdx];
228       assert(valueVersion);
229       newLoopArg.push_back(valueVersion);
230       loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
231                                            stageIdx)] = newLoopArg.size() - 1;
232     }
233   }
234 
235   // Create the new kernel loop. When we peel the epilgue we need to peel
236   // `numStages - 1` iterations. Then we adjust the upper bound to remove those
237   // iterations.
238   Value newUb = forOp.getUpperBound();
239   if (peelEpilogue)
240     newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(),
241                                                     ub - maxStage * step);
242   auto newForOp =
243       rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
244                                   forOp.getStep(), newLoopArg);
245   return newForOp;
246 }
247 
248 void LoopPipelinerInternal::createKernel(
249     scf::ForOp newForOp,
250     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
251         &crossStageValues,
252     const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
253     PatternRewriter &rewriter) {
254   valueMapping.clear();
255 
256   // Create the kernel, we clone instruction based on the order given by
257   // user and remap operands coming from a previous stages.
258   rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
259   BlockAndValueMapping mapping;
260   mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
261   for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) {
262     mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
263   }
264   SmallVector<Value> predicates(maxStage + 1, nullptr);
265   if (!peelEpilogue) {
266     // Create a predicate for each stage except the last stage.
267     for (unsigned i = 0; i < maxStage; i++) {
268       Value c = rewriter.create<arith::ConstantIndexOp>(
269           newForOp.getLoc(), ub - (maxStage - i) * step);
270       Value pred = rewriter.create<arith::CmpIOp>(
271           newForOp.getLoc(), arith::CmpIPredicate::slt,
272           newForOp.getInductionVar(), c);
273       predicates[i] = pred;
274     }
275   }
276   for (Operation *op : opOrder) {
277     int64_t useStage = stages[op];
278     auto *newOp = rewriter.clone(*op, mapping);
279     for (OpOperand &operand : op->getOpOperands()) {
280       // Special case for the induction variable uses. We replace it with a
281       // version incremented based on the stage where it is used.
282       if (operand.get() == forOp.getInductionVar()) {
283         rewriter.setInsertionPoint(newOp);
284         Value offset = rewriter.create<arith::ConstantIndexOp>(
285             forOp.getLoc(), (maxStage - stages[op]) * step);
286         Value iv = rewriter.create<arith::AddIOp>(
287             forOp.getLoc(), newForOp.getInductionVar(), offset);
288         newOp->setOperand(operand.getOperandNumber(), iv);
289         rewriter.setInsertionPointAfter(newOp);
290         continue;
291       }
292       auto arg = operand.get().dyn_cast<BlockArgument>();
293       if (arg && arg.getOwner() == forOp.getBody()) {
294         // If the value is a loop carried value coming from stage N + 1 remap,
295         // it will become a direct use.
296         Value ret = forOp.getBody()->getTerminator()->getOperand(
297             arg.getArgNumber() - 1);
298         Operation *dep = ret.getDefiningOp();
299         if (!dep)
300           continue;
301         auto stageDep = stages.find(dep);
302         if (stageDep == stages.end() || stageDep->second == useStage)
303           continue;
304         assert(stageDep->second == useStage + 1);
305         newOp->setOperand(operand.getOperandNumber(),
306                           mapping.lookupOrDefault(ret));
307         continue;
308       }
309       // For operands defined in a previous stage we need to remap it to use
310       // the correct region argument. We look for the right version of the
311       // Value based on the stage where it is used.
312       Operation *def = operand.get().getDefiningOp();
313       if (!def)
314         continue;
315       auto stageDef = stages.find(def);
316       if (stageDef == stages.end() || stageDef->second == useStage)
317         continue;
318       auto remap = loopArgMap.find(
319           std::make_pair(operand.get(), useStage - stageDef->second));
320       assert(remap != loopArgMap.end());
321       newOp->setOperand(operand.getOperandNumber(),
322                         newForOp.getRegionIterArgs()[remap->second]);
323     }
324     if (predicates[useStage]) {
325       newOp = predicateFn(newOp, predicates[useStage], rewriter);
326       // Remap the results to the new predicated one.
327       for (auto values : llvm::zip(op->getResults(), newOp->getResults()))
328         mapping.map(std::get<0>(values), std::get<1>(values));
329     }
330     rewriter.setInsertionPointAfter(newOp);
331     if (annotateFn)
332       annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0);
333   }
334 
335   // Collect the Values that need to be returned by the forOp. For each
336   // value we need to have `LastUseStage - DefStage` number of versions
337   // returned.
338   // We create a mapping between original values and the associated loop
339   // returned values that will be needed by the epilogue.
340   llvm::SmallVector<Value> yieldOperands;
341   for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) {
342     yieldOperands.push_back(mapping.lookupOrDefault(retVal));
343   }
344   for (auto &it : crossStageValues) {
345     int64_t version = maxStage - it.second.lastUseStage + 1;
346     unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
347     // add the original verstion to yield ops.
348     // If there is a liverange spanning across more than 2 stages we need to add
349     // extra arg.
350     for (unsigned i = 1; i < numVersionReturned; i++) {
351       setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
352                       version++);
353       yieldOperands.push_back(
354           newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
355                                              newForOp.getNumInductionVars()]);
356     }
357     setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
358                     version++);
359     yieldOperands.push_back(mapping.lookupOrDefault(it.first));
360   }
361   // Map the yield operand to the forOp returned value.
362   for (const auto &retVal :
363        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
364     Operation *def = retVal.value().getDefiningOp();
365     assert(def && "Only support loop carried dependencies of distance 1");
366     unsigned defStage = stages[def];
367     setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
368                     newForOp->getResult(retVal.index()),
369                     maxStage - defStage + 1);
370   }
371   rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
372 }
373 
374 llvm::SmallVector<Value>
375 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) {
376   llvm::SmallVector<Value> returnValues(forOp->getNumResults());
377   // Emit different versions of the induction variable. They will be
378   // removed by dead code if not used.
379   for (int64_t i = 0; i < maxStage; i++) {
380     Value newlastIter = rewriter.create<arith::ConstantIndexOp>(
381         forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i));
382     setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
383   }
384   // Emit `maxStage - 1` epilogue part that includes operations fro stages
385   // [i; maxStage].
386   for (int64_t i = 1; i <= maxStage; i++) {
387     for (Operation *op : opOrder) {
388       if (stages[op] < i)
389         continue;
390       Operation *newOp = rewriter.clone(*op);
391       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
392         auto it = valueMapping.find(op->getOperand(opIdx));
393         if (it != valueMapping.end()) {
394           Value v = it->second[maxStage - stages[op] + i];
395           assert(v);
396           newOp->setOperand(opIdx, v);
397         }
398       }
399       if (annotateFn)
400         annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1);
401       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
402         setValueMapping(op->getResult(destId), newOp->getResult(destId),
403                         maxStage - stages[op] + i);
404         // If the value is a loop carried dependency update the loop argument
405         // mapping and keep track of the last version to replace the original
406         // forOp uses.
407         for (OpOperand &operand :
408              forOp.getBody()->getTerminator()->getOpOperands()) {
409           if (operand.get() != op->getResult(destId))
410             continue;
411           unsigned version = maxStage - stages[op] + i + 1;
412           // If the version is greater than maxStage it means it maps to the
413           // original forOp returned value.
414           if (version > maxStage) {
415             returnValues[operand.getOperandNumber()] = newOp->getResult(destId);
416             continue;
417           }
418           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
419                           newOp->getResult(destId), version);
420         }
421       }
422     }
423   }
424   return returnValues;
425 }
426 
427 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
428   auto it = valueMapping.find(key);
429   // If the value is not in the map yet add a vector big enough to store all
430   // versions.
431   if (it == valueMapping.end())
432     it =
433         valueMapping
434             .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
435             .first;
436   it->second[idx] = el;
437 }
438 
439 /// Generate a pipelined version of the scf.for loop based on the schedule given
440 /// as option. This applies the mechanical transformation of changing the loop
441 /// and generating the prologue/epilogue for the pipelining and doesn't make any
442 /// decision regarding the schedule.
443 /// Based on the option the loop is split into several stages.
444 /// The transformation assumes that the scheduling given by user is valid.
445 /// For example if we break a loop into 3 stages named S0, S1, S2 we would
446 /// generate the following code with the number in parenthesis the iteration
447 /// index:
448 /// S0(0)                        // Prologue
449 /// S0(1) S1(0)                  // Prologue
450 /// scf.for %I = %C0 to %N - 2 {
451 ///  S0(I+2) S1(I+1) S2(I)       // Pipelined kernel
452 /// }
453 /// S1(N) S2(N-1)                // Epilogue
454 /// S2(N)                        // Epilogue
455 struct ForLoopPipelining : public OpRewritePattern<ForOp> {
456   ForLoopPipelining(const PipeliningOption &options, MLIRContext *context)
457       : OpRewritePattern<ForOp>(context), options(options) {}
458   LogicalResult matchAndRewrite(ForOp forOp,
459                                 PatternRewriter &rewriter) const override {
460 
461     LoopPipelinerInternal pipeliner;
462     if (!pipeliner.initializeLoopInfo(forOp, options))
463       return failure();
464 
465     // 1. Emit prologue.
466     pipeliner.emitPrologue(rewriter);
467 
468     // 2. Track values used across stages. When a value cross stages it will
469     // need to be passed as loop iteration arguments.
470     // We first collect the values that are used in a different stage than where
471     // they are defined.
472     llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
473         crossStageValues = pipeliner.analyzeCrossStageValues();
474 
475     // Mapping between original loop values used cross stage and the block
476     // arguments associated after pipelining. A Value may map to several
477     // arguments if its liverange spans across more than 2 stages.
478     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
479     // 3. Create the new kernel loop and return the block arguments mapping.
480     ForOp newForOp =
481         pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
482     // Create the kernel block, order ops based on user choice and remap
483     // operands.
484     pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter);
485 
486     llvm::SmallVector<Value> returnValues =
487         newForOp.getResults().take_front(forOp->getNumResults());
488     if (options.peelEpilogue) {
489       // 4. Emit the epilogue after the new forOp.
490       rewriter.setInsertionPointAfter(newForOp);
491       returnValues = pipeliner.emitEpilogue(rewriter);
492     }
493     // 5. Erase the original loop and replace the uses with the epilogue output.
494     if (forOp->getNumResults() > 0)
495       rewriter.replaceOp(forOp, returnValues);
496     else
497       rewriter.eraseOp(forOp);
498 
499     return success();
500   }
501 
502 protected:
503   PipeliningOption options;
504 };
505 
506 } // namespace
507 
508 void mlir::scf::populateSCFLoopPipeliningPatterns(
509     RewritePatternSet &patterns, const PipeliningOption &options) {
510   patterns.add<ForLoopPipelining>(options, patterns.getContext());
511 }
512