1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "PassDetail.h"
14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
15 #include "mlir/Dialect/SCF/SCF.h"
16 #include "mlir/Dialect/SCF/Transforms.h"
17 #include "mlir/Dialect/SCF/Utils/Utils.h"
18 #include "mlir/Dialect/StandardOps/IR/Ops.h"
19 #include "mlir/IR/BlockAndValueMapping.h"
20 #include "mlir/IR/PatternMatch.h"
21 #include "mlir/Support/MathExtras.h"
22 
23 using namespace mlir;
24 using namespace mlir::scf;
25 
26 namespace {
27 
28 /// Helper to keep internal information during pipelining transformation.
29 struct LoopPipelinerInternal {
30   /// Coarse liverange information for ops used across stages.
31   struct LiverangeInfo {
32     unsigned lastUseStage = 0;
33     unsigned defStage = 0;
34   };
35 
36 protected:
37   ForOp forOp;
38   unsigned maxStage = 0;
39   DenseMap<Operation *, unsigned> stages;
40   std::vector<Operation *> opOrder;
41   int64_t ub;
42   int64_t lb;
43   int64_t step;
44   PipeliningOption::AnnotationlFnType annotateFn = nullptr;
45 
46   // When peeling the kernel we generate several version of each value for
47   // different stage of the prologue. This map tracks the mapping between
48   // original Values in the loop and the different versions
49   // peeled from the loop.
50   DenseMap<Value, llvm::SmallVector<Value>> valueMapping;
51 
52   /// Assign a value to `valueMapping`, this means `val` represents the version
53   /// `idx` of `key` in the epilogue.
54   void setValueMapping(Value key, Value el, int64_t idx);
55 
56 public:
57   /// Initalize the information for the given `op`, return true if it
58   /// satisfies the pre-condition to apply pipelining.
59   bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
60   /// Emits the prologue, this creates `maxStage - 1` part which will contain
61   /// operations from stages [0; i], where i is the part index.
62   void emitPrologue(PatternRewriter &rewriter);
63   /// Gather liverange information for Values that are used in a different stage
64   /// than its definition.
65   llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
66   scf::ForOp createKernelLoop(
67       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
68       PatternRewriter &rewriter,
69       llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
70   /// Emits the pipelined kernel. This clones loop operations following user
71   /// order and remaps operands defined in a different stage as their use.
72   void createKernel(
73       scf::ForOp newForOp,
74       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
75       const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
76       PatternRewriter &rewriter);
77   /// Emits the epilogue, this creates `maxStage - 1` part which will contain
78   /// operations from stages [i; maxStage], where i is the part index.
79   llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter);
80 };
81 
82 bool LoopPipelinerInternal::initializeLoopInfo(
83     ForOp op, const PipeliningOption &options) {
84   forOp = op;
85   auto upperBoundCst =
86       forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
87   auto lowerBoundCst =
88       forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
89   auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
90   if (!upperBoundCst || !lowerBoundCst || !stepCst)
91     return false;
92   ub = upperBoundCst.value();
93   lb = lowerBoundCst.value();
94   step = stepCst.value();
95   int64_t numIteration = ceilDiv(ub - lb, step);
96   std::vector<std::pair<Operation *, unsigned>> schedule;
97   options.getScheduleFn(forOp, schedule);
98   if (schedule.empty())
99     return false;
100 
101   opOrder.reserve(schedule.size());
102   for (auto &opSchedule : schedule) {
103     maxStage = std::max(maxStage, opSchedule.second);
104     stages[opSchedule.first] = opSchedule.second;
105     opOrder.push_back(opSchedule.first);
106   }
107   if (numIteration <= maxStage)
108     return false;
109 
110   // All operations need to have a stage.
111   if (forOp
112           .walk([this](Operation *op) {
113             if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) &&
114                 stages.find(op) == stages.end())
115               return WalkResult::interrupt();
116             return WalkResult::advance();
117           })
118           .wasInterrupted())
119     return false;
120 
121   // Only support loop carried dependency with a distance of 1. This means the
122   // source of all the scf.yield operands needs to be defined by operations in
123   // the loop.
124   if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
125                    [this](Value operand) {
126                      Operation *def = operand.getDefiningOp();
127                      return !def || stages.find(def) == stages.end();
128                    }))
129     return false;
130   annotateFn = options.annotateFn;
131   return true;
132 }
133 
134 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) {
135   // Initialize the iteration argument to the loop initiale values.
136   for (BlockArgument &arg : forOp.getRegionIterArgs()) {
137     OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
138     setValueMapping(arg, operand.get(), 0);
139   }
140   auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
141   for (int64_t i = 0; i < maxStage; i++) {
142     // special handling for induction variable as the increment is implicit.
143     Value iv =
144         rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step);
145     setValueMapping(forOp.getInductionVar(), iv, i);
146     for (Operation *op : opOrder) {
147       if (stages[op] > i)
148         continue;
149       Operation *newOp = rewriter.clone(*op);
150       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
151         auto it = valueMapping.find(op->getOperand(opIdx));
152         if (it != valueMapping.end())
153           newOp->setOperand(opIdx, it->second[i - stages[op]]);
154       }
155       if (annotateFn)
156         annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i);
157       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
158         setValueMapping(op->getResult(destId), newOp->getResult(destId),
159                         i - stages[op]);
160         // If the value is a loop carried dependency update the loop argument
161         // mapping.
162         for (OpOperand &operand : yield->getOpOperands()) {
163           if (operand.get() != op->getResult(destId))
164             continue;
165           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
166                           newOp->getResult(destId), i - stages[op] + 1);
167         }
168       }
169     }
170   }
171 }
172 
173 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
174 LoopPipelinerInternal::analyzeCrossStageValues() {
175   llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
176   for (Operation *op : opOrder) {
177     unsigned stage = stages[op];
178     for (OpOperand &operand : op->getOpOperands()) {
179       Operation *def = operand.get().getDefiningOp();
180       if (!def)
181         continue;
182       auto defStage = stages.find(def);
183       if (defStage == stages.end() || defStage->second == stage)
184         continue;
185       assert(stage > defStage->second);
186       LiverangeInfo &info = crossStageValues[operand.get()];
187       info.defStage = defStage->second;
188       info.lastUseStage = std::max(info.lastUseStage, stage);
189     }
190   }
191   return crossStageValues;
192 }
193 
194 scf::ForOp LoopPipelinerInternal::createKernelLoop(
195     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
196         &crossStageValues,
197     PatternRewriter &rewriter,
198     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
199   // Creates the list of initial values associated to values used across
200   // stages. The initial values come from the prologue created above.
201   // Keep track of the kernel argument associated to each version of the
202   // values passed to the kernel.
203   llvm::SmallVector<Value> newLoopArg;
204   // For existing loop argument initialize them with the right version from the
205   // prologue.
206   for (const auto &retVal :
207        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
208     Operation *def = retVal.value().getDefiningOp();
209     assert(def && "Only support loop carried dependencies of distance 1");
210     unsigned defStage = stages[def];
211     Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
212                                      [maxStage - defStage];
213     assert(valueVersion);
214     newLoopArg.push_back(valueVersion);
215   }
216   for (auto escape : crossStageValues) {
217     LiverangeInfo &info = escape.second;
218     Value value = escape.first;
219     for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
220          stageIdx++) {
221       Value valueVersion =
222           valueMapping[value][maxStage - info.lastUseStage + stageIdx];
223       assert(valueVersion);
224       newLoopArg.push_back(valueVersion);
225       loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
226                                            stageIdx)] = newLoopArg.size() - 1;
227     }
228   }
229 
230   // Create the new kernel loop. Since we need to peel `numStages - 1`
231   // iteration we change the upper bound to remove those iterations.
232   Value newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(),
233                                                         ub - maxStage * step);
234   auto newForOp =
235       rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
236                                   forOp.getStep(), newLoopArg);
237   return newForOp;
238 }
239 
240 void LoopPipelinerInternal::createKernel(
241     scf::ForOp newForOp,
242     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
243         &crossStageValues,
244     const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
245     PatternRewriter &rewriter) {
246   valueMapping.clear();
247 
248   // Create the kernel, we clone instruction based on the order given by
249   // user and remap operands coming from a previous stages.
250   rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
251   BlockAndValueMapping mapping;
252   mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
253   for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) {
254     mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
255   }
256   for (Operation *op : opOrder) {
257     int64_t useStage = stages[op];
258     auto *newOp = rewriter.clone(*op, mapping);
259     for (OpOperand &operand : op->getOpOperands()) {
260       // Special case for the induction variable uses. We replace it with a
261       // version incremented based on the stage where it is used.
262       if (operand.get() == forOp.getInductionVar()) {
263         rewriter.setInsertionPoint(newOp);
264         Value offset = rewriter.create<arith::ConstantIndexOp>(
265             forOp.getLoc(), (maxStage - stages[op]) * step);
266         Value iv = rewriter.create<arith::AddIOp>(
267             forOp.getLoc(), newForOp.getInductionVar(), offset);
268         newOp->setOperand(operand.getOperandNumber(), iv);
269         rewriter.setInsertionPointAfter(newOp);
270         continue;
271       }
272       auto arg = operand.get().dyn_cast<BlockArgument>();
273       if (arg && arg.getOwner() == forOp.getBody()) {
274         // If the value is a loop carried value coming from stage N + 1 remap,
275         // it will become a direct use.
276         Value ret = forOp.getBody()->getTerminator()->getOperand(
277             arg.getArgNumber() - 1);
278         Operation *dep = ret.getDefiningOp();
279         if (!dep)
280           continue;
281         auto stageDep = stages.find(dep);
282         if (stageDep == stages.end() || stageDep->second == useStage)
283           continue;
284         assert(stageDep->second == useStage + 1);
285         newOp->setOperand(operand.getOperandNumber(),
286                           mapping.lookupOrDefault(ret));
287         continue;
288       }
289       // For operands defined in a previous stage we need to remap it to use
290       // the correct region argument. We look for the right version of the
291       // Value based on the stage where it is used.
292       Operation *def = operand.get().getDefiningOp();
293       if (!def)
294         continue;
295       auto stageDef = stages.find(def);
296       if (stageDef == stages.end() || stageDef->second == useStage)
297         continue;
298       auto remap = loopArgMap.find(
299           std::make_pair(operand.get(), useStage - stageDef->second));
300       assert(remap != loopArgMap.end());
301       newOp->setOperand(operand.getOperandNumber(),
302                         newForOp.getRegionIterArgs()[remap->second]);
303     }
304     if (annotateFn)
305       annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0);
306   }
307 
308   // Collect the Values that need to be returned by the forOp. For each
309   // value we need to have `LastUseStage - DefStage` number of versions
310   // returned.
311   // We create a mapping between original values and the associated loop
312   // returned values that will be needed by the epilogue.
313   llvm::SmallVector<Value> yieldOperands;
314   for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) {
315     yieldOperands.push_back(mapping.lookupOrDefault(retVal));
316   }
317   for (auto &it : crossStageValues) {
318     int64_t version = maxStage - it.second.lastUseStage + 1;
319     unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
320     // add the original verstion to yield ops.
321     // If there is a liverange spanning across more than 2 stages we need to add
322     // extra arg.
323     for (unsigned i = 1; i < numVersionReturned; i++) {
324       setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
325                       version++);
326       yieldOperands.push_back(
327           newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
328                                              newForOp.getNumInductionVars()]);
329     }
330     setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
331                     version++);
332     yieldOperands.push_back(mapping.lookupOrDefault(it.first));
333   }
334   // Map the yield operand to the forOp returned value.
335   for (const auto &retVal :
336        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
337     Operation *def = retVal.value().getDefiningOp();
338     assert(def && "Only support loop carried dependencies of distance 1");
339     unsigned defStage = stages[def];
340     setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
341                     newForOp->getResult(retVal.index()),
342                     maxStage - defStage + 1);
343   }
344   rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
345 }
346 
347 llvm::SmallVector<Value>
348 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) {
349   llvm::SmallVector<Value> returnValues(forOp->getNumResults());
350   // Emit different versions of the induction variable. They will be
351   // removed by dead code if not used.
352   for (int64_t i = 0; i < maxStage; i++) {
353     Value newlastIter = rewriter.create<arith::ConstantIndexOp>(
354         forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i));
355     setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
356   }
357   // Emit `maxStage - 1` epilogue part that includes operations fro stages
358   // [i; maxStage].
359   for (int64_t i = 1; i <= maxStage; i++) {
360     for (Operation *op : opOrder) {
361       if (stages[op] < i)
362         continue;
363       Operation *newOp = rewriter.clone(*op);
364       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
365         auto it = valueMapping.find(op->getOperand(opIdx));
366         if (it != valueMapping.end()) {
367           Value v = it->second[maxStage - stages[op] + i];
368           assert(v);
369           newOp->setOperand(opIdx, v);
370         }
371       }
372       if (annotateFn)
373         annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1);
374       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
375         setValueMapping(op->getResult(destId), newOp->getResult(destId),
376                         maxStage - stages[op] + i);
377         // If the value is a loop carried dependency update the loop argument
378         // mapping and keep track of the last version to replace the original
379         // forOp uses.
380         for (OpOperand &operand :
381              forOp.getBody()->getTerminator()->getOpOperands()) {
382           if (operand.get() != op->getResult(destId))
383             continue;
384           unsigned version = maxStage - stages[op] + i + 1;
385           // If the version is greater than maxStage it means it maps to the
386           // original forOp returned value.
387           if (version > maxStage) {
388             returnValues[operand.getOperandNumber()] = newOp->getResult(destId);
389             continue;
390           }
391           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
392                           newOp->getResult(destId), version);
393         }
394       }
395     }
396   }
397   return returnValues;
398 }
399 
400 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
401   auto it = valueMapping.find(key);
402   // If the value is not in the map yet add a vector big enough to store all
403   // versions.
404   if (it == valueMapping.end())
405     it =
406         valueMapping
407             .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
408             .first;
409   it->second[idx] = el;
410 }
411 
412 /// Generate a pipelined version of the scf.for loop based on the schedule given
413 /// as option. This applies the mechanical transformation of changing the loop
414 /// and generating the prologue/epilogue for the pipelining and doesn't make any
415 /// decision regarding the schedule.
416 /// Based on the option the loop is split into several stages.
417 /// The transformation assumes that the scheduling given by user is valid.
418 /// For example if we break a loop into 3 stages named S0, S1, S2 we would
419 /// generate the following code with the number in parenthesis the iteration
420 /// index:
421 /// S0(0)                        // Prologue
422 /// S0(1) S1(0)                  // Prologue
423 /// scf.for %I = %C0 to %N - 2 {
424 ///  S0(I+2) S1(I+1) S2(I)       // Pipelined kernel
425 /// }
426 /// S1(N) S2(N-1)                // Epilogue
427 /// S2(N)                        // Epilogue
428 struct ForLoopPipelining : public OpRewritePattern<ForOp> {
429   ForLoopPipelining(const PipeliningOption &options, MLIRContext *context)
430       : OpRewritePattern<ForOp>(context), options(options) {}
431   LogicalResult matchAndRewrite(ForOp forOp,
432                                 PatternRewriter &rewriter) const override {
433 
434     LoopPipelinerInternal pipeliner;
435     if (!pipeliner.initializeLoopInfo(forOp, options))
436       return failure();
437 
438     // 1. Emit prologue.
439     pipeliner.emitPrologue(rewriter);
440 
441     // 2. Track values used across stages. When a value cross stages it will
442     // need to be passed as loop iteration arguments.
443     // We first collect the values that are used in a different stage than where
444     // they are defined.
445     llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
446         crossStageValues = pipeliner.analyzeCrossStageValues();
447 
448     // Mapping between original loop values used cross stage and the block
449     // arguments associated after pipelining. A Value may map to several
450     // arguments if its liverange spans across more than 2 stages.
451     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
452     // 3. Create the new kernel loop and return the block arguments mapping.
453     ForOp newForOp =
454         pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
455     // Create the kernel block, order ops based on user choice and remap
456     // operands.
457     pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter);
458 
459     // 4. Emit the epilogue after the new forOp.
460     rewriter.setInsertionPointAfter(newForOp);
461     llvm::SmallVector<Value> returnValues = pipeliner.emitEpilogue(rewriter);
462 
463     // 5. Erase the original loop and replace the uses with the epilogue output.
464     if (forOp->getNumResults() > 0)
465       rewriter.replaceOp(forOp, returnValues);
466     else
467       rewriter.eraseOp(forOp);
468 
469     return success();
470   }
471 
472 protected:
473   PipeliningOption options;
474 };
475 
476 } // namespace
477 
478 void mlir::scf::populateSCFLoopPipeliningPatterns(
479     RewritePatternSet &patterns, const PipeliningOption &options) {
480   patterns.add<ForLoopPipelining>(options, patterns.getContext());
481 }
482