1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "PassDetail.h"
14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
15 #include "mlir/Dialect/SCF/SCF.h"
16 #include "mlir/Dialect/SCF/Transforms.h"
17 #include "mlir/Dialect/SCF/Utils/Utils.h"
18 #include "mlir/IR/BlockAndValueMapping.h"
19 #include "mlir/IR/PatternMatch.h"
20 #include "mlir/Support/MathExtras.h"
21 
22 using namespace mlir;
23 using namespace mlir::scf;
24 
25 namespace {
26 
27 /// Helper to keep internal information during pipelining transformation.
28 struct LoopPipelinerInternal {
29   /// Coarse liverange information for ops used across stages.
30   struct LiverangeInfo {
31     unsigned lastUseStage = 0;
32     unsigned defStage = 0;
33   };
34 
35 protected:
36   ForOp forOp;
37   unsigned maxStage = 0;
38   DenseMap<Operation *, unsigned> stages;
39   std::vector<Operation *> opOrder;
40   int64_t ub;
41   int64_t lb;
42   int64_t step;
43   PipeliningOption::AnnotationlFnType annotateFn = nullptr;
44 
45   // When peeling the kernel we generate several version of each value for
46   // different stage of the prologue. This map tracks the mapping between
47   // original Values in the loop and the different versions
48   // peeled from the loop.
49   DenseMap<Value, llvm::SmallVector<Value>> valueMapping;
50 
51   /// Assign a value to `valueMapping`, this means `val` represents the version
52   /// `idx` of `key` in the epilogue.
53   void setValueMapping(Value key, Value el, int64_t idx);
54 
55 public:
56   /// Initalize the information for the given `op`, return true if it
57   /// satisfies the pre-condition to apply pipelining.
58   bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
59   /// Emits the prologue, this creates `maxStage - 1` part which will contain
60   /// operations from stages [0; i], where i is the part index.
61   void emitPrologue(PatternRewriter &rewriter);
62   /// Gather liverange information for Values that are used in a different stage
63   /// than its definition.
64   llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
65   scf::ForOp createKernelLoop(
66       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
67       PatternRewriter &rewriter,
68       llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
69   /// Emits the pipelined kernel. This clones loop operations following user
70   /// order and remaps operands defined in a different stage as their use.
71   void createKernel(
72       scf::ForOp newForOp,
73       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
74       const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
75       PatternRewriter &rewriter);
76   /// Emits the epilogue, this creates `maxStage - 1` part which will contain
77   /// operations from stages [i; maxStage], where i is the part index.
78   llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter);
79 };
80 
81 bool LoopPipelinerInternal::initializeLoopInfo(
82     ForOp op, const PipeliningOption &options) {
83   forOp = op;
84   auto upperBoundCst =
85       forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
86   auto lowerBoundCst =
87       forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
88   auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
89   if (!upperBoundCst || !lowerBoundCst || !stepCst)
90     return false;
91   ub = upperBoundCst.value();
92   lb = lowerBoundCst.value();
93   step = stepCst.value();
94   int64_t numIteration = ceilDiv(ub - lb, step);
95   std::vector<std::pair<Operation *, unsigned>> schedule;
96   options.getScheduleFn(forOp, schedule);
97   if (schedule.empty())
98     return false;
99 
100   opOrder.reserve(schedule.size());
101   for (auto &opSchedule : schedule) {
102     maxStage = std::max(maxStage, opSchedule.second);
103     stages[opSchedule.first] = opSchedule.second;
104     opOrder.push_back(opSchedule.first);
105   }
106   if (numIteration <= maxStage)
107     return false;
108 
109   // All operations need to have a stage.
110   if (forOp
111           .walk([this](Operation *op) {
112             if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) &&
113                 stages.find(op) == stages.end())
114               return WalkResult::interrupt();
115             return WalkResult::advance();
116           })
117           .wasInterrupted())
118     return false;
119 
120   // Only support loop carried dependency with a distance of 1. This means the
121   // source of all the scf.yield operands needs to be defined by operations in
122   // the loop.
123   if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
124                    [this](Value operand) {
125                      Operation *def = operand.getDefiningOp();
126                      return !def || stages.find(def) == stages.end();
127                    }))
128     return false;
129   annotateFn = options.annotateFn;
130   return true;
131 }
132 
133 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) {
134   // Initialize the iteration argument to the loop initiale values.
135   for (BlockArgument &arg : forOp.getRegionIterArgs()) {
136     OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
137     setValueMapping(arg, operand.get(), 0);
138   }
139   auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
140   for (int64_t i = 0; i < maxStage; i++) {
141     // special handling for induction variable as the increment is implicit.
142     Value iv =
143         rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step);
144     setValueMapping(forOp.getInductionVar(), iv, i);
145     for (Operation *op : opOrder) {
146       if (stages[op] > i)
147         continue;
148       Operation *newOp = rewriter.clone(*op);
149       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
150         auto it = valueMapping.find(op->getOperand(opIdx));
151         if (it != valueMapping.end())
152           newOp->setOperand(opIdx, it->second[i - stages[op]]);
153       }
154       if (annotateFn)
155         annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i);
156       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
157         setValueMapping(op->getResult(destId), newOp->getResult(destId),
158                         i - stages[op]);
159         // If the value is a loop carried dependency update the loop argument
160         // mapping.
161         for (OpOperand &operand : yield->getOpOperands()) {
162           if (operand.get() != op->getResult(destId))
163             continue;
164           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
165                           newOp->getResult(destId), i - stages[op] + 1);
166         }
167       }
168     }
169   }
170 }
171 
172 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
173 LoopPipelinerInternal::analyzeCrossStageValues() {
174   llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
175   for (Operation *op : opOrder) {
176     unsigned stage = stages[op];
177     for (OpOperand &operand : op->getOpOperands()) {
178       Operation *def = operand.get().getDefiningOp();
179       if (!def)
180         continue;
181       auto defStage = stages.find(def);
182       if (defStage == stages.end() || defStage->second == stage)
183         continue;
184       assert(stage > defStage->second);
185       LiverangeInfo &info = crossStageValues[operand.get()];
186       info.defStage = defStage->second;
187       info.lastUseStage = std::max(info.lastUseStage, stage);
188     }
189   }
190   return crossStageValues;
191 }
192 
193 scf::ForOp LoopPipelinerInternal::createKernelLoop(
194     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
195         &crossStageValues,
196     PatternRewriter &rewriter,
197     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
198   // Creates the list of initial values associated to values used across
199   // stages. The initial values come from the prologue created above.
200   // Keep track of the kernel argument associated to each version of the
201   // values passed to the kernel.
202   llvm::SmallVector<Value> newLoopArg;
203   // For existing loop argument initialize them with the right version from the
204   // prologue.
205   for (const auto &retVal :
206        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
207     Operation *def = retVal.value().getDefiningOp();
208     assert(def && "Only support loop carried dependencies of distance 1");
209     unsigned defStage = stages[def];
210     Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
211                                      [maxStage - defStage];
212     assert(valueVersion);
213     newLoopArg.push_back(valueVersion);
214   }
215   for (auto escape : crossStageValues) {
216     LiverangeInfo &info = escape.second;
217     Value value = escape.first;
218     for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
219          stageIdx++) {
220       Value valueVersion =
221           valueMapping[value][maxStage - info.lastUseStage + stageIdx];
222       assert(valueVersion);
223       newLoopArg.push_back(valueVersion);
224       loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
225                                            stageIdx)] = newLoopArg.size() - 1;
226     }
227   }
228 
229   // Create the new kernel loop. Since we need to peel `numStages - 1`
230   // iteration we change the upper bound to remove those iterations.
231   Value newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(),
232                                                         ub - maxStage * step);
233   auto newForOp =
234       rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
235                                   forOp.getStep(), newLoopArg);
236   return newForOp;
237 }
238 
239 void LoopPipelinerInternal::createKernel(
240     scf::ForOp newForOp,
241     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
242         &crossStageValues,
243     const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
244     PatternRewriter &rewriter) {
245   valueMapping.clear();
246 
247   // Create the kernel, we clone instruction based on the order given by
248   // user and remap operands coming from a previous stages.
249   rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
250   BlockAndValueMapping mapping;
251   mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
252   for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) {
253     mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
254   }
255   for (Operation *op : opOrder) {
256     int64_t useStage = stages[op];
257     auto *newOp = rewriter.clone(*op, mapping);
258     for (OpOperand &operand : op->getOpOperands()) {
259       // Special case for the induction variable uses. We replace it with a
260       // version incremented based on the stage where it is used.
261       if (operand.get() == forOp.getInductionVar()) {
262         rewriter.setInsertionPoint(newOp);
263         Value offset = rewriter.create<arith::ConstantIndexOp>(
264             forOp.getLoc(), (maxStage - stages[op]) * step);
265         Value iv = rewriter.create<arith::AddIOp>(
266             forOp.getLoc(), newForOp.getInductionVar(), offset);
267         newOp->setOperand(operand.getOperandNumber(), iv);
268         rewriter.setInsertionPointAfter(newOp);
269         continue;
270       }
271       auto arg = operand.get().dyn_cast<BlockArgument>();
272       if (arg && arg.getOwner() == forOp.getBody()) {
273         // If the value is a loop carried value coming from stage N + 1 remap,
274         // it will become a direct use.
275         Value ret = forOp.getBody()->getTerminator()->getOperand(
276             arg.getArgNumber() - 1);
277         Operation *dep = ret.getDefiningOp();
278         if (!dep)
279           continue;
280         auto stageDep = stages.find(dep);
281         if (stageDep == stages.end() || stageDep->second == useStage)
282           continue;
283         assert(stageDep->second == useStage + 1);
284         newOp->setOperand(operand.getOperandNumber(),
285                           mapping.lookupOrDefault(ret));
286         continue;
287       }
288       // For operands defined in a previous stage we need to remap it to use
289       // the correct region argument. We look for the right version of the
290       // Value based on the stage where it is used.
291       Operation *def = operand.get().getDefiningOp();
292       if (!def)
293         continue;
294       auto stageDef = stages.find(def);
295       if (stageDef == stages.end() || stageDef->second == useStage)
296         continue;
297       auto remap = loopArgMap.find(
298           std::make_pair(operand.get(), useStage - stageDef->second));
299       assert(remap != loopArgMap.end());
300       newOp->setOperand(operand.getOperandNumber(),
301                         newForOp.getRegionIterArgs()[remap->second]);
302     }
303     if (annotateFn)
304       annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0);
305   }
306 
307   // Collect the Values that need to be returned by the forOp. For each
308   // value we need to have `LastUseStage - DefStage` number of versions
309   // returned.
310   // We create a mapping between original values and the associated loop
311   // returned values that will be needed by the epilogue.
312   llvm::SmallVector<Value> yieldOperands;
313   for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) {
314     yieldOperands.push_back(mapping.lookupOrDefault(retVal));
315   }
316   for (auto &it : crossStageValues) {
317     int64_t version = maxStage - it.second.lastUseStage + 1;
318     unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
319     // add the original verstion to yield ops.
320     // If there is a liverange spanning across more than 2 stages we need to add
321     // extra arg.
322     for (unsigned i = 1; i < numVersionReturned; i++) {
323       setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
324                       version++);
325       yieldOperands.push_back(
326           newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
327                                              newForOp.getNumInductionVars()]);
328     }
329     setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
330                     version++);
331     yieldOperands.push_back(mapping.lookupOrDefault(it.first));
332   }
333   // Map the yield operand to the forOp returned value.
334   for (const auto &retVal :
335        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
336     Operation *def = retVal.value().getDefiningOp();
337     assert(def && "Only support loop carried dependencies of distance 1");
338     unsigned defStage = stages[def];
339     setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
340                     newForOp->getResult(retVal.index()),
341                     maxStage - defStage + 1);
342   }
343   rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
344 }
345 
346 llvm::SmallVector<Value>
347 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) {
348   llvm::SmallVector<Value> returnValues(forOp->getNumResults());
349   // Emit different versions of the induction variable. They will be
350   // removed by dead code if not used.
351   for (int64_t i = 0; i < maxStage; i++) {
352     Value newlastIter = rewriter.create<arith::ConstantIndexOp>(
353         forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i));
354     setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
355   }
356   // Emit `maxStage - 1` epilogue part that includes operations fro stages
357   // [i; maxStage].
358   for (int64_t i = 1; i <= maxStage; i++) {
359     for (Operation *op : opOrder) {
360       if (stages[op] < i)
361         continue;
362       Operation *newOp = rewriter.clone(*op);
363       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
364         auto it = valueMapping.find(op->getOperand(opIdx));
365         if (it != valueMapping.end()) {
366           Value v = it->second[maxStage - stages[op] + i];
367           assert(v);
368           newOp->setOperand(opIdx, v);
369         }
370       }
371       if (annotateFn)
372         annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1);
373       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
374         setValueMapping(op->getResult(destId), newOp->getResult(destId),
375                         maxStage - stages[op] + i);
376         // If the value is a loop carried dependency update the loop argument
377         // mapping and keep track of the last version to replace the original
378         // forOp uses.
379         for (OpOperand &operand :
380              forOp.getBody()->getTerminator()->getOpOperands()) {
381           if (operand.get() != op->getResult(destId))
382             continue;
383           unsigned version = maxStage - stages[op] + i + 1;
384           // If the version is greater than maxStage it means it maps to the
385           // original forOp returned value.
386           if (version > maxStage) {
387             returnValues[operand.getOperandNumber()] = newOp->getResult(destId);
388             continue;
389           }
390           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
391                           newOp->getResult(destId), version);
392         }
393       }
394     }
395   }
396   return returnValues;
397 }
398 
399 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
400   auto it = valueMapping.find(key);
401   // If the value is not in the map yet add a vector big enough to store all
402   // versions.
403   if (it == valueMapping.end())
404     it =
405         valueMapping
406             .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
407             .first;
408   it->second[idx] = el;
409 }
410 
411 /// Generate a pipelined version of the scf.for loop based on the schedule given
412 /// as option. This applies the mechanical transformation of changing the loop
413 /// and generating the prologue/epilogue for the pipelining and doesn't make any
414 /// decision regarding the schedule.
415 /// Based on the option the loop is split into several stages.
416 /// The transformation assumes that the scheduling given by user is valid.
417 /// For example if we break a loop into 3 stages named S0, S1, S2 we would
418 /// generate the following code with the number in parenthesis the iteration
419 /// index:
420 /// S0(0)                        // Prologue
421 /// S0(1) S1(0)                  // Prologue
422 /// scf.for %I = %C0 to %N - 2 {
423 ///  S0(I+2) S1(I+1) S2(I)       // Pipelined kernel
424 /// }
425 /// S1(N) S2(N-1)                // Epilogue
426 /// S2(N)                        // Epilogue
427 struct ForLoopPipelining : public OpRewritePattern<ForOp> {
428   ForLoopPipelining(const PipeliningOption &options, MLIRContext *context)
429       : OpRewritePattern<ForOp>(context), options(options) {}
430   LogicalResult matchAndRewrite(ForOp forOp,
431                                 PatternRewriter &rewriter) const override {
432 
433     LoopPipelinerInternal pipeliner;
434     if (!pipeliner.initializeLoopInfo(forOp, options))
435       return failure();
436 
437     // 1. Emit prologue.
438     pipeliner.emitPrologue(rewriter);
439 
440     // 2. Track values used across stages. When a value cross stages it will
441     // need to be passed as loop iteration arguments.
442     // We first collect the values that are used in a different stage than where
443     // they are defined.
444     llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
445         crossStageValues = pipeliner.analyzeCrossStageValues();
446 
447     // Mapping between original loop values used cross stage and the block
448     // arguments associated after pipelining. A Value may map to several
449     // arguments if its liverange spans across more than 2 stages.
450     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
451     // 3. Create the new kernel loop and return the block arguments mapping.
452     ForOp newForOp =
453         pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
454     // Create the kernel block, order ops based on user choice and remap
455     // operands.
456     pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter);
457 
458     // 4. Emit the epilogue after the new forOp.
459     rewriter.setInsertionPointAfter(newForOp);
460     llvm::SmallVector<Value> returnValues = pipeliner.emitEpilogue(rewriter);
461 
462     // 5. Erase the original loop and replace the uses with the epilogue output.
463     if (forOp->getNumResults() > 0)
464       rewriter.replaceOp(forOp, returnValues);
465     else
466       rewriter.eraseOp(forOp);
467 
468     return success();
469   }
470 
471 protected:
472   PipeliningOption options;
473 };
474 
475 } // namespace
476 
477 void mlir::scf::populateSCFLoopPipeliningPatterns(
478     RewritePatternSet &patterns, const PipeliningOption &options) {
479   patterns.add<ForLoopPipelining>(options, patterns.getContext());
480 }
481