1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "PassDetail.h"
14 #include "mlir/Dialect/SCF/SCF.h"
15 #include "mlir/Dialect/SCF/Transforms.h"
16 #include "mlir/Dialect/SCF/Utils.h"
17 #include "mlir/Dialect/StandardOps/IR/Ops.h"
18 #include "mlir/IR/BlockAndValueMapping.h"
19 #include "mlir/IR/PatternMatch.h"
20 #include "mlir/Support/MathExtras.h"
21 
22 using namespace mlir;
23 using namespace mlir::scf;
24 
25 namespace {
26 
27 /// Helper to keep internal information during pipelining transformation.
28 struct LoopPipelinerInternal {
29   /// Coarse liverange information for ops used across stages.
30   struct LiverangeInfo {
31     unsigned lastUseStage = 0;
32     unsigned defStage = 0;
33   };
34 
35 protected:
36   ForOp forOp;
37   unsigned maxStage = 0;
38   DenseMap<Operation *, unsigned> stages;
39   std::vector<Operation *> opOrder;
40   int64_t ub;
41   int64_t lb;
42   int64_t step;
43 
44   // When peeling the kernel we generate several version of each value for
45   // different stage of the prologue. This map tracks the mapping between
46   // original Values in the loop and the different versions
47   // peeled from the loop.
48   DenseMap<Value, llvm::SmallVector<Value>> valueMapping;
49 
50   /// Assign a value to `valueMapping`, this means `val` represents the version
51   /// `idx` of `key` in the epilogue.
52   void setValueMapping(Value key, Value el, int64_t idx);
53 
54 public:
55   /// Initalize the information for the given `op`, return true if it
56   /// satisfies the pre-condition to apply pipelining.
57   bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
58   /// Emits the prologue, this creates `maxStage - 1` part which will contain
59   /// operations from stages [0; i], where i is the part index.
60   void emitPrologue(PatternRewriter &rewriter);
61   /// Gather liverange information for Values that are used in a different stage
62   /// than its definition.
63   llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
64   scf::ForOp createKernelLoop(
65       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
66       PatternRewriter &rewriter,
67       llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
68   /// Emits the pipelined kernel. This clones loop operations following user
69   /// order and remaps operands defined in a different stage as their use.
70   void createKernel(
71       scf::ForOp newForOp,
72       const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
73       const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
74       PatternRewriter &rewriter);
75   /// Emits the epilogue, this creates `maxStage - 1` part which will contain
76   /// operations from stages [i; maxStage], where i is the part index.
77   llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter);
78 };
79 
80 bool LoopPipelinerInternal::initializeLoopInfo(
81     ForOp op, const PipeliningOption &options) {
82   forOp = op;
83   auto upperBoundCst = forOp.upperBound().getDefiningOp<ConstantIndexOp>();
84   auto lowerBoundCst = forOp.lowerBound().getDefiningOp<ConstantIndexOp>();
85   auto stepCst = forOp.step().getDefiningOp<ConstantIndexOp>();
86   if (!upperBoundCst || !lowerBoundCst || !stepCst)
87     return false;
88   ub = upperBoundCst.getValue();
89   lb = lowerBoundCst.getValue();
90   step = stepCst.getValue();
91   int64_t numIteration = ceilDiv(ub - lb, step);
92   std::vector<std::pair<Operation *, unsigned>> schedule;
93   options.getScheduleFn(forOp, schedule);
94   if (schedule.empty())
95     return false;
96 
97   opOrder.reserve(schedule.size());
98   for (auto &opSchedule : schedule) {
99     maxStage = std::max(maxStage, opSchedule.second);
100     stages[opSchedule.first] = opSchedule.second;
101     opOrder.push_back(opSchedule.first);
102   }
103   if (numIteration <= maxStage)
104     return false;
105 
106   // All operations need to have a stage.
107   if (forOp
108           .walk([this](Operation *op) {
109             if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) &&
110                 stages.find(op) == stages.end())
111               return WalkResult::interrupt();
112             return WalkResult::advance();
113           })
114           .wasInterrupted())
115     return false;
116 
117   // Only support loop carried dependency with a distance of 1. This means the
118   // source of all the scf.yield operands needs to be defined by operations in
119   // the loop.
120   if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
121                    [this](Value operand) {
122                      Operation *def = operand.getDefiningOp();
123                      return !def || stages.find(def) == stages.end();
124                    }))
125     return false;
126   return true;
127 }
128 
129 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) {
130   // Initialize the iteration argument to the loop initiale values.
131   for (BlockArgument &arg : forOp.getRegionIterArgs()) {
132     OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
133     setValueMapping(arg, operand.get(), 0);
134   }
135   auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
136   for (int64_t i = 0; i < maxStage; i++) {
137     // special handling for induction variable as the increment is implicit.
138     Value iv = rewriter.create<ConstantIndexOp>(forOp.getLoc(), lb + i);
139     setValueMapping(forOp.getInductionVar(), iv, i);
140     for (Operation *op : opOrder) {
141       if (stages[op] > i)
142         continue;
143       Operation *newOp = rewriter.clone(*op);
144       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
145         auto it = valueMapping.find(op->getOperand(opIdx));
146         if (it != valueMapping.end())
147           newOp->setOperand(opIdx, it->second[i - stages[op]]);
148       }
149       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
150         setValueMapping(op->getResult(destId), newOp->getResult(destId),
151                         i - stages[op]);
152         // If the value is a loop carried dependency update the loop argument
153         // mapping.
154         for (OpOperand &operand : yield->getOpOperands()) {
155           if (operand.get() != op->getResult(destId))
156             continue;
157           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
158                           newOp->getResult(destId), i - stages[op] + 1);
159         }
160       }
161     }
162   }
163 }
164 
165 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
166 LoopPipelinerInternal::analyzeCrossStageValues() {
167   llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
168   for (Operation *op : opOrder) {
169     unsigned stage = stages[op];
170     for (OpOperand &operand : op->getOpOperands()) {
171       Operation *def = operand.get().getDefiningOp();
172       if (!def)
173         continue;
174       auto defStage = stages.find(def);
175       if (defStage == stages.end() || defStage->second == stage)
176         continue;
177       assert(stage > defStage->second);
178       LiverangeInfo &info = crossStageValues[operand.get()];
179       info.defStage = defStage->second;
180       info.lastUseStage = std::max(info.lastUseStage, stage);
181     }
182   }
183   return crossStageValues;
184 }
185 
186 scf::ForOp LoopPipelinerInternal::createKernelLoop(
187     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
188         &crossStageValues,
189     PatternRewriter &rewriter,
190     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
191   // Creates the list of initial values associated to values used across
192   // stages. The initial values come from the prologue created above.
193   // Keep track of the kernel argument associated to each version of the
194   // values passed to the kernel.
195   llvm::SmallVector<Value> newLoopArg;
196   // For existing loop argument initialize them with the right version from the
197   // prologue.
198   for (auto retVal :
199        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
200     Operation *def = retVal.value().getDefiningOp();
201     assert(def && "Only support loop carried dependencies of distance 1");
202     unsigned defStage = stages[def];
203     Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
204                                      [maxStage - defStage];
205     assert(valueVersion);
206     newLoopArg.push_back(valueVersion);
207   }
208   for (auto escape : crossStageValues) {
209     LiverangeInfo &info = escape.second;
210     Value value = escape.first;
211     for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
212          stageIdx++) {
213       Value valueVersion =
214           valueMapping[value][maxStage - info.lastUseStage + stageIdx];
215       assert(valueVersion);
216       newLoopArg.push_back(valueVersion);
217       loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
218                                            stageIdx)] = newLoopArg.size() - 1;
219     }
220   }
221 
222   // Create the new kernel loop. Since we need to peel `numStages - 1`
223   // iteration we change the upper bound to remove those iterations.
224   Value newUb =
225       rewriter.create<ConstantIndexOp>(forOp.getLoc(), ub - maxStage * step);
226   auto newForOp = rewriter.create<scf::ForOp>(
227       forOp.getLoc(), forOp.lowerBound(), newUb, forOp.step(), newLoopArg);
228   return newForOp;
229 }
230 
231 void LoopPipelinerInternal::createKernel(
232     scf::ForOp newForOp,
233     const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
234         &crossStageValues,
235     const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
236     PatternRewriter &rewriter) {
237   valueMapping.clear();
238 
239   // Create the kernel, we clone instruction based on the order given by
240   // user and remap operands coming from a previous stages.
241   rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
242   BlockAndValueMapping mapping;
243   mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
244   for (auto arg : llvm::enumerate(forOp.getRegionIterArgs())) {
245     mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
246   }
247   for (Operation *op : opOrder) {
248     int64_t useStage = stages[op];
249     auto *newOp = rewriter.clone(*op, mapping);
250     for (OpOperand &operand : op->getOpOperands()) {
251       // Special case for the induction variable uses. We replace it with a
252       // version incremented based on the stage where it is used.
253       if (operand.get() == forOp.getInductionVar()) {
254         rewriter.setInsertionPoint(newOp);
255         Value offset = rewriter.create<ConstantIndexOp>(
256             forOp.getLoc(), (maxStage - stages[op]) * step);
257         Value iv = rewriter.create<AddIOp>(forOp.getLoc(),
258                                            newForOp.getInductionVar(), offset);
259         newOp->setOperand(operand.getOperandNumber(), iv);
260         rewriter.setInsertionPointAfter(newOp);
261         continue;
262       }
263       auto arg = operand.get().dyn_cast<BlockArgument>();
264       if (arg && arg.getOwner() == forOp.getBody()) {
265         // If the value is a loop carried value coming from stage N + 1 remap,
266         // it will become a direct use.
267         Value ret = forOp.getBody()->getTerminator()->getOperand(
268             arg.getArgNumber() - 1);
269         Operation *dep = ret.getDefiningOp();
270         if (!dep)
271           continue;
272         auto stageDep = stages.find(dep);
273         if (stageDep == stages.end() || stageDep->second == useStage)
274           continue;
275         assert(stageDep->second == useStage + 1);
276         newOp->setOperand(operand.getOperandNumber(),
277                           mapping.lookupOrDefault(ret));
278         continue;
279       }
280       // For operands defined in a previous stage we need to remap it to use
281       // the correct region argument. We look for the right version of the
282       // Value based on the stage where it is used.
283       Operation *def = operand.get().getDefiningOp();
284       if (!def)
285         continue;
286       auto stageDef = stages.find(def);
287       if (stageDef == stages.end() || stageDef->second == useStage)
288         continue;
289       auto remap = loopArgMap.find(
290           std::make_pair(operand.get(), useStage - stageDef->second));
291       assert(remap != loopArgMap.end());
292       newOp->setOperand(operand.getOperandNumber(),
293                         newForOp.getRegionIterArgs()[remap->second]);
294     }
295   }
296 
297   // Collect the Values that need to be returned by the forOp. For each
298   // value we need to have `LastUseStage - DefStage` number of versions
299   // returned.
300   // We create a mapping between original values and the associated loop
301   // returned values that will be needed by the epilogue.
302   llvm::SmallVector<Value> yieldOperands;
303   for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) {
304     yieldOperands.push_back(mapping.lookupOrDefault(retVal));
305   }
306   for (auto &it : crossStageValues) {
307     int64_t version = maxStage - it.second.lastUseStage + 1;
308     unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
309     // add the original verstion to yield ops.
310     // If there is a liverange spanning across more than 2 stages we need to add
311     // extra arg.
312     for (unsigned i = 1; i < numVersionReturned; i++) {
313       setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
314                       version++);
315       yieldOperands.push_back(
316           newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
317                                              newForOp.getNumInductionVars()]);
318     }
319     setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
320                     version++);
321     yieldOperands.push_back(mapping.lookupOrDefault(it.first));
322   }
323   // Map the yield operand to the forOp returned value.
324   for (auto retVal :
325        llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
326     Operation *def = retVal.value().getDefiningOp();
327     assert(def && "Only support loop carried dependencies of distance 1");
328     unsigned defStage = stages[def];
329     setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
330                     newForOp->getResult(retVal.index()),
331                     maxStage - defStage + 1);
332   }
333   rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
334 }
335 
336 llvm::SmallVector<Value>
337 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) {
338   llvm::SmallVector<Value> returnValues(forOp->getNumResults());
339   // Emit different versions of the induction variable. They will be
340   // removed by dead code if not used.
341   for (int64_t i = 0; i < maxStage; i++) {
342     Value newlastIter = rewriter.create<ConstantIndexOp>(
343         forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i));
344     setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
345   }
346   // Emit `maxStage - 1` epilogue part that includes operations fro stages
347   // [i; maxStage].
348   for (int64_t i = 1; i <= maxStage; i++) {
349     for (Operation *op : opOrder) {
350       if (stages[op] < i)
351         continue;
352       Operation *newOp = rewriter.clone(*op);
353       for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
354         auto it = valueMapping.find(op->getOperand(opIdx));
355         if (it != valueMapping.end()) {
356           Value v = it->second[maxStage - stages[op] + i];
357           assert(v);
358           newOp->setOperand(opIdx, v);
359         }
360       }
361       for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
362         setValueMapping(op->getResult(destId), newOp->getResult(destId),
363                         maxStage - stages[op] + i);
364         // If the value is a loop carried dependency update the loop argument
365         // mapping and keep track of the last version to replace the original
366         // forOp uses.
367         for (OpOperand &operand :
368              forOp.getBody()->getTerminator()->getOpOperands()) {
369           if (operand.get() != op->getResult(destId))
370             continue;
371           unsigned version = maxStage - stages[op] + i + 1;
372           // If the version is greater than maxStage it means it maps to the
373           // original forOp returned value.
374           if (version > maxStage) {
375             returnValues[operand.getOperandNumber()] = newOp->getResult(destId);
376             continue;
377           }
378           setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
379                           newOp->getResult(destId), version);
380         }
381       }
382     }
383   }
384   return returnValues;
385 }
386 
387 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
388   auto it = valueMapping.find(key);
389   // If the value is not in the map yet add a vector big enough to store all
390   // versions.
391   if (it == valueMapping.end())
392     it =
393         valueMapping
394             .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
395             .first;
396   it->second[idx] = el;
397 }
398 
399 /// Generate a pipelined version of the scf.for loop based on the schedule given
400 /// as option. This applies the mechanical transformation of changing the loop
401 /// and generating the prologue/epilogue for the pipelining and doesn't make any
402 /// decision regarding the schedule.
403 /// Based on the option the loop is split into several stages.
404 /// The transformation assumes that the scheduling given by user is valid.
405 /// For example if we break a loop into 3 stages named S0, S1, S2 we would
406 /// generate the following code with the number in parenthesis the iteration
407 /// index:
408 /// S0(0)                        // Prologue
409 /// S0(1) S1(0)                  // Prologue
410 /// scf.for %I = %C0 to %N - 2 {
411 ///  S0(I+2) S1(I+1) S2(I)       // Pipelined kernel
412 /// }
413 /// S1(N) S2(N-1)                // Epilogue
414 /// S2(N)                        // Epilogue
415 struct ForLoopPipelining : public OpRewritePattern<ForOp> {
416   ForLoopPipelining(const PipeliningOption &options, MLIRContext *context)
417       : OpRewritePattern<ForOp>(context), options(options) {}
418   LogicalResult matchAndRewrite(ForOp forOp,
419                                 PatternRewriter &rewriter) const override {
420 
421     LoopPipelinerInternal pipeliner;
422     if (!pipeliner.initializeLoopInfo(forOp, options))
423       return failure();
424 
425     // 1. Emit prologue.
426     pipeliner.emitPrologue(rewriter);
427 
428     // 2. Track values used across stages. When a value cross stages it will
429     // need to be passed as loop iteration arguments.
430     // We first collect the values that are used in a different stage than where
431     // they are defined.
432     llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
433         crossStageValues = pipeliner.analyzeCrossStageValues();
434 
435     // Mapping between original loop values used cross stage and the block
436     // arguments associated after pipelining. A Value may map to several
437     // arguments if its liverange spans across more than 2 stages.
438     llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
439     // 3. Create the new kernel loop and return the block arguments mapping.
440     ForOp newForOp =
441         pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
442     // Create the kernel block, order ops based on user choice and remap
443     // operands.
444     pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter);
445 
446     // 4. Emit the epilogue after the new forOp.
447     rewriter.setInsertionPointAfter(newForOp);
448     llvm::SmallVector<Value> returnValues = pipeliner.emitEpilogue(rewriter);
449 
450     // 5. Erase the original loop and replace the uses with the epilogue output.
451     if (forOp->getNumResults() > 0)
452       rewriter.replaceOp(forOp, returnValues);
453     else
454       rewriter.eraseOp(forOp);
455 
456     return success();
457   }
458 
459 protected:
460   PipeliningOption options;
461 };
462 
463 } // namespace
464 
465 void mlir::scf::populateSCFLoopPipeliningPatterns(
466     RewritePatternSet &patterns, const PipeliningOption &options) {
467   patterns.add<ForLoopPipelining>(options, patterns.getContext());
468 }
469