1 //===- LoopPipelining.cpp - Code to perform loop software pipelining-------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop software pipelining
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "PassDetail.h"
14 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
15 #include "mlir/Dialect/SCF/IR/SCF.h"
16 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
17 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
18 #include "mlir/Dialect/SCF/Utils/Utils.h"
19 #include "mlir/IR/BlockAndValueMapping.h"
20 #include "mlir/IR/PatternMatch.h"
21 #include "mlir/Support/MathExtras.h"
22
23 using namespace mlir;
24 using namespace mlir::scf;
25
26 namespace {
27
28 /// Helper to keep internal information during pipelining transformation.
29 struct LoopPipelinerInternal {
30 /// Coarse liverange information for ops used across stages.
31 struct LiverangeInfo {
32 unsigned lastUseStage = 0;
33 unsigned defStage = 0;
34 };
35
36 protected:
37 ForOp forOp;
38 unsigned maxStage = 0;
39 DenseMap<Operation *, unsigned> stages;
40 std::vector<Operation *> opOrder;
41 int64_t ub;
42 int64_t lb;
43 int64_t step;
44 PipeliningOption::AnnotationlFnType annotateFn = nullptr;
45 bool peelEpilogue;
46 PipeliningOption::PredicateOpFn predicateFn = nullptr;
47
48 // When peeling the kernel we generate several version of each value for
49 // different stage of the prologue. This map tracks the mapping between
50 // original Values in the loop and the different versions
51 // peeled from the loop.
52 DenseMap<Value, llvm::SmallVector<Value>> valueMapping;
53
54 /// Assign a value to `valueMapping`, this means `val` represents the version
55 /// `idx` of `key` in the epilogue.
56 void setValueMapping(Value key, Value el, int64_t idx);
57
58 public:
59 /// Initalize the information for the given `op`, return true if it
60 /// satisfies the pre-condition to apply pipelining.
61 bool initializeLoopInfo(ForOp op, const PipeliningOption &options);
62 /// Emits the prologue, this creates `maxStage - 1` part which will contain
63 /// operations from stages [0; i], where i is the part index.
64 void emitPrologue(PatternRewriter &rewriter);
65 /// Gather liverange information for Values that are used in a different stage
66 /// than its definition.
67 llvm::MapVector<Value, LiverangeInfo> analyzeCrossStageValues();
68 scf::ForOp createKernelLoop(
69 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
70 PatternRewriter &rewriter,
71 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap);
72 /// Emits the pipelined kernel. This clones loop operations following user
73 /// order and remaps operands defined in a different stage as their use.
74 void createKernel(
75 scf::ForOp newForOp,
76 const llvm::MapVector<Value, LiverangeInfo> &crossStageValues,
77 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
78 PatternRewriter &rewriter);
79 /// Emits the epilogue, this creates `maxStage - 1` part which will contain
80 /// operations from stages [i; maxStage], where i is the part index.
81 llvm::SmallVector<Value> emitEpilogue(PatternRewriter &rewriter);
82 };
83
initializeLoopInfo(ForOp op,const PipeliningOption & options)84 bool LoopPipelinerInternal::initializeLoopInfo(
85 ForOp op, const PipeliningOption &options) {
86 forOp = op;
87 auto upperBoundCst =
88 forOp.getUpperBound().getDefiningOp<arith::ConstantIndexOp>();
89 auto lowerBoundCst =
90 forOp.getLowerBound().getDefiningOp<arith::ConstantIndexOp>();
91 auto stepCst = forOp.getStep().getDefiningOp<arith::ConstantIndexOp>();
92 if (!upperBoundCst || !lowerBoundCst || !stepCst)
93 return false;
94 ub = upperBoundCst.value();
95 lb = lowerBoundCst.value();
96 step = stepCst.value();
97 peelEpilogue = options.peelEpilogue;
98 predicateFn = options.predicateFn;
99 if (!peelEpilogue && predicateFn == nullptr)
100 return false;
101 int64_t numIteration = ceilDiv(ub - lb, step);
102 std::vector<std::pair<Operation *, unsigned>> schedule;
103 options.getScheduleFn(forOp, schedule);
104 if (schedule.empty())
105 return false;
106
107 opOrder.reserve(schedule.size());
108 for (auto &opSchedule : schedule) {
109 maxStage = std::max(maxStage, opSchedule.second);
110 stages[opSchedule.first] = opSchedule.second;
111 opOrder.push_back(opSchedule.first);
112 }
113 if (numIteration <= maxStage)
114 return false;
115
116 // All operations need to have a stage.
117 if (forOp
118 .walk([this](Operation *op) {
119 if (op != forOp.getOperation() && !isa<scf::YieldOp>(op) &&
120 stages.find(op) == stages.end())
121 return WalkResult::interrupt();
122 return WalkResult::advance();
123 })
124 .wasInterrupted())
125 return false;
126
127 // Only support loop carried dependency with a distance of 1. This means the
128 // source of all the scf.yield operands needs to be defined by operations in
129 // the loop.
130 if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(),
131 [this](Value operand) {
132 Operation *def = operand.getDefiningOp();
133 return !def || stages.find(def) == stages.end();
134 }))
135 return false;
136 annotateFn = options.annotateFn;
137 return true;
138 }
139
emitPrologue(PatternRewriter & rewriter)140 void LoopPipelinerInternal::emitPrologue(PatternRewriter &rewriter) {
141 // Initialize the iteration argument to the loop initiale values.
142 for (BlockArgument &arg : forOp.getRegionIterArgs()) {
143 OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
144 setValueMapping(arg, operand.get(), 0);
145 }
146 auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
147 for (int64_t i = 0; i < maxStage; i++) {
148 // special handling for induction variable as the increment is implicit.
149 Value iv =
150 rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(), lb + i * step);
151 setValueMapping(forOp.getInductionVar(), iv, i);
152 for (Operation *op : opOrder) {
153 if (stages[op] > i)
154 continue;
155 Operation *newOp = rewriter.clone(*op);
156 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
157 auto it = valueMapping.find(op->getOperand(opIdx));
158 if (it != valueMapping.end())
159 newOp->setOperand(opIdx, it->second[i - stages[op]]);
160 }
161 if (annotateFn)
162 annotateFn(newOp, PipeliningOption::PipelinerPart::Prologue, i);
163 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
164 setValueMapping(op->getResult(destId), newOp->getResult(destId),
165 i - stages[op]);
166 // If the value is a loop carried dependency update the loop argument
167 // mapping.
168 for (OpOperand &operand : yield->getOpOperands()) {
169 if (operand.get() != op->getResult(destId))
170 continue;
171 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
172 newOp->getResult(destId), i - stages[op] + 1);
173 }
174 }
175 }
176 }
177 }
178
179 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
analyzeCrossStageValues()180 LoopPipelinerInternal::analyzeCrossStageValues() {
181 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo> crossStageValues;
182 for (Operation *op : opOrder) {
183 unsigned stage = stages[op];
184 for (OpOperand &operand : op->getOpOperands()) {
185 Operation *def = operand.get().getDefiningOp();
186 if (!def)
187 continue;
188 auto defStage = stages.find(def);
189 if (defStage == stages.end() || defStage->second == stage)
190 continue;
191 assert(stage > defStage->second);
192 LiverangeInfo &info = crossStageValues[operand.get()];
193 info.defStage = defStage->second;
194 info.lastUseStage = std::max(info.lastUseStage, stage);
195 }
196 }
197 return crossStageValues;
198 }
199
createKernelLoop(const llvm::MapVector<Value,LoopPipelinerInternal::LiverangeInfo> & crossStageValues,PatternRewriter & rewriter,llvm::DenseMap<std::pair<Value,unsigned>,unsigned> & loopArgMap)200 scf::ForOp LoopPipelinerInternal::createKernelLoop(
201 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
202 &crossStageValues,
203 PatternRewriter &rewriter,
204 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap) {
205 // Creates the list of initial values associated to values used across
206 // stages. The initial values come from the prologue created above.
207 // Keep track of the kernel argument associated to each version of the
208 // values passed to the kernel.
209 llvm::SmallVector<Value> newLoopArg;
210 // For existing loop argument initialize them with the right version from the
211 // prologue.
212 for (const auto &retVal :
213 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
214 Operation *def = retVal.value().getDefiningOp();
215 assert(def && "Only support loop carried dependencies of distance 1");
216 unsigned defStage = stages[def];
217 Value valueVersion = valueMapping[forOp.getRegionIterArgs()[retVal.index()]]
218 [maxStage - defStage];
219 assert(valueVersion);
220 newLoopArg.push_back(valueVersion);
221 }
222 for (auto escape : crossStageValues) {
223 LiverangeInfo &info = escape.second;
224 Value value = escape.first;
225 for (unsigned stageIdx = 0; stageIdx < info.lastUseStage - info.defStage;
226 stageIdx++) {
227 Value valueVersion =
228 valueMapping[value][maxStage - info.lastUseStage + stageIdx];
229 assert(valueVersion);
230 newLoopArg.push_back(valueVersion);
231 loopArgMap[std::make_pair(value, info.lastUseStage - info.defStage -
232 stageIdx)] = newLoopArg.size() - 1;
233 }
234 }
235
236 // Create the new kernel loop. When we peel the epilgue we need to peel
237 // `numStages - 1` iterations. Then we adjust the upper bound to remove those
238 // iterations.
239 Value newUb = forOp.getUpperBound();
240 if (peelEpilogue)
241 newUb = rewriter.create<arith::ConstantIndexOp>(forOp.getLoc(),
242 ub - maxStage * step);
243 auto newForOp =
244 rewriter.create<scf::ForOp>(forOp.getLoc(), forOp.getLowerBound(), newUb,
245 forOp.getStep(), newLoopArg);
246 return newForOp;
247 }
248
createKernel(scf::ForOp newForOp,const llvm::MapVector<Value,LoopPipelinerInternal::LiverangeInfo> & crossStageValues,const llvm::DenseMap<std::pair<Value,unsigned>,unsigned> & loopArgMap,PatternRewriter & rewriter)249 void LoopPipelinerInternal::createKernel(
250 scf::ForOp newForOp,
251 const llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
252 &crossStageValues,
253 const llvm::DenseMap<std::pair<Value, unsigned>, unsigned> &loopArgMap,
254 PatternRewriter &rewriter) {
255 valueMapping.clear();
256
257 // Create the kernel, we clone instruction based on the order given by
258 // user and remap operands coming from a previous stages.
259 rewriter.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin());
260 BlockAndValueMapping mapping;
261 mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());
262 for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) {
263 mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
264 }
265 SmallVector<Value> predicates(maxStage + 1, nullptr);
266 if (!peelEpilogue) {
267 // Create a predicate for each stage except the last stage.
268 for (unsigned i = 0; i < maxStage; i++) {
269 Value c = rewriter.create<arith::ConstantIndexOp>(
270 newForOp.getLoc(), ub - (maxStage - i) * step);
271 Value pred = rewriter.create<arith::CmpIOp>(
272 newForOp.getLoc(), arith::CmpIPredicate::slt,
273 newForOp.getInductionVar(), c);
274 predicates[i] = pred;
275 }
276 }
277 for (Operation *op : opOrder) {
278 int64_t useStage = stages[op];
279 auto *newOp = rewriter.clone(*op, mapping);
280 for (OpOperand &operand : op->getOpOperands()) {
281 // Special case for the induction variable uses. We replace it with a
282 // version incremented based on the stage where it is used.
283 if (operand.get() == forOp.getInductionVar()) {
284 rewriter.setInsertionPoint(newOp);
285 Value offset = rewriter.create<arith::ConstantIndexOp>(
286 forOp.getLoc(), (maxStage - stages[op]) * step);
287 Value iv = rewriter.create<arith::AddIOp>(
288 forOp.getLoc(), newForOp.getInductionVar(), offset);
289 newOp->setOperand(operand.getOperandNumber(), iv);
290 rewriter.setInsertionPointAfter(newOp);
291 continue;
292 }
293 auto arg = operand.get().dyn_cast<BlockArgument>();
294 if (arg && arg.getOwner() == forOp.getBody()) {
295 // If the value is a loop carried value coming from stage N + 1 remap,
296 // it will become a direct use.
297 Value ret = forOp.getBody()->getTerminator()->getOperand(
298 arg.getArgNumber() - 1);
299 Operation *dep = ret.getDefiningOp();
300 if (!dep)
301 continue;
302 auto stageDep = stages.find(dep);
303 if (stageDep == stages.end() || stageDep->second == useStage)
304 continue;
305 assert(stageDep->second == useStage + 1);
306 newOp->setOperand(operand.getOperandNumber(),
307 mapping.lookupOrDefault(ret));
308 continue;
309 }
310 // For operands defined in a previous stage we need to remap it to use
311 // the correct region argument. We look for the right version of the
312 // Value based on the stage where it is used.
313 Operation *def = operand.get().getDefiningOp();
314 if (!def)
315 continue;
316 auto stageDef = stages.find(def);
317 if (stageDef == stages.end() || stageDef->second == useStage)
318 continue;
319 auto remap = loopArgMap.find(
320 std::make_pair(operand.get(), useStage - stageDef->second));
321 assert(remap != loopArgMap.end());
322 newOp->setOperand(operand.getOperandNumber(),
323 newForOp.getRegionIterArgs()[remap->second]);
324 }
325 if (predicates[useStage]) {
326 newOp = predicateFn(newOp, predicates[useStage], rewriter);
327 // Remap the results to the new predicated one.
328 for (auto values : llvm::zip(op->getResults(), newOp->getResults()))
329 mapping.map(std::get<0>(values), std::get<1>(values));
330 }
331 rewriter.setInsertionPointAfter(newOp);
332 if (annotateFn)
333 annotateFn(newOp, PipeliningOption::PipelinerPart::Kernel, 0);
334 }
335
336 // Collect the Values that need to be returned by the forOp. For each
337 // value we need to have `LastUseStage - DefStage` number of versions
338 // returned.
339 // We create a mapping between original values and the associated loop
340 // returned values that will be needed by the epilogue.
341 llvm::SmallVector<Value> yieldOperands;
342 for (Value retVal : forOp.getBody()->getTerminator()->getOperands()) {
343 yieldOperands.push_back(mapping.lookupOrDefault(retVal));
344 }
345 for (auto &it : crossStageValues) {
346 int64_t version = maxStage - it.second.lastUseStage + 1;
347 unsigned numVersionReturned = it.second.lastUseStage - it.second.defStage;
348 // add the original verstion to yield ops.
349 // If there is a liverange spanning across more than 2 stages we need to add
350 // extra arg.
351 for (unsigned i = 1; i < numVersionReturned; i++) {
352 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
353 version++);
354 yieldOperands.push_back(
355 newForOp.getBody()->getArguments()[yieldOperands.size() + 1 +
356 newForOp.getNumInductionVars()]);
357 }
358 setValueMapping(it.first, newForOp->getResult(yieldOperands.size()),
359 version++);
360 yieldOperands.push_back(mapping.lookupOrDefault(it.first));
361 }
362 // Map the yield operand to the forOp returned value.
363 for (const auto &retVal :
364 llvm::enumerate(forOp.getBody()->getTerminator()->getOperands())) {
365 Operation *def = retVal.value().getDefiningOp();
366 assert(def && "Only support loop carried dependencies of distance 1");
367 unsigned defStage = stages[def];
368 setValueMapping(forOp.getRegionIterArgs()[retVal.index()],
369 newForOp->getResult(retVal.index()),
370 maxStage - defStage + 1);
371 }
372 rewriter.create<scf::YieldOp>(forOp.getLoc(), yieldOperands);
373 }
374
375 llvm::SmallVector<Value>
emitEpilogue(PatternRewriter & rewriter)376 LoopPipelinerInternal::emitEpilogue(PatternRewriter &rewriter) {
377 llvm::SmallVector<Value> returnValues(forOp->getNumResults());
378 // Emit different versions of the induction variable. They will be
379 // removed by dead code if not used.
380 for (int64_t i = 0; i < maxStage; i++) {
381 Value newlastIter = rewriter.create<arith::ConstantIndexOp>(
382 forOp.getLoc(), lb + step * ((((ub - 1) - lb) / step) - i));
383 setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
384 }
385 // Emit `maxStage - 1` epilogue part that includes operations fro stages
386 // [i; maxStage].
387 for (int64_t i = 1; i <= maxStage; i++) {
388 for (Operation *op : opOrder) {
389 if (stages[op] < i)
390 continue;
391 Operation *newOp = rewriter.clone(*op);
392 for (unsigned opIdx = 0; opIdx < op->getNumOperands(); opIdx++) {
393 auto it = valueMapping.find(op->getOperand(opIdx));
394 if (it != valueMapping.end()) {
395 Value v = it->second[maxStage - stages[op] + i];
396 assert(v);
397 newOp->setOperand(opIdx, v);
398 }
399 }
400 if (annotateFn)
401 annotateFn(newOp, PipeliningOption::PipelinerPart::Epilogue, i - 1);
402 for (unsigned destId : llvm::seq(unsigned(0), op->getNumResults())) {
403 setValueMapping(op->getResult(destId), newOp->getResult(destId),
404 maxStage - stages[op] + i);
405 // If the value is a loop carried dependency update the loop argument
406 // mapping and keep track of the last version to replace the original
407 // forOp uses.
408 for (OpOperand &operand :
409 forOp.getBody()->getTerminator()->getOpOperands()) {
410 if (operand.get() != op->getResult(destId))
411 continue;
412 unsigned version = maxStage - stages[op] + i + 1;
413 // If the version is greater than maxStage it means it maps to the
414 // original forOp returned value.
415 if (version > maxStage) {
416 returnValues[operand.getOperandNumber()] = newOp->getResult(destId);
417 continue;
418 }
419 setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
420 newOp->getResult(destId), version);
421 }
422 }
423 }
424 }
425 return returnValues;
426 }
427
setValueMapping(Value key,Value el,int64_t idx)428 void LoopPipelinerInternal::setValueMapping(Value key, Value el, int64_t idx) {
429 auto it = valueMapping.find(key);
430 // If the value is not in the map yet add a vector big enough to store all
431 // versions.
432 if (it == valueMapping.end())
433 it =
434 valueMapping
435 .insert(std::make_pair(key, llvm::SmallVector<Value>(maxStage + 1)))
436 .first;
437 it->second[idx] = el;
438 }
439
440 } // namespace
441
returningMatchAndRewrite(ForOp forOp,PatternRewriter & rewriter) const442 FailureOr<ForOp> ForLoopPipeliningPattern::returningMatchAndRewrite(
443 ForOp forOp, PatternRewriter &rewriter) const {
444
445 LoopPipelinerInternal pipeliner;
446 if (!pipeliner.initializeLoopInfo(forOp, options))
447 return failure();
448
449 // 1. Emit prologue.
450 pipeliner.emitPrologue(rewriter);
451
452 // 2. Track values used across stages. When a value cross stages it will
453 // need to be passed as loop iteration arguments.
454 // We first collect the values that are used in a different stage than where
455 // they are defined.
456 llvm::MapVector<Value, LoopPipelinerInternal::LiverangeInfo>
457 crossStageValues = pipeliner.analyzeCrossStageValues();
458
459 // Mapping between original loop values used cross stage and the block
460 // arguments associated after pipelining. A Value may map to several
461 // arguments if its liverange spans across more than 2 stages.
462 llvm::DenseMap<std::pair<Value, unsigned>, unsigned> loopArgMap;
463 // 3. Create the new kernel loop and return the block arguments mapping.
464 ForOp newForOp =
465 pipeliner.createKernelLoop(crossStageValues, rewriter, loopArgMap);
466 // Create the kernel block, order ops based on user choice and remap
467 // operands.
468 pipeliner.createKernel(newForOp, crossStageValues, loopArgMap, rewriter);
469
470 llvm::SmallVector<Value> returnValues =
471 newForOp.getResults().take_front(forOp->getNumResults());
472 if (options.peelEpilogue) {
473 // 4. Emit the epilogue after the new forOp.
474 rewriter.setInsertionPointAfter(newForOp);
475 returnValues = pipeliner.emitEpilogue(rewriter);
476 }
477 // 5. Erase the original loop and replace the uses with the epilogue output.
478 if (forOp->getNumResults() > 0)
479 rewriter.replaceOp(forOp, returnValues);
480 else
481 rewriter.eraseOp(forOp);
482
483 return newForOp;
484 }
485
populateSCFLoopPipeliningPatterns(RewritePatternSet & patterns,const PipeliningOption & options)486 void mlir::scf::populateSCFLoopPipeliningPatterns(
487 RewritePatternSet &patterns, const PipeliningOption &options) {
488 patterns.add<ForLoopPipeliningPattern>(options, patterns.getContext());
489 }
490