1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements miscellaneous loop transformation routines.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "mlir/Dialect/Affine/LoopUtils.h"
14 #include "mlir/Analysis/SliceAnalysis.h"
15 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
16 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
17 #include "mlir/Dialect/Affine/Analysis/Utils.h"
18 #include "mlir/Dialect/Affine/IR/AffineOps.h"
19 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
20 #include "mlir/Dialect/Affine/Utils.h"
21 #include "mlir/Dialect/Func/IR/FuncOps.h"
22 #include "mlir/Dialect/MemRef/IR/MemRef.h"
23 #include "mlir/Dialect/SCF/IR/SCF.h"
24 #include "mlir/IR/BlockAndValueMapping.h"
25 #include "mlir/IR/IntegerSet.h"
26 #include "mlir/Support/MathExtras.h"
27 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
28 #include "mlir/Transforms/RegionUtils.h"
29 #include "llvm/ADT/MapVector.h"
30 #include "llvm/ADT/SmallPtrSet.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33
34 #define DEBUG_TYPE "LoopUtils"
35
36 using namespace mlir;
37 using namespace presburger;
38 using llvm::SmallMapVector;
39
40 namespace {
41 // This structure is used to pass and return sets of loop parameters without
42 // confusing the order.
43 struct LoopParams {
44 Value lowerBound;
45 Value upperBound;
46 Value step;
47 };
48 } // namespace
49
50 /// Computes the cleanup loop lower bound of the loop being unrolled with
51 /// the specified unroll factor; this bound will also be the upper bound of the main
52 /// part of the unrolled loop. Computes the bound as an AffineMap with its
53 /// operands or a null map when the trip count can't be expressed as an affine
54 /// expression.
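/// As an illustrative sketch (not part of the original source): for
///   affine.for %i = %lb to %ub step 1
/// with a constant trip count of 10 and unrollFactor = 3, the bump expression
/// evaluates to (10 - 10 mod 3) * 1 = 9, so the cleanup lower bound (and the
/// upper bound of the main unrolled loop) becomes %lb + 9.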
55 static void
56 getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
57 AffineMap &cleanupLbMap,
58 SmallVectorImpl<Value> &cleanupLbOperands) {
59 AffineMap tripCountMap;
60 SmallVector<Value, 4> tripCountOperands;
61 getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
62 // Trip count can't be computed.
63 if (!tripCountMap) {
64 cleanupLbMap = AffineMap();
65 return;
66 }
67
68 OpBuilder b(forOp);
69 auto lbMap = forOp.getLowerBoundMap();
70 auto lb = b.create<AffineApplyOp>(forOp.getLoc(), lbMap,
71 forOp.getLowerBoundOperands());
72
73 // For each upper bound expr, get the range.
74 // Eg: affine.for %i = lb to min (ub1, ub2),
75 // where tripCountExprs yield (tr1, tr2), we create affine.apply's:
76 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
77 // these affine.apply's make up the cleanup loop lower bound.
78 SmallVector<AffineExpr, 4> bumpExprs(tripCountMap.getNumResults());
79 SmallVector<Value, 4> bumpValues(tripCountMap.getNumResults());
80 int64_t step = forOp.getStep();
81 for (unsigned i = 0, e = tripCountMap.getNumResults(); i < e; i++) {
82 auto tripCountExpr = tripCountMap.getResult(i);
83 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
84 auto bumpMap = AffineMap::get(tripCountMap.getNumDims(),
85 tripCountMap.getNumSymbols(), bumpExprs[i]);
86 bumpValues[i] =
87 b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, tripCountOperands);
88 }
89
90 SmallVector<AffineExpr, 4> newUbExprs(tripCountMap.getNumResults());
91 for (unsigned i = 0, e = bumpExprs.size(); i < e; i++)
92 newUbExprs[i] = b.getAffineDimExpr(0) + b.getAffineDimExpr(i + 1);
93
94 cleanupLbOperands.clear();
95 cleanupLbOperands.push_back(lb);
96 cleanupLbOperands.append(bumpValues.begin(), bumpValues.end());
97 cleanupLbMap = AffineMap::get(1 + tripCountMap.getNumResults(), 0, newUbExprs,
98 b.getContext());
99 // Simplify the cleanupLbMap + cleanupLbOperands.
100 fullyComposeAffineMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
101 cleanupLbMap = simplifyAffineMap(cleanupLbMap);
102 canonicalizeMapAndOperands(&cleanupLbMap, &cleanupLbOperands);
103 // Remove any affine.apply's that became dead from the simplification above.
104 for (auto v : bumpValues)
105 if (v.use_empty())
106 v.getDefiningOp()->erase();
107
108 if (lb.use_empty())
109 lb.erase();
110 }
111
112 /// Helper to replace uses of loop carried values (iter_args) and loop
113 /// yield values while promoting single iteration affine.for ops.
114 static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
115 // Replace uses of iter arguments with iter operands (initial values).
116 auto iterOperands = forOp.getIterOperands();
117 auto iterArgs = forOp.getRegionIterArgs();
118 for (auto e : llvm::zip(iterOperands, iterArgs))
119 std::get<1>(e).replaceAllUsesWith(std::get<0>(e));
120
121 // Replace uses of loop results with the values yielded by the loop.
122 auto outerResults = forOp.getResults();
123 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
124 for (auto e : llvm::zip(outerResults, innerResults))
125 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
126 }
127
128 /// Promotes the loop body of a forOp to its containing block if the forOp
129 /// is known to have a single iteration.
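/// For instance (illustrative sketch, not from the original source),
///   affine.for %i = 0 to 1 {
///     "foo"(%i) : (index) -> ()
///   }
/// is replaced by a single "foo"(%c0) in the parent block, where %c0 is an
/// arith.constant 0 : index hoisted to the top of the enclosing function.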
130 // TODO: extend this for arbitrary affine bounds.
131 LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) {
132 Optional<uint64_t> tripCount = getConstantTripCount(forOp);
133 if (!tripCount || *tripCount != 1)
134 return failure();
135
136 if (forOp.getLowerBoundMap().getNumResults() != 1)
137 return failure();
138
139   // Replace all uses of the IV with its single iteration value.
140 auto iv = forOp.getInductionVar();
141 auto *parentBlock = forOp->getBlock();
142 if (!iv.use_empty()) {
143 if (forOp.hasConstantLowerBound()) {
144 OpBuilder topBuilder(forOp->getParentOfType<func::FuncOp>().getBody());
145 auto constOp = topBuilder.create<arith::ConstantIndexOp>(
146 forOp.getLoc(), forOp.getConstantLowerBound());
147 iv.replaceAllUsesWith(constOp);
148 } else {
149 auto lbOperands = forOp.getLowerBoundOperands();
150 auto lbMap = forOp.getLowerBoundMap();
151 OpBuilder builder(forOp);
152 if (lbMap == builder.getDimIdentityMap()) {
153 // No need of generating an affine.apply.
154 iv.replaceAllUsesWith(lbOperands[0]);
155 } else {
156 auto affineApplyOp =
157 builder.create<AffineApplyOp>(forOp.getLoc(), lbMap, lbOperands);
158 iv.replaceAllUsesWith(affineApplyOp);
159 }
160 }
161 }
162
163 replaceIterArgsAndYieldResults(forOp);
164
165 // Move the loop body operations, except for its terminator, to the loop's
166 // containing block.
167 forOp.getBody()->back().erase();
168 parentBlock->getOperations().splice(Block::iterator(forOp),
169 forOp.getBody()->getOperations());
170 forOp.erase();
171 return success();
172 }
173
174 /// Generates an affine.for op with the specified lower and upper bounds
175 /// while generating the right IV remappings to realize shifts for operations in
176 /// its body. The operations that go into the loop body are specified in
177 /// opGroupQueue starting from the specified offset, and in that order. The
178 /// first element of the pair specifies the shift applied to that group of
179 /// operations; the shift is multiplied by the loop step before being applied.
180 /// Returns nullptr if the generated loop simplifies to a single iteration one.
181 static AffineForOp generateShiftedLoop(
182 AffineMap lbMap, AffineMap ubMap,
183 const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
184 unsigned offset, AffineForOp srcForOp, OpBuilder b) {
185 auto lbOperands = srcForOp.getLowerBoundOperands();
186 auto ubOperands = srcForOp.getUpperBoundOperands();
187
188 assert(lbMap.getNumInputs() == lbOperands.size());
189 assert(ubMap.getNumInputs() == ubOperands.size());
190
191 auto loopChunk = b.create<AffineForOp>(srcForOp.getLoc(), lbOperands, lbMap,
192 ubOperands, ubMap, srcForOp.getStep());
193 auto loopChunkIV = loopChunk.getInductionVar();
194 auto srcIV = srcForOp.getInductionVar();
195
196 BlockAndValueMapping operandMap;
197
198 auto bodyBuilder = OpBuilder::atBlockTerminator(loopChunk.getBody());
199 for (const auto &it : llvm::drop_begin(opGroupQueue, offset)) {
200 uint64_t shift = it.first;
201 auto ops = it.second;
202 // All 'same shift' operations get added with their operands being
203     // remapped to results of cloned operations, and their IV uses remapped.
204 // Generate the remapping if the shift is not zero: remappedIV = newIV -
205 // shift.
206 if (!srcIV.use_empty() && shift != 0) {
207 auto ivRemap = bodyBuilder.create<AffineApplyOp>(
208 srcForOp.getLoc(),
209 bodyBuilder.getSingleDimShiftAffineMap(
210 -static_cast<int64_t>(srcForOp.getStep() * shift)),
211 loopChunkIV);
212 operandMap.map(srcIV, ivRemap);
213 } else {
214 operandMap.map(srcIV, loopChunkIV);
215 }
216 for (auto *op : ops)
217 bodyBuilder.clone(*op, operandMap);
218 };
219 if (succeeded(promoteIfSingleIteration(loopChunk)))
220 return AffineForOp();
221 return loopChunk;
222 }
223
224 // The skewing of operations with respect to one another can be used for
225 // example to allow overlap of asynchronous operations (such as DMA
226 // communication) with computation, or just relative shifting of operations
227 // for better register reuse, locality or parallelism. As such, the shifts are
228 // typically expected to be at most of the order of the number of operations.
229 // This method should not be used as a substitute for loop distribution/fission.
230 // This method uses an algorithm that runs in time linear in the number of
231 // operations in the body of the for loop (using the 'sweep line' paradigm).
232 // This method asserts preservation of SSA dominance. A check for that, as
233 // well as for memory-based dependence preservation, rests with the users of
234 // this method.
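// Illustrative example (assumed, not from the original source): for a loop
// with a constant trip count of 8, step 1, and a two-op body with
// shifts = [0, 1], this produces a prologue executing only the first op for
// the first iteration, a steady-state loop of 7 iterations executing both ops
// (the second op's IV remapped back by one step), and an epilogue executing
// only the second op for the last iteration.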
235 LogicalResult mlir::affineForOpBodySkew(AffineForOp forOp,
236 ArrayRef<uint64_t> shifts,
237 bool unrollPrologueEpilogue) {
238 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
239 "too few/many shifts");
240 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
241 return success();
242
243 // If the trip counts aren't constant, we would need versioning and
244 // conditional guards (or context information to prevent such versioning). The
245 // better way to pipeline for such loops is to first tile them and extract
246 // constant trip count "full tiles" before applying this.
247 auto mayBeConstTripCount = getConstantTripCount(forOp);
248 if (!mayBeConstTripCount) {
249 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
250 return success();
251 }
252 uint64_t tripCount = *mayBeConstTripCount;
253
254 assert(isOpwiseShiftValid(forOp, shifts) &&
255 "shifts will lead to an invalid transformation\n");
256
257 int64_t step = forOp.getStep();
258
259 unsigned numChildOps = shifts.size();
260
261 // Do a linear time (counting) sort for the shifts.
262 uint64_t maxShift = *std::max_element(shifts.begin(), shifts.end());
263 if (maxShift >= numChildOps) {
264 // Large shifts are not the typical use case.
265 forOp.emitWarning("not shifting because shifts are unrealistically large");
266 return success();
267 }
268
269 // An array of operation groups sorted by shift amount; each group has all
270 // operations with the same shift in the order in which they appear in the
271 // body of the 'affine.for' op.
272 std::vector<std::vector<Operation *>> sortedOpGroups(maxShift + 1);
273 unsigned pos = 0;
274 for (auto &op : forOp.getBody()->without_terminator()) {
275 auto shift = shifts[pos++];
276 sortedOpGroups[shift].push_back(&op);
277 }
278
279 // Unless the shifts have a specific pattern (which actually would be the
280 // common use case), prologue and epilogue are not meaningfully defined.
281 // Nevertheless, if 'unrollPrologueEpilogue' is set, we will treat the first
282 // loop generated as the prologue and the last as epilogue and unroll these
283 // fully.
284 AffineForOp prologue, epilogue;
285
286 // Do a sweep over the sorted shifts while storing open groups in a
287 // vector, and generating loop portions as necessary during the sweep. A block
288 // of operations is paired with its shift.
289 std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> opGroupQueue;
290
291 auto origLbMap = forOp.getLowerBoundMap();
292 uint64_t lbShift = 0;
293 OpBuilder b(forOp);
294 for (uint64_t d = 0, e = sortedOpGroups.size(); d < e; ++d) {
295 // If nothing is shifted by d, continue.
296 if (sortedOpGroups[d].empty())
297 continue;
298 if (!opGroupQueue.empty()) {
299 assert(d > 0 &&
300 "Queue expected to be empty when the first block is found");
301 // The interval for which the loop needs to be generated here is:
302 // [lbShift, min(lbShift + tripCount, d)) and the body of the
303 // loop needs to have all operations in opQueue in that order.
304 AffineForOp res;
305 if (lbShift + tripCount * step < d * step) {
306 res = generateShiftedLoop(
307 b.getShiftedAffineMap(origLbMap, lbShift),
308 b.getShiftedAffineMap(origLbMap, lbShift + tripCount * step),
309 opGroupQueue, /*offset=*/0, forOp, b);
310 // Entire loop for the queued op groups generated, empty it.
311 opGroupQueue.clear();
312 lbShift += tripCount * step;
313 } else {
314 res = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
315 b.getShiftedAffineMap(origLbMap, d),
316 opGroupQueue, /*offset=*/0, forOp, b);
317 lbShift = d * step;
318 }
319
320 if (res) {
321 // Simplify/canonicalize the affine.for.
322 RewritePatternSet patterns(res.getContext());
323 AffineForOp::getCanonicalizationPatterns(patterns, res.getContext());
324 bool erased;
325 (void)applyOpPatternsAndFold(res, std::move(patterns), &erased);
326
327 if (!erased && !prologue)
328 prologue = res;
329 if (!erased)
330 epilogue = res;
331 }
332 } else {
333 // Start of first interval.
334 lbShift = d * step;
335 }
336 // Augment the list of operations that get into the current open interval.
337 opGroupQueue.emplace_back(d, sortedOpGroups[d]);
338 }
339
340 // Those operations groups left in the queue now need to be processed (FIFO)
341 // and their loops completed.
342 for (unsigned i = 0, e = opGroupQueue.size(); i < e; ++i) {
343 uint64_t ubShift = (opGroupQueue[i].first + tripCount) * step;
344 epilogue = generateShiftedLoop(b.getShiftedAffineMap(origLbMap, lbShift),
345 b.getShiftedAffineMap(origLbMap, ubShift),
346 opGroupQueue, /*offset=*/i, forOp, b);
347 lbShift = ubShift;
348 if (!prologue)
349 prologue = epilogue;
350 }
351
352 // Erase the original for op.
353 forOp.erase();
354
355 if (unrollPrologueEpilogue && prologue)
356 (void)loopUnrollFull(prologue);
357   if (unrollPrologueEpilogue && epilogue && epilogue != prologue)
358 (void)loopUnrollFull(epilogue);
359
360 return success();
361 }
362
363 /// Checks the legality of tiling of a hyper-rectangular loop nest by simply
364 /// checking if there is a 'negative' dependence in the memrefs present in
365 /// the loop nest; if so, tiling is invalid.
366 static bool
367 checkTilingLegalityImpl(MutableArrayRef<mlir::AffineForOp> origLoops) {
368 assert(!origLoops.empty() && "no original loops provided");
369
370 // We first find out all dependences we intend to check.
371 SmallVector<Operation *, 8> loadAndStoreOps;
372 origLoops[0]->walk([&](Operation *op) {
373 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
374 loadAndStoreOps.push_back(op);
375 });
376
377 unsigned numOps = loadAndStoreOps.size();
378 unsigned numLoops = origLoops.size();
379 FlatAffineValueConstraints dependenceConstraints;
380 for (unsigned d = 1; d <= numLoops + 1; ++d) {
381 for (unsigned i = 0; i < numOps; ++i) {
382 Operation *srcOp = loadAndStoreOps[i];
383 MemRefAccess srcAccess(srcOp);
384 for (unsigned j = 0; j < numOps; ++j) {
385 Operation *dstOp = loadAndStoreOps[j];
386 MemRefAccess dstAccess(dstOp);
387
388 SmallVector<DependenceComponent, 2> depComps;
389 dependenceConstraints.reset();
390 DependenceResult result = checkMemrefAccessDependence(
391 srcAccess, dstAccess, d, &dependenceConstraints, &depComps);
392
393 // Skip if there is no dependence in this case.
394 if (!hasDependence(result))
395 continue;
396
397 // Check whether there is any negative direction vector in the
398 // dependence components found above, which means that dependence is
399 // violated by the default hyper-rect tiling method.
400 LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated "
401 "for dependence at depth: "
402 << Twine(d) << " between:\n";);
403 LLVM_DEBUG(srcAccess.opInst->dump(););
404 LLVM_DEBUG(dstAccess.opInst->dump(););
405 for (unsigned k = 0, e = depComps.size(); k < e; k++) {
406 DependenceComponent depComp = depComps[k];
407 if (depComp.lb.has_value() && depComp.ub.has_value() &&
408 depComp.lb.value() < depComp.ub.value() &&
409 depComp.ub.value() < 0) {
410 LLVM_DEBUG(llvm::dbgs()
411 << "Dependence component lb = "
412 << Twine(depComp.lb.value())
413 << " ub = " << Twine(depComp.ub.value())
414 << " is negative at depth: " << Twine(d)
415 << " and thus violates the legality rule.\n");
416 return false;
417 }
418 }
419 }
420 }
421 }
422
423 return true;
424 }
425
426 /// Checks whether hyper-rectangular loop tiling of the nest
427 /// represented by `origLoops` is valid. The validity condition is from Irigoin
428 /// and Triolet, which states that two tiles cannot depend on each other. We
429 /// simplify such condition to just checking whether there is any negative
430 /// dependence direction, since we have the prior knowledge that the tiling
431 /// results will be hyper-rectangles, which are scheduled in the
432 /// lexicographically increasing order on the vector of loop indices. This
433 /// function will return failure when any dependence component is negative along
434 /// any of `origLoops`.
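/// Illustrative example (assumed, not from the original source): in
///   affine.for %i = 1 to 100 {
///     affine.for %j = 0 to 99 {
///       %v = affine.load %A[%i - 1, %j + 1] : memref<100x100xf32>
///       affine.store %v, %A[%i, %j] : memref<100x100xf32>
///     }
///   }
/// the store-to-load dependence has direction (+1, -1); the negative component
/// along %j violates the validity condition described above.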
435 LogicalResult
436 checkTilingLegality(MutableArrayRef<mlir::AffineForOp> origLoops) {
437 return success(checkTilingLegalityImpl(origLoops));
438 }
439
440 /// Checks whether a loop nest is hyper-rectangular or not.
441 LogicalResult checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) {
442 FlatAffineValueConstraints cst;
443 SmallVector<Operation *, 8> ops(input.begin(), input.end());
444 // 0-d or 1-d is trivially hyper-rectangular.
445 if (input.size() <= 1)
446 return success();
447 if (failed(getIndexSet(ops, &cst))) {
448 LLVM_DEBUG(llvm::dbgs() << "Index set computation failed!\n");
449 return failure();
450 }
451 if (!cst.isHyperRectangular(0, input.size())) {
452 LLVM_DEBUG(llvm::dbgs()
453 << "Non-hyperrectangular nests not supported for tiling!\n");
454 return failure();
455 }
456 return success();
457 }
458
459 /// Check if the input nest is supported for tiling and whether tiling would be
460 /// legal or not.
461 template <typename t>
462 LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
463 ArrayRef<t> tileSizes) {
464 assert(input.size() == tileSizes.size() && "Too few/many tile sizes");
465
466 if (llvm::any_of(input,
467 [](AffineForOp op) { return op.getNumResults() > 0; })) {
468 LLVM_DEBUG(llvm::dbgs()
469 << "Cannot tile nest where a loop has yield values\n");
470 return failure();
471 }
472
473 // Check if the supplied `for` ops are all successively nested.
474 if (!isPerfectlyNested(input)) {
475 LLVM_DEBUG(llvm::dbgs() << "input loops not perfectly nested");
476 return failure();
477 }
478
479 if (failed(checkIfHyperRectangular(input)))
480 return failure();
481
482 // Check if tiling is legal.
483 if (failed(checkTilingLegality(input))) {
484 input[0].emitRemark("tiling code is illegal due to dependences");
485 return failure();
486 }
487
488 return success();
489 }
490
491 /// Move the loop body of AffineForOp 'src' from 'src' into the specified
492 /// location in destination's body, ignoring the terminator.
493 static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
494 Block::iterator loc) {
495 auto &ops = src.getBody()->getOperations();
496 dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
497 std::prev(ops.end()));
498 }
499
500 /// Move the loop body of AffineForOp 'src' from 'src' to the start of dest
501 /// body.
502 void moveLoopBody(AffineForOp src, AffineForOp dest) {
503 moveLoopBodyImpl(src, dest, dest.getBody()->begin());
504 }
505
506 /// Constructs a tiled loop nest without setting the loop bounds, and moves
507 /// the body of the original loop nest to the tiled loop nest.
508 void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
509 AffineForOp rootAffineForOp, unsigned width,
510 MutableArrayRef<AffineForOp> tiledLoops) {
511 Location loc = rootAffineForOp.getLoc();
512
513   // The outermost among the loops as we add more.
514 Operation *topLoop = rootAffineForOp.getOperation();
515 AffineForOp innermostPointLoop;
516
517 // Add intra-tile (or point) loops.
518 for (unsigned i = 0; i < width; i++) {
519 OpBuilder b(topLoop);
520 // Loop bounds will be set later.
521 AffineForOp pointLoop = b.create<AffineForOp>(loc, 0, 0);
522 pointLoop.getBody()->getOperations().splice(
523 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
524 topLoop);
525 tiledLoops[2 * width - 1 - i] = pointLoop;
526 topLoop = pointLoop.getOperation();
527 if (i == 0)
528 innermostPointLoop = pointLoop;
529 }
530
531   // Add tile space loops.
532 for (unsigned i = width; i < 2 * width; i++) {
533 OpBuilder b(topLoop);
534 // Loop bounds will be set later.
535 AffineForOp tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
536 tileSpaceLoop.getBody()->getOperations().splice(
537 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
538 topLoop);
539 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
540 topLoop = tileSpaceLoop.getOperation();
541 }
542
543 // Move the loop body of the original nest to the new one.
544 moveLoopBody(origLoops.back(), innermostPointLoop);
545 }
546
547 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
548 // TODO: Handle non-constant lower bounds.
549 static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
550 AffineForOp newInterTileLoop,
551 AffineForOp newIntraTileLoop,
552 Value tileSize) {
553   // The lower bound for the intra-tile loop is represented by an affine map
554   // as (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb). Similarly, the upper
555   // bound for the intra-tile loop is represented by an affine map as
556   // (%i, %t0) -> ((%i - %origlb) * %t0 + %t0 * %origLoopStep + %origlb),
557   // where %i is the loop IV of the corresponding inter-tile loop, %t0 is the
558   // corresponding tiling parameter, %origlb is the lower bound and
559   // %origLoopStep is the step of the corresponding original loop.
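  // For instance (illustrative, not from the original source), with
  // %origlb = 0 and an original step of 1 the new bound expressions reduce to
  //   lb: (%i, %t0) -> (%i * %t0)
  //   ub: (%i, %t0) -> (%i * %t0 + %t0)
  // (the original upper bound expressions are still appended to the ub map to
  // form a min), i.e. each intra-tile loop covers one tile of %t0 iterations.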
560
561 assert(origLoop.hasConstantLowerBound() &&
562 "expected input loops to have constant lower bound.");
563
564 // Get lower bound of original loop as an affine expression.
565 AffineExpr origLowerBoundExpr;
566 origLowerBoundExpr =
567 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
568
569 // Add dim operands from original lower/upper bound.
570 SmallVector<Value, 4> lbOperands, ubOperands;
571 AffineBound lb = origLoop.getLowerBound();
572 AffineBound ub = origLoop.getUpperBound();
573 lbOperands.reserve(lb.getNumOperands() + 2);
574 ubOperands.reserve(ub.getNumOperands() + 2);
575 AffineMap origLbMap = lb.getMap();
576 AffineMap origUbMap = ub.getMap();
577 for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
578 lbOperands.push_back(lb.getOperand(j));
579 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
580 ubOperands.push_back(ub.getOperand(j));
581
582 // Add a new dim operand in lb/ubOperands corresponding to the origLoop
583 // IV.
584 lbOperands.push_back(newInterTileLoop.getInductionVar());
585 ubOperands.push_back(newInterTileLoop.getInductionVar());
586
587 // Get loop IV as an affine expression for lower/upper bound. Size of
588   // lb/ubOperands is guaranteed to be at least one.
589 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
590 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
591
592 // Add symbol operands from original lower/upper bound.
593 for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
594 lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
595 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
596 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
597
598 // Add a new symbol operand which is the tile size for this loop.
599 lbOperands.push_back(tileSize);
600 ubOperands.push_back(tileSize);
601
602 SmallVector<AffineExpr, 4> lbBoundExprs;
603 SmallVector<AffineExpr, 4> ubBoundExprs;
604 lbBoundExprs.reserve(origLbMap.getNumResults());
605 ubBoundExprs.reserve(origUbMap.getNumResults());
606
607 // Get tiling parameter as an affine expression for lb/ub.
608 AffineExpr lbTileParameter = b.getAffineSymbolExpr(origLbMap.getNumSymbols());
609 AffineExpr ubTileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
610
611 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
612 lbBoundExprs.push_back(
613 ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
614 origLowerBoundExpr);
615
616 // Get the origLoopStep as an affine expression.
617 AffineExpr origLoopStep = b.getAffineConstantExpr(origLoop.getStep());
618
619 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
620 // (tilingParameter * origLoopStep) + origlb.
621 ubBoundExprs.push_back(
622 ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
623 (ubTileParameter * origLoopStep) + origLowerBoundExpr);
624
625 ubBoundExprs.append(origUbMap.getResults().begin(),
626 origUbMap.getResults().end());
627
628 AffineMap lbMap =
629 AffineMap::get(origLbMap.getNumDims() + 1, origLbMap.getNumSymbols() + 1,
630 lbBoundExprs, b.getContext());
631 newIntraTileLoop.setLowerBound(lbOperands, lbMap);
632
633 AffineMap ubMap =
634 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols() + 1,
635 ubBoundExprs, b.getContext());
636 newIntraTileLoop.setUpperBound(ubOperands, ubMap);
637
638 // Original loop step must be preserved.
639 newIntraTileLoop.setStep(origLoop.getStep());
640 }
641
642 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
643 // TODO: Handle non-constant lower bounds.
644 static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
645 AffineForOp newLoop, Value tileSize) {
646 OperandRange newLbOperands = origLoop.getLowerBoundOperands();
647
648   // The lower bounds for inter-tile loops are the same as the corresponding
649   // lower bounds of the original loops.
650 newLoop.setLowerBound(newLbOperands, origLoop.getLowerBoundMap());
651
652   // The new upper bound map for inter-tile loops, assuming constant lower
653   // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
654   // originalLowerBound), tiling parameter); where the tiling parameter is the
655   // respective tile size for that loop. For example, if the original ubMap was
656   // () -> (1024), the new map will be
657   // ()[s0] -> (lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
658   // Therefore a new symbol operand is inserted in the map and the result
659   // expression is overwritten.
660
661 assert(origLoop.hasConstantLowerBound() &&
662 "expected input loops to have constant lower bound.");
663
664 // Get lower bound of original loop as an affine expression.
665 AffineExpr origLowerBoundExpr;
666 origLowerBoundExpr =
667 b.getAffineConstantExpr(origLoop.getConstantLowerBound());
668
669 // Add dim operands from original upper bound.
670 SmallVector<Value, 4> ubOperands;
671 AffineBound ub = origLoop.getUpperBound();
672 ubOperands.reserve(ub.getNumOperands() + 1);
673 AffineMap origUbMap = ub.getMap();
674 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
675 ubOperands.push_back(ub.getOperand(j));
676
677 // Add symbol operands from original upper bound.
678 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
679 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
680
681 // Add a new symbol operand which is the tile size for this loop.
682 ubOperands.push_back(tileSize);
683
684 // Get tiling parameter as an affine expression.
685 AffineExpr tileParameter = b.getAffineSymbolExpr(origUbMap.getNumSymbols());
686
687 SmallVector<AffineExpr, 4> boundExprs;
688 boundExprs.reserve(origUbMap.getNumResults());
689 int64_t origUpperBound;
690 AffineExpr origUpperBoundExpr;
691
692 // If upper bound for the original loop is constant, then the constant can
693 // be obtained as an affine expression straight away.
694 if (origLoop.hasConstantUpperBound()) {
695 origUpperBound = origLoop.getConstantUpperBound();
696
697 // Get original constant upper bound as an affine expression.
698 origUpperBoundExpr = b.getAffineConstantExpr(origUpperBound);
699
700     // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
701     // originalLowerBound), tilingParameter).
702 boundExprs.push_back(
703 origLowerBoundExpr +
704 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
705 } else {
706 // If upper bound for the original loop is not constant then two cases
707     // are possible, although their handling is the same: 1.) The result of
708 // ubmap has only one result expression. For e.g.
709 // affine.for %i = 5 to %ub
710 //
711 // A symbol operand is added which represents the tiling parameter. The
712 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
713 // where 's0' is the original upper bound and 's1' is the tiling
714 // parameter. 2.) When ubMap has more than one result expression. For e.g.
715     //   #map0 = affine_map<()[s0, s1] -> (s0, s1)>
716 // affine.for %i = 5 to min #map0()[%s0, %s1]
717 //
718 // A symbol operand is added which represents the tiling parameter. The
719 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
720     // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.
721
722 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
723 // originalLowerBound), tilingParameter).
724 for (AffineExpr origUpperBoundExpr : origUbMap.getResults())
725 boundExprs.push_back(
726 origLowerBoundExpr +
727 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
728 }
729
730 AffineMap ubMap =
731 AffineMap::get(origUbMap.getNumDims(), origUbMap.getNumSymbols() + 1,
732 boundExprs, b.getContext());
733 newLoop.setUpperBound(ubOperands, ubMap);
734
735 // Original loop step must be preserved.
736 newLoop.setStep(origLoop.getStep());
737 }
738
739 /// Constructs and sets new loop bounds after tiling for the case of
740 /// hyper-rectangular index sets, where the bounds of one dimension do not
741 /// depend on other dimensions and tiling parameters are captured from SSA
742 /// values. Bounds of each dimension can thus be treated independently,
743 /// and deriving the new bounds is much simpler and faster than for the case of
744 /// tiling arbitrary polyhedral shapes.
745 static void constructParametricallyTiledIndexSetHyperRect(
746 MutableArrayRef<AffineForOp> origLoops,
747 MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) {
748   assert(!origLoops.empty() && "expected at least one loop in band");
749 assert(origLoops.size() == tileSizes.size() &&
750 "expected tiling parameter for each loop in band.");
751
752 OpBuilder b(origLoops[0].getOperation());
753 unsigned width = origLoops.size();
754
755 // Set bounds for tile space loops.
756 for (unsigned i = 0; i < width; ++i) {
757 setInterTileBoundsParametric(b, origLoops[i], newLoops[i], tileSizes[i]);
758 }
759
760 // Set bounds for intra-tile loops.
761 for (unsigned i = 0; i < width; ++i) {
762 setIntraTileBoundsParametric(b, origLoops[i], newLoops[i],
763 newLoops[i + width], tileSizes[i]);
764 }
765 }
766
767 /// Constructs and sets new loop bounds after tiling for the case of
768 /// hyper-rectangular index sets, where the bounds of one dimension do not
769 /// depend on other dimensions. Bounds of each dimension can thus be treated
770 /// independently, and deriving the new bounds is much simpler and faster
771 /// than for the case of tiling arbitrary polyhedral shapes.
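/// Illustrative example (assumed, not from the original source): tiling
///   affine.for %i = 0 to 256 step 1
/// with a tile size of 32 yields a tile-space loop from 0 to 256 with step 32
/// and an intra-tile loop from %ii to %ii + 32 with the original step of 1,
/// where %ii is the tile-space IV (no min expression is needed since 32
/// divides the trip count of 256).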
772 static void
773 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
774 MutableArrayRef<AffineForOp> newLoops,
775 ArrayRef<unsigned> tileSizes) {
776 assert(!origLoops.empty());
777 assert(origLoops.size() == tileSizes.size());
778
779 OpBuilder b(origLoops[0].getOperation());
780 unsigned width = origLoops.size();
781
782 // Bounds for tile space loops.
783 for (unsigned i = 0; i < width; i++) {
784 OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
785 OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
786 newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
787 newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
788 // If the step size of original loop is x and tileSize is y then after
789 // tiling the tile space loops' step size becomes x*y.
790 newLoops[i].setStep(tileSizes[i] * origLoops[i].getStep());
791 }
792 // Bounds for intra-tile loops.
793 for (unsigned i = 0; i < width; i++) {
794 int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
795 Optional<uint64_t> mayBeConstantCount = getConstantTripCount(origLoops[i]);
796 // The lower bound is just the tile-space loop.
797 AffineMap lbMap = b.getDimIdentityMap();
798 newLoops[width + i].setLowerBound(
799 /*operands=*/newLoops[i].getInductionVar(), lbMap);
800     // The step size of intra-tile loops is just the original loops' step size.
801 newLoops[width + i].setStep(origLoops[i].getStep());
802
803 // Set the upper bound.
804 if (mayBeConstantCount && mayBeConstantCount.value() < tileSizes[i]) {
805 // Trip count is less than the tile size: upper bound is lower bound +
806 // trip count * stepSize.
807 AffineMap ubMap = b.getSingleDimShiftAffineMap(
808 mayBeConstantCount.value() * origLoops[i].getStep());
809 newLoops[width + i].setUpperBound(
810 /*operands=*/newLoops[i].getInductionVar(), ubMap);
811 } else if (largestDiv % tileSizes[i] != 0) {
812 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
813 // Construct the upper bound map; the operands are the original operands
814 // with 'i' (tile-space loop) appended to it. The new upper bound map is
815 // the original one with an additional expression i + tileSize * stepSize
816 // appended.
817
818 // Add dim operands from original upper bound.
819 SmallVector<Value, 4> ubOperands;
820 AffineBound ub = origLoops[i].getUpperBound();
821 ubOperands.reserve(ub.getNumOperands() + 1);
822 AffineMap origUbMap = ub.getMap();
823 for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
824 ubOperands.push_back(ub.getOperand(j));
825
826 // Add dim operand for new loop upper bound.
827 ubOperands.push_back(newLoops[i].getInductionVar());
828
829 // Add symbol operands from original upper bound.
830 for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
831 ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
832
833 SmallVector<AffineExpr, 4> boundExprs;
834 boundExprs.reserve(1 + origUbMap.getNumResults());
835 AffineExpr dim = b.getAffineDimExpr(origUbMap.getNumDims());
836 // The new upper bound map is the original one with an additional
837 // expression i + tileSize * stepSize (of original loop) appended.
838 boundExprs.push_back(dim + tileSizes[i] * origLoops[i].getStep());
839 boundExprs.append(origUbMap.getResults().begin(),
840 origUbMap.getResults().end());
841 AffineMap ubMap =
842 AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
843 boundExprs, b.getContext());
844 newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
845 } else {
846 // No need of the min expression.
847 AffineExpr dim = b.getAffineDimExpr(0);
848 AffineMap ubMap =
849 AffineMap::get(1, 0, dim + tileSizes[i] * origLoops[i].getStep());
850 newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
851 }
852 }
853 }
854
855 /// Tiles the specified band of perfectly nested loops creating tile-space loops
856 /// and intra-tile loops. A band is a contiguous set of loops.
857 // TODO: handle non hyper-rectangular spaces.
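// A caller might drive this as follows (illustrative sketch only; `rootForOp`
// and the tile sizes are assumptions, not from the original source):
//   SmallVector<AffineForOp, 6> band;
//   getPerfectlyNestedLoops(band, rootForOp);
//   SmallVector<AffineForOp, 6> tiledNest;
//   if (failed(tilePerfectlyNested(band, /*tileSizes=*/{32, 32}, &tiledNest)))
//     return failure();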
858 LogicalResult
859 mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
860 ArrayRef<unsigned> tileSizes,
861 SmallVectorImpl<AffineForOp> *tiledNest) {
862 if (input.empty())
863 return success();
864
865 if (failed(performPreTilingChecks(input, tileSizes)))
866 return failure();
867
868 MutableArrayRef<AffineForOp> origLoops = input;
869 AffineForOp rootAffineForOp = origLoops[0];
870
871 // Note that width is at least one since the band isn't empty.
872 unsigned width = input.size();
873 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
874
875 // Construct a tiled loop nest without setting their bounds. Bounds are
876 // set later.
877 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
878
879 SmallVector<Value, 8> origLoopIVs;
880 extractForInductionVars(input, &origLoopIVs);
881
882 // Set loop bounds for the tiled loop nest.
883 constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);
884
885 // Replace original IVs with intra-tile loop IVs.
886 for (unsigned i = 0; i < width; i++)
887 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
888
889 // Erase the old loop nest.
890 rootAffineForOp.erase();
891
892 if (tiledNest)
893 *tiledNest = std::move(tiledLoops);
894
895 return success();
896 }
897
898 /// Tiles the specified band of perfectly nested loops creating tile-space
899 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
900 /// is a contiguous set of loops.
901 // TODO: handle non hyper-rectangular spaces.
902 LogicalResult
903 mlir::tilePerfectlyNestedParametric(MutableArrayRef<AffineForOp> input,
904 ArrayRef<Value> tileSizes,
905 SmallVectorImpl<AffineForOp> *tiledNest) {
906 if (input.empty())
907 return success();
908
909 if (failed(performPreTilingChecks(input, tileSizes)))
910 return failure();
911
912 MutableArrayRef<AffineForOp> origLoops = input;
913 AffineForOp rootAffineForOp = origLoops[0];
914 unsigned width = input.size();
915 SmallVector<AffineForOp, 6> tiledLoops(2 * width);
916
917 // Construct a tiled loop nest without setting their bounds. Bounds are
918 // set later.
919 constructTiledLoopNest(origLoops, rootAffineForOp, width, tiledLoops);
920
921 SmallVector<Value, 8> origLoopIVs;
922 extractForInductionVars(input, &origLoopIVs);
923
924 // Set loop bounds for the tiled loop nest.
925 constructParametricallyTiledIndexSetHyperRect(origLoops, tiledLoops,
926 tileSizes);
927
928 // Replace original IVs with intra-tile loop IVs.
929 for (unsigned i = 0; i < width; i++)
930 origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());
931
932 // Erase the old loop nest.
933 rootAffineForOp.erase();
934
935 if (tiledNest)
936 *tiledNest = std::move(tiledLoops);
937
938 return success();
939 }
940
941 /// Gets the perfectly nested sequence of loops starting at the root of the
942 /// loop nest. A loop is perfectly nested iff the first op in its body is
943 /// another AffineForOp and the second op is a terminator; the walk stops at
944 /// the first loop that does not satisfy this.
945 void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
946 AffineForOp root) {
947 for (unsigned i = 0; i < std::numeric_limits<unsigned>::max(); ++i) {
948 nestedLoops.push_back(root);
949 Block &body = root.getRegion().front();
950 if (body.begin() != std::prev(body.end(), 2))
951 return;
952
953 root = dyn_cast<AffineForOp>(&body.front());
954 if (!root)
955 return;
956 }
957 }
958
959 /// Identify valid and profitable bands of loops to tile. This is currently just
960 /// a temporary placeholder to test the mechanics of tiled code generation.
961 /// Returns all maximal outermost perfect loop nests to tile.
962 void mlir::getTileableBands(func::FuncOp f,
963 std::vector<SmallVector<AffineForOp, 6>> *bands) {
964 // Get maximal perfect nest of 'affine.for' insts starting from root
965 // (inclusive).
966 for (AffineForOp forOp : f.getOps<AffineForOp>()) {
967 SmallVector<AffineForOp, 6> band;
968 getPerfectlyNestedLoops(band, forOp);
969 bands->push_back(band);
970 }
971 }
972
973 /// Unrolls this loop completely.
974 LogicalResult mlir::loopUnrollFull(AffineForOp forOp) {
975 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
976 if (mayBeConstantTripCount.has_value()) {
977 uint64_t tripCount = mayBeConstantTripCount.value();
978 if (tripCount == 0)
979 return success();
980 if (tripCount == 1)
981 return promoteIfSingleIteration(forOp);
982 return loopUnrollByFactor(forOp, tripCount);
983 }
984 return failure();
985 }
986
987 /// Unrolls this loop by the specified factor or by the trip count (if
988 /// constant), whichever is lower.
989 LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp,
990 uint64_t unrollFactor) {
991 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
992 if (mayBeConstantTripCount.has_value() &&
993 mayBeConstantTripCount.value() < unrollFactor)
994 return loopUnrollByFactor(forOp, mayBeConstantTripCount.value());
995 return loopUnrollByFactor(forOp, unrollFactor);
996 }
997
998 /// Generates unrolled copies of the loop body in 'loopBodyBlock', with the
999 /// associated induction variable 'forOpIV', by 'unrollFactor', calling
1000 /// 'ivRemapFn' to remap 'forOpIV' for each unrolled body. If specified,
1001 /// annotates the ops in each unrolled iteration using 'annotateFn'.
1002 static void generateUnrolledLoop(
1003 Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
1004 function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
1005 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1006 ValueRange iterArgs, ValueRange yieldedValues) {
1007 // Builder to insert unrolled bodies just before the terminator of the body of
1008 // 'forOp'.
1009 auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
1010
1011 if (!annotateFn)
1012 annotateFn = [](unsigned, Operation *, OpBuilder) {};
1013
1014 // Keep a pointer to the last non-terminator operation in the original block
1015 // so that we know what to clone (since we are doing this in-place).
1016 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
1017
1018 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
1019 SmallVector<Value, 4> lastYielded(yieldedValues);
1020
1021 for (unsigned i = 1; i < unrollFactor; i++) {
1022 BlockAndValueMapping operandMap;
1023
1024 // Prepare operand map.
1025 operandMap.map(iterArgs, lastYielded);
1026
1027 // If the induction variable is used, create a remapping to the value for
1028 // this unrolled instance.
1029 if (!forOpIV.use_empty()) {
1030 Value ivUnroll = ivRemapFn(i, forOpIV, builder);
1031 operandMap.map(forOpIV, ivUnroll);
1032 }
1033
1034 // Clone the original body of 'forOp'.
1035 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
1036 Operation *clonedOp = builder.clone(*it, operandMap);
1037 annotateFn(i, clonedOp, builder);
1038 }
1039
1040 // Update yielded values.
1041 for (unsigned i = 0, e = lastYielded.size(); i < e; i++)
1042 lastYielded[i] = operandMap.lookup(yieldedValues[i]);
1043 }
1044
1045 // Make sure we annotate the Ops in the original body. We do this last so that
1046 // any annotations are not copied into the cloned Ops above.
1047 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
1048 annotateFn(0, &*it, builder);
1049
1050 // Update operands of the yield statement.
1051 loopBodyBlock->getTerminator()->setOperands(lastYielded);
1052 }
1053
1054 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
1055 /// count is not a multiple of `unrollFactor`.
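/// For example (illustrative, not from the original source), unrolling
///   affine.for %i = 0 to 10
/// by a factor of 3 adjusts the main loop to cover [0, 9) and creates a
/// cleanup copy covering [9, 10), which is then promoted since it runs a
/// single iteration.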
1056 static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
1057 uint64_t unrollFactor) {
1058 // Insert the cleanup loop right after 'forOp'.
1059 OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
1060 auto cleanupForOp = cast<AffineForOp>(builder.clone(*forOp));
1061
1062 // Update uses of `forOp` results. `cleanupForOp` should use `forOp` result
1063 // and produce results for the original users of `forOp` results.
1064 auto results = forOp.getResults();
1065 auto cleanupResults = cleanupForOp.getResults();
1066 auto cleanupIterOperands = cleanupForOp.getIterOperands();
1067
1068 for (auto e : llvm::zip(results, cleanupResults, cleanupIterOperands)) {
1069 std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
1070 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
1071 }
1072
1073 AffineMap cleanupMap;
1074 SmallVector<Value, 4> cleanupOperands;
1075 getCleanupLoopLowerBound(forOp, unrollFactor, cleanupMap, cleanupOperands);
1076 if (!cleanupMap)
1077 return failure();
1078
1079 cleanupForOp.setLowerBound(cleanupOperands, cleanupMap);
1080 // Promote the loop body up if this has turned into a single iteration loop.
1081 (void)promoteIfSingleIteration(cleanupForOp);
1082
1083 // Adjust upper bound of the original loop; this is the same as the lower
1084 // bound of the cleanup loop.
1085 forOp.setUpperBound(cleanupOperands, cleanupMap);
1086 return success();
1087 }
1088
1089 /// Unrolls this loop by the specified factor. Returns success if the loop
1090 /// is successfully unrolled.
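/// Sketch of the effect (assumed example, not from the original source): for
///   affine.for %i = 0 to 16 step 1 { "op"(%i) }
/// unrolled by 4, the step becomes 4 and the body holds four copies of "op"
/// applied to %i, %i + 1, %i + 2 and %i + 3, the latter three obtained via
/// affine.apply on the IV.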
1091 LogicalResult mlir::loopUnrollByFactor(
1092 AffineForOp forOp, uint64_t unrollFactor,
1093 function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
1094 assert(unrollFactor > 0 && "unroll factor should be positive");
1095
1096 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1097 if (unrollFactor == 1) {
1098 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1099 failed(promoteIfSingleIteration(forOp)))
1100 return failure();
1101 return success();
1102 }
1103
1104 // Nothing in the loop body other than the terminator.
1105 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1106 return success();
1107
1108 // If the trip count is lower than the unroll factor, no unrolled body.
1109 // TODO: option to specify cleanup loop unrolling.
1110 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollFactor)
1111 return failure();
1112
1113 // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
1114 if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
1115 // Loops where the lower bound is a max expression or the upper bound is
1116     // a min expression and the unroll factor doesn't divide the trip count
1117 // can't be unrolled since the lower bound of the cleanup loop in such cases
1118 // cannot be expressed as an affine function or a max over affine functions.
1119 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1120 forOp.getUpperBoundMap().getNumResults() != 1)
1121 return failure();
1122 if (failed(generateCleanupLoopForUnroll(forOp, unrollFactor)))
1123 assert(false && "cleanup loop lower bound map for single result lower "
1124 "and upper bound maps can always be determined");
1125 }
1126
1127 ValueRange iterArgs(forOp.getRegionIterArgs());
1128 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1129
1130 // Scale the step of loop being unrolled by unroll factor.
1131 int64_t step = forOp.getStep();
1132 forOp.setStep(step * unrollFactor);
1133 generateUnrolledLoop(
1134 forOp.getBody(), forOp.getInductionVar(), unrollFactor,
1135 [&](unsigned i, Value iv, OpBuilder b) {
1136 // iv' = iv + i * step
1137 auto d0 = b.getAffineDimExpr(0);
1138 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1139 return b.create<AffineApplyOp>(forOp.getLoc(), bumpMap, iv);
1140 },
1141 /*annotateFn=*/annotateFn,
1142 /*iterArgs=*/iterArgs, /*yieldedValues=*/yieldedValues);
1143
1144 // Promote the loop body up if this has turned into a single iteration loop.
1145 (void)promoteIfSingleIteration(forOp);
1146 return success();
1147 }
1148
1149 LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp,
1150 uint64_t unrollJamFactor) {
1151 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1152 if (mayBeConstantTripCount.has_value() &&
1153 mayBeConstantTripCount.value() < unrollJamFactor)
1154 return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.value());
1155 return loopUnrollJamByFactor(forOp, unrollJamFactor);
1156 }
1157
1158 /// Check if all control operands of all loops are defined outside of `forOp`
1159 /// and return false if not.
1160 static bool areInnerBoundsInvariant(AffineForOp forOp) {
1161 auto walkResult = forOp.walk([&](AffineForOp aForOp) {
1162 for (auto controlOperand : aForOp.getControlOperands()) {
1163 if (!forOp.isDefinedOutsideOfLoop(controlOperand))
1164 return WalkResult::interrupt();
1165 }
1166 return WalkResult::advance();
1167 });
1168 return !walkResult.wasInterrupted();
1169 }
1170
1171 // Gathers all maximal sub-blocks of operations that do not themselves
1172 // include a for op (an operation could still have a descendant for op in its
1173 // tree). Ignores the block terminators.
1174 struct JamBlockGatherer {
1175 // Store iterators to the first and last op of each sub-block found.
1176 std::vector<std::pair<Block::iterator, Block::iterator>> subBlocks;
1177
1178 // This is a linear time walk.
1179   void walk(Operation *op) {
1180     for (auto &region : op->getRegions())
1181 for (auto &block : region)
1182 walk(block);
1183 }
1184
1185   void walk(Block &block) {
1186 for (auto it = block.begin(), e = std::prev(block.end()); it != e;) {
1187 auto subBlockStart = it;
1188 while (it != e && !isa<AffineForOp>(&*it))
1189 ++it;
1190 if (it != subBlockStart)
1191 subBlocks.emplace_back(subBlockStart, std::prev(it));
1192 // Process all for ops that appear next.
1193 while (it != e && isa<AffineForOp>(&*it))
1194 walk(&*it++);
1195 }
1196 }
1197 };
1198
1199 /// Unrolls and jams this loop by the specified factor.
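/// Rough illustration (assumed, not from the original source): unroll-and-jam
/// by 2 of
///   affine.for %i = 0 to 8 {
///     affine.for %j = 0 to 8 { "op"(%i, %j) }
///   }
/// doubles the step of %i and places two copies of the inner-loop body inside
/// the single inner %j loop, one reading %i and the other %i + 1.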
1200 LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp,
1201 uint64_t unrollJamFactor) {
1202 assert(unrollJamFactor > 0 && "unroll jam factor should be positive");
1203
1204 Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
1205 if (unrollJamFactor == 1) {
1206 if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
1207 failed(promoteIfSingleIteration(forOp)))
1208 return failure();
1209 return success();
1210 }
1211
1212 // Nothing in the loop body other than the terminator.
1213 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1214 return success();
1215
1216 // If the trip count is lower than the unroll jam factor, no unroll jam.
1217 if (mayBeConstantTripCount && *mayBeConstantTripCount < unrollJamFactor) {
1218 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1219 return failure();
1220 }
1221
1222 // If any control operand of any inner loop of `forOp` is defined within
1223 // `forOp`, no unroll jam.
1224 if (!areInnerBoundsInvariant(forOp))
1225 return failure();
1226
1227 // Gather all sub-blocks to jam upon the loop being unrolled.
1228 JamBlockGatherer jbg;
1229 jbg.walk(forOp);
1230 auto &subBlocks = jbg.subBlocks;
1231
1232 // Collect loops with iter_args.
1233 SmallVector<AffineForOp, 4> loopsWithIterArgs;
1234 forOp.walk([&](AffineForOp aForOp) {
1235 if (aForOp.getNumIterOperands() > 0)
1236 loopsWithIterArgs.push_back(aForOp);
1237 });
1238
1239 // Get supported reductions to be used for creating reduction ops at the end.
1240 SmallVector<LoopReduction> reductions;
1241 if (forOp.getNumIterOperands() > 0)
1242 getSupportedReductions(forOp, reductions);
1243
1244 // Generate the cleanup loop if trip count isn't a multiple of
1245 // unrollJamFactor.
1246 if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) {
1247 // Loops where the lower bound is a max expression or the upper bound is
1248     // a min expression and the unroll factor doesn't divide the trip count
1249 // can't be unrolled since the lower bound of the cleanup loop in such cases
1250 // cannot be expressed as an affine function or a max over affine functions.
1251 if (forOp.getLowerBoundMap().getNumResults() != 1 ||
1252 forOp.getUpperBoundMap().getNumResults() != 1)
1253 return failure();
1254 if (failed(generateCleanupLoopForUnroll(forOp, unrollJamFactor)))
1255 assert(false && "cleanup loop lower bound map for single result lower "
1256 "and upper bound maps can always be determined");
1257 }
1258
1259 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1260 // iteration. There are (`unrollJamFactor` - 1) iterations.
1261 SmallVector<BlockAndValueMapping, 4> operandMaps(unrollJamFactor - 1);
1262
1263 // For any loop with iter_args, replace it with a new loop that has
1264 // `unrollJamFactor` copies of its iterOperands, iter_args and yield
1265 // operands.
1266 SmallVector<AffineForOp, 4> newLoopsWithIterArgs;
1267 OpBuilder builder(forOp.getContext());
1268 for (AffineForOp oldForOp : loopsWithIterArgs) {
1269 SmallVector<Value, 4> dupIterOperands, dupIterArgs, dupYieldOperands;
1270 ValueRange oldIterOperands = oldForOp.getIterOperands();
1271 ValueRange oldIterArgs = oldForOp.getRegionIterArgs();
1272 ValueRange oldYieldOperands =
1273 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1274 // Get additional iterOperands, iterArgs, and yield operands. We will
1275 // fix iterOperands and yield operands after cloning of sub-blocks.
1276 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1277 dupIterOperands.append(oldIterOperands.begin(), oldIterOperands.end());
1278 dupIterArgs.append(oldIterArgs.begin(), oldIterArgs.end());
1279 dupYieldOperands.append(oldYieldOperands.begin(), oldYieldOperands.end());
1280 }
1281 // Create a new loop with additional iterOperands, iter_args and yield
1282 // operands. This new loop will take the loop body of the original loop.
1283 AffineForOp newForOp = mlir::replaceForOpWithNewYields(
1284 builder, oldForOp, dupIterOperands, dupYieldOperands, dupIterArgs);
1285 newLoopsWithIterArgs.push_back(newForOp);
1286 // `forOp` has been replaced with a new loop.
1287 if (oldForOp == forOp)
1288 forOp = newForOp;
1289 assert(oldForOp.use_empty() && "old for op should not have any user");
1290 oldForOp.erase();
1291 // Update `operandMaps` for `newForOp` iterArgs and results.
1292 ValueRange newIterArgs = newForOp.getRegionIterArgs();
1293 unsigned oldNumIterArgs = oldIterArgs.size();
1294 ValueRange newResults = newForOp.getResults();
1295 unsigned oldNumResults = newResults.size() / unrollJamFactor;
1296 assert(oldNumIterArgs == oldNumResults &&
1297 "oldNumIterArgs must be the same as oldNumResults");
1298 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1299 for (unsigned j = 0; j < oldNumIterArgs; ++j) {
1300 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1301 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1302 // to those in the `i`th new set.
1303 operandMaps[i - 1].map(newIterArgs[j],
1304 newIterArgs[i * oldNumIterArgs + j]);
1305 operandMaps[i - 1].map(newResults[j],
1306 newResults[i * oldNumResults + j]);
1307 }
1308 }
1309 }
1310
1311   // Scale the step of the loop being unroll-jammed by the unroll-jam factor.
1312 int64_t step = forOp.getStep();
1313 forOp.setStep(step * unrollJamFactor);
1314
1315 auto forOpIV = forOp.getInductionVar();
1316 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1317 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1318 for (auto &subBlock : subBlocks) {
1319 // Builder to insert unroll-jammed bodies. Insert right at the end of
1320 // sub-block.
1321 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1322
1323 // If the induction variable is used, create a remapping to the value for
1324 // this unrolled instance.
1325 if (!forOpIV.use_empty()) {
1326 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1327 auto d0 = builder.getAffineDimExpr(0);
1328 auto bumpMap = AffineMap::get(1, 0, d0 + i * step);
1329 auto ivUnroll =
1330 builder.create<AffineApplyOp>(forOp.getLoc(), bumpMap, forOpIV);
1331 operandMaps[i - 1].map(forOpIV, ivUnroll);
1332 }
1333 // Clone the sub-block being unroll-jammed.
1334 for (auto it = subBlock.first; it != std::next(subBlock.second); ++it)
1335 builder.clone(*it, operandMaps[i - 1]);
1336 }
1337 // Fix iterOperands and yield op operands of newly created loops.
1338 for (auto newForOp : newLoopsWithIterArgs) {
1339 unsigned oldNumIterOperands =
1340 newForOp.getNumIterOperands() / unrollJamFactor;
1341 unsigned numControlOperands = newForOp.getNumControlOperands();
1342 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1343 unsigned oldNumYieldOperands = yieldOp.getNumOperands() / unrollJamFactor;
1344 assert(oldNumIterOperands == oldNumYieldOperands &&
1345 "oldNumIterOperands must be the same as oldNumYieldOperands");
1346 for (unsigned j = 0; j < oldNumIterOperands; ++j) {
1347 // The `i`th duplication of an old iterOperand or yield op operand
1348 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1349 // if such mapped value exists.
1350 newForOp.setOperand(numControlOperands + i * oldNumIterOperands + j,
1351 operandMaps[i - 1].lookupOrDefault(
1352 newForOp.getOperand(numControlOperands + j)));
1353 yieldOp.setOperand(
1354 i * oldNumYieldOperands + j,
1355 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1356 }
1357 }
1358 }
1359 if (forOp.getNumResults() > 0) {
1360 // Create reduction ops to combine every `unrollJamFactor` related results
1361 // into one value. For example, for %0:2 = affine.for ... and addf, we add
1362 // %1 = arith.addf %0#0, %0#1, and replace the following uses of %0#0 with
1363 // %1.
1364 builder.setInsertionPointAfter(forOp);
1365 auto loc = forOp.getLoc();
1366 unsigned oldNumResults = forOp.getNumResults() / unrollJamFactor;
1367 for (LoopReduction &reduction : reductions) {
1368 unsigned pos = reduction.iterArgPosition;
1369 Value lhs = forOp.getResult(pos);
1370 Value rhs;
1371 SmallPtrSet<Operation *, 4> newOps;
1372 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1373 rhs = forOp.getResult(i * oldNumResults + pos);
1374 // Create ops based on reduction type.
1375 lhs = arith::getReductionOp(reduction.kind, builder, loc, lhs, rhs);
1376 if (!lhs)
1377 return failure();
1378 Operation *op = lhs.getDefiningOp();
1379 assert(op && "Reduction op should have been created");
1380 newOps.insert(op);
1381 }
1382 // Replace all uses except those in newly created reduction ops.
1383 forOp.getResult(pos).replaceAllUsesExcept(lhs, newOps);
1384 }
1385 }
1386
1387 // Promote the loop body up if this has turned into a single iteration loop.
1388 (void)promoteIfSingleIteration(forOp);
1389 return success();
1390 }
1391
1392 /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
1393 /// nested within 'forOpA' as the only non-terminator operation in its block.
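/// As a minimal illustrative sketch, the interchange rewrites
///
/// ```mlir
/// affine.for %i = 0 to %N {
///   affine.for %j = 0 to %M {
///     "use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```
///
/// into
///
/// ```mlir
/// affine.for %j = 0 to %M {
///   affine.for %i = 0 to %N {
///     "use"(%i, %j) : (index, index) -> ()
///   }
/// }
/// ```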
1394 void mlir::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) {
1395 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1396 auto &forOpABody = forOpA.getBody()->getOperations();
1397 auto &forOpBBody = forOpB.getBody()->getOperations();
1398
1399 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1400   // before forOpA (in forOpA's parent's block); this should leave forOpA's
1401 // body containing only the terminator.
1402 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1403 forOpABody, forOpABody.begin(),
1404 std::prev(forOpABody.end()));
1405 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1406 // body (this leaves forOpB's body containing only the terminator).
1407 forOpABody.splice(forOpABody.begin(), forOpBBody, forOpBBody.begin(),
1408 std::prev(forOpBBody.end()));
1409 // 3) Splice forOpA into the beginning of forOpB's body.
1410 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1411 Block::iterator(forOpA));
1412 }
1413
1414 // Checks each dependence component against the permutation to see if the
1415 // desired loop interchange would violate dependences by making the
1416 // dependence component lexicographically negative.
1417 static bool checkLoopInterchangeDependences(
1418 const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
1419 ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) {
1420 // Invert permutation map.
1421 unsigned maxLoopDepth = loops.size();
1422 SmallVector<unsigned, 4> loopPermMapInv;
1423 loopPermMapInv.resize(maxLoopDepth);
1424 for (unsigned i = 0; i < maxLoopDepth; ++i)
1425 loopPermMapInv[loopPermMap[i]] = i;
1426
1427 // Check each dependence component against the permutation to see if the
1428 // desired loop interchange permutation would make the dependence vectors
1429 // lexicographically negative.
1430 // Example 1: [-1, 1][0, 0]
1431 // Example 2: [0, 0][-1, 1]
1432 for (const auto &depComps : depCompsVec) {
1433 assert(depComps.size() >= maxLoopDepth);
1434 // Check if the first non-zero dependence component is positive.
1435 // This iterates through loops in the desired order.
1436 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1437 unsigned permIndex = loopPermMapInv[j];
1438 assert(depComps[permIndex].lb);
1439 int64_t depCompLb = *depComps[permIndex].lb;
1440 if (depCompLb > 0)
1441 break;
1442 if (depCompLb < 0)
1443 return false;
1444 }
1445 }
1446 return true;
1447 }
1448
1449 /// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
1450 /// nested sequence of loops in 'loops' would violate dependences.
1451 bool mlir::isValidLoopInterchangePermutation(ArrayRef<AffineForOp> loops,
1452 ArrayRef<unsigned> loopPermMap) {
1453 // Gather dependence components for dependences between all ops in loop nest
1454 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1455 assert(loopPermMap.size() == loops.size());
1456 unsigned maxLoopDepth = loops.size();
1457 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1458 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1459 return checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap);
1460 }
1461
1462 /// Returns true if `loops` is a perfectly nested loop nest, where loops appear
1463 /// in it from outermost to innermost.
1464 bool LLVM_ATTRIBUTE_UNUSED
1465 mlir::isPerfectlyNested(ArrayRef<AffineForOp> loops) {
1466 assert(!loops.empty() && "no loops provided");
1467
1468 // We already know that the block can't be empty.
1469 auto hasTwoElements = [](Block *block) {
1470 auto secondOpIt = std::next(block->begin());
1471 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1472 };
1473
1474 auto enclosingLoop = loops.front();
1475 for (auto loop : loops.drop_front()) {
1476 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1477 // parentForOp's body should be just this loop and the terminator.
1478 if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody()))
1479 return false;
1480 enclosingLoop = loop;
1481 }
1482 return true;
1483 }
1484
1485 // input[i] should move from position i -> permMap[i]. Returns the position in
1486 // `input` that becomes the new outermost loop.
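// As an illustrative sketch: for a perfectly nested band (%i0, %i1, %i2)
// listed outermost-in in `input`, permMap = {2, 0, 1} sends %i0 to the
// innermost position, producing the nest order (%i1, %i2, %i0), and the
// function returns 1 since input[1] (%i1) becomes the new outermost loop.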
1487 unsigned mlir::permuteLoops(MutableArrayRef<AffineForOp> input,
1488 ArrayRef<unsigned> permMap) {
1489 assert(input.size() == permMap.size() && "invalid permutation map size");
1490 // Check whether the permutation spec is valid. This is a small vector - we'll
1491 // just sort and check if it's iota.
1492 SmallVector<unsigned, 4> checkPermMap(permMap.begin(), permMap.end());
1493 llvm::sort(checkPermMap);
1494 if (llvm::any_of(llvm::enumerate(checkPermMap),
1495 [](const auto &en) { return en.value() != en.index(); }))
1496 assert(false && "invalid permutation map");
1497
1498 // Nothing to do.
1499 if (input.size() < 2)
1500 return 0;
1501
1502 assert(isPerfectlyNested(input) && "input not perfectly nested");
1503
1504 // Compute the inverse mapping, invPermMap: since input[i] goes to position
1505 // permMap[i], position i of the permuted nest is at input[invPermMap[i]].
1506 SmallVector<std::pair<unsigned, unsigned>, 4> invPermMap;
1507 for (unsigned i = 0, e = input.size(); i < e; ++i)
1508 invPermMap.push_back({permMap[i], i});
1509 llvm::sort(invPermMap);
1510
1511 // Move the innermost loop body to the loop that would be the innermost in the
1512 // permuted nest (only if the innermost loop is going to change).
1513 if (permMap.back() != input.size() - 1) {
1514 auto *destBody = input[invPermMap.back().second].getBody();
1515 auto *srcBody = input.back().getBody();
1516 destBody->getOperations().splice(destBody->begin(),
1517 srcBody->getOperations(), srcBody->begin(),
1518 std::prev(srcBody->end()));
1519 }
1520
1521 // We'll move each loop in `input` in the reverse order so that its body is
1522 // empty when we are moving it; this incurs zero copies and no erasing.
1523 for (int i = input.size() - 1; i >= 0; --i) {
1524 // If this has to become the outermost loop after permutation, add it to the
1525 // parent block of the original root.
1526 if (permMap[i] == 0) {
1527 // If the root remains the same, nothing to do.
1528 if (i == 0)
1529 continue;
1530       // Make input[i] the new outermost loop, moving it into parentBlock.
1531 auto *parentBlock = input[0]->getBlock();
1532 parentBlock->getOperations().splice(Block::iterator(input[0]),
1533 input[i]->getBlock()->getOperations(),
1534 Block::iterator(input[i]));
1535 continue;
1536 }
1537
1538 // If the parent in the permuted order is the same as in the original,
1539 // nothing to do.
1540 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1541 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1542 continue;
1543
1544 // Move input[i] to its surrounding loop in the transformed nest.
1545 auto *destBody = input[parentPosInInput].getBody();
1546 destBody->getOperations().splice(destBody->begin(),
1547 input[i]->getBlock()->getOperations(),
1548 Block::iterator(input[i]));
1549 }
1550
1551 return invPermMap[0].second;
1552 }
1553
1554 // Sinks all sequential loops to the innermost levels (while preserving
1555 // relative order among them) and moves all parallel loops to the
1556 // outermost (while again preserving relative order among them).
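// As an illustrative sketch: for a nest (%i, %j, %k) in which only %j carries
// a non-zero dependence component, %i and %k are treated as parallel and kept
// outermost in their original relative order while %j is sunk innermost,
// yielding (%i, %k, %j), provided the dependence check below passes.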
1557 AffineForOp mlir::sinkSequentialLoops(AffineForOp forOp) {
1558 SmallVector<AffineForOp, 4> loops;
1559 getPerfectlyNestedLoops(loops, forOp);
1560 if (loops.size() < 2)
1561 return forOp;
1562
1563 // Gather dependence components for dependences between all ops in loop nest
1564 // rooted at 'loops[0]', at loop depths in range [1, maxLoopDepth].
1565 unsigned maxLoopDepth = loops.size();
1566 std::vector<SmallVector<DependenceComponent, 2>> depCompsVec;
1567 getDependenceComponents(loops[0], maxLoopDepth, &depCompsVec);
1568
1569 // Mark loops as either parallel or sequential.
1570 SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
1571 for (auto &depComps : depCompsVec) {
1572 assert(depComps.size() >= maxLoopDepth);
1573 for (unsigned j = 0; j < maxLoopDepth; ++j) {
1574 DependenceComponent &depComp = depComps[j];
1575 assert(depComp.lb.has_value() && depComp.ub.has_value());
1576 if (depComp.lb.value() != 0 || depComp.ub.value() != 0)
1577 isParallelLoop[j] = false;
1578 }
1579 }
1580
1581 // Count the number of parallel loops.
1582 unsigned numParallelLoops = 0;
1583 for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i)
1584 if (isParallelLoop[i])
1585 ++numParallelLoops;
1586
1587 // Compute permutation of loops that sinks sequential loops (and thus raises
1588 // parallel loops) while preserving relative order.
1589 SmallVector<unsigned, 4> loopPermMap(maxLoopDepth);
1590 unsigned nextSequentialLoop = numParallelLoops;
1591 unsigned nextParallelLoop = 0;
1592 for (unsigned i = 0; i < maxLoopDepth; ++i) {
1593 if (isParallelLoop[i]) {
1594 loopPermMap[i] = nextParallelLoop++;
1595 } else {
1596 loopPermMap[i] = nextSequentialLoop++;
1597 }
1598 }
1599
1600 // Check if permutation 'loopPermMap' would violate dependences.
1601 if (!checkLoopInterchangeDependences(depCompsVec, loops, loopPermMap))
1602 return forOp;
1603 // Perform loop interchange according to permutation 'loopPermMap'.
1604 unsigned loopNestRootIndex = permuteLoops(loops, loopPermMap);
1605 return loops[loopNestRootIndex];
1606 }
1607
1608 // Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
1609 // lower (resp. upper) loop bound. When called for both the lower and upper
1610 // bounds, the resulting IR resembles:
1611 //
1612 // ```mlir
1613 // affine.for %i = max(`iv`, ...) to min(`iv` + `offset`, ...) {
1614 // ...
1615 // }
1616 // ```
1617 static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
1618 SmallVector<Value, 4> *operands,
1619 int64_t offset = 0) {
1620 auto bounds = llvm::to_vector<4>(map->getResults());
1621 bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1622 operands->insert(operands->begin() + map->getNumDims(), iv);
1623 *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1624 b.getContext());
1625 canonicalizeMapAndOperands(map, operands);
1626 }
1627
1628 // Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
1629 // Stripmine-sink is a primitive building block for generalized tiling of
1630 // imperfectly nested loops.
1631 // This transformation is purely mechanical and does not check legality,
1632 // profitability or even structural correctness. It is the user's
1633 // responsibility to specify `targets` that are dominated by `forOp`.
1634 // Returns the new AffineForOps, one per `targets`, nested immediately under
1635 // each of the `targets`.
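// As an illustrative sketch (bounds shown schematically) with factor = 32 and
// `forOp` having bounds 0 to 1024 with unit step: the step of `forOp` is
// scaled to 32, and inside each target a new loop
//   affine.for %ii = max(0, %i) to min(1024, %i + 32)
// is created just before the target's terminator; the target's original body
// is moved into the new loop and uses of %i in that body are remapped to %ii.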
1636 static SmallVector<AffineForOp, 8>
1637 stripmineSink(AffineForOp forOp, uint64_t factor,
1638 ArrayRef<AffineForOp> targets) {
1639 auto originalStep = forOp.getStep();
1640 auto scaledStep = originalStep * factor;
1641 forOp.setStep(scaledStep);
1642
1643 OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1644
1645 // Lower-bound map creation.
1646 auto lbMap = forOp.getLowerBoundMap();
1647 SmallVector<Value, 4> lbOperands(forOp.getLowerBoundOperands());
1648 augmentMapAndBounds(b, forOp.getInductionVar(), &lbMap, &lbOperands);
1649
1650 // Upper-bound map creation.
1651 auto ubMap = forOp.getUpperBoundMap();
1652 SmallVector<Value, 4> ubOperands(forOp.getUpperBoundOperands());
1653 augmentMapAndBounds(b, forOp.getInductionVar(), &ubMap, &ubOperands,
1654 /*offset=*/scaledStep);
1655
1656 auto iv = forOp.getInductionVar();
1657 SmallVector<AffineForOp, 8> innerLoops;
1658 for (auto t : targets) {
1659 // Insert newForOp before the terminator of `t`.
1660 auto b = OpBuilder::atBlockTerminator(t.getBody());
1661 auto newForOp = b.create<AffineForOp>(t.getLoc(), lbOperands, lbMap,
1662 ubOperands, ubMap, originalStep);
1663 auto begin = t.getBody()->begin();
1664 // Skip terminator and `newForOp` which is just before the terminator.
1665 auto nOps = t.getBody()->getOperations().size() - 2;
1666 newForOp.getBody()->getOperations().splice(
1667 newForOp.getBody()->getOperations().begin(),
1668 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1669 replaceAllUsesInRegionWith(iv, newForOp.getInductionVar(),
1670 newForOp.getRegion());
1671 innerLoops.push_back(newForOp);
1672 }
1673
1674 return innerLoops;
1675 }
1676
1677 // Stripmines a `forOp` by `factor` and sinks it under a single `target`.
1678 // Returns the new AffineForOp, nested immediately under `target`.
1679 template <typename SizeType>
1680 static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
1681 AffineForOp target) {
1682 // TODO: Use cheap structural assertions that targets are nested under
1683 // forOp and that targets are not nested under each other when DominanceInfo
1684 // exposes the capability. It seems overkill to construct a whole function
1685 // dominance tree at this point.
1686 auto res = stripmineSink(forOp, factor, ArrayRef<AffineForOp>(target));
1687 assert(res.size() == 1 && "Expected 1 inner forOp");
1688 return res[0];
1689 }
1690
1691 SmallVector<SmallVector<AffineForOp, 8>, 8>
1692 mlir::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
1693 ArrayRef<AffineForOp> targets) {
1694 SmallVector<SmallVector<AffineForOp, 8>, 8> res;
1695 SmallVector<AffineForOp, 8> currentTargets(targets.begin(), targets.end());
1696 for (auto it : llvm::zip(forOps, sizes)) {
1697 auto step = stripmineSink(std::get<0>(it), std::get<1>(it), currentTargets);
1698 res.push_back(step);
1699 currentTargets = step;
1700 }
1701 return res;
1702 }
1703
1704 SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
1705 ArrayRef<uint64_t> sizes,
1706 AffineForOp target) {
1707 SmallVector<AffineForOp, 8> res;
1708 for (auto loops : tile(forOps, sizes, ArrayRef<AffineForOp>(target))) {
1709 assert(loops.size() == 1);
1710 res.push_back(loops[0]);
1711 }
1712 return res;
1713 }
1714
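// Coalesces a perfectly nested band of normalized loops (unit step, zero
// lower bound) into a single loop. As an illustrative sketch, a 2-d nest
//   affine.for %i = 0 to %ub0 {
//     affine.for %j = 0 to %ub1 { "use"(%i, %j) : (index, index) -> () }
//   }
// becomes one loop of %ub0 * %ub1 iterations whose induction variable %iv is
// de-linearized inside the body as %i = %iv floordiv %ub1 and
// %j = %iv mod %ub1.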
1715 LogicalResult mlir::coalesceLoops(MutableArrayRef<AffineForOp> loops) {
1716 if (loops.size() < 2)
1717 return success();
1718
1719 AffineForOp innermost = loops.back();
1720 AffineForOp outermost = loops.front();
1721 AffineBound ub = outermost.getUpperBound();
1722 AffineMap origUbMap = ub.getMap();
1723 Location loc = outermost.getLoc();
1724 OpBuilder builder(outermost);
1725 for (AffineForOp loop : loops) {
1726 // We only work on normalized loops.
1727 if (loop.getStep() != 1 || !loop.hasConstantLowerBound() ||
1728 loop.getConstantLowerBound() != 0)
1729 return failure();
1730 }
1731 SmallVector<Value, 4> upperBoundSymbols;
1732 SmallVector<Value, 4> ubOperands(ub.getOperands().begin(),
1733 ub.getOperands().end());
1734
1735 // 1. Store the upper bound of the outermost loop in a variable.
1736 Value prev;
1737 if (!llvm::hasSingleElement(origUbMap.getResults()))
1738 prev = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1739 else
1740 prev = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1741 upperBoundSymbols.push_back(prev);
1742
1743 // 2. Emit code computing the upper bound of the coalesced loop as product of
1744 // the number of iterations of all loops.
1745 for (AffineForOp loop : loops.drop_front()) {
1746 ub = loop.getUpperBound();
1747 origUbMap = ub.getMap();
1748 ubOperands = ub.getOperands();
1749 Value upperBound;
1750 // If upper bound map has more than one result, take their minimum.
1751 if (!llvm::hasSingleElement(origUbMap.getResults()))
1752 upperBound = builder.create<AffineMinOp>(loc, origUbMap, ubOperands);
1753 else
1754 upperBound = builder.create<AffineApplyOp>(loc, origUbMap, ubOperands);
1755 upperBoundSymbols.push_back(upperBound);
1756 SmallVector<Value, 4> operands;
1757 operands.push_back(prev);
1758 operands.push_back(upperBound);
1759 // Maintain running product of loop upper bounds.
1760 prev = builder.create<AffineApplyOp>(
1761 loc,
1762 AffineMap::get(/*numDims=*/1,
1763 /*numSymbols=*/1,
1764 builder.getAffineDimExpr(0) *
1765 builder.getAffineSymbolExpr(0)),
1766 operands);
1767 }
1768 // Set upper bound of the coalesced loop.
1769 AffineMap newUbMap = AffineMap::get(
1770 /*numDims=*/0,
1771 /*numSymbols=*/1, builder.getAffineSymbolExpr(0), builder.getContext());
1772 outermost.setUpperBound(prev, newUbMap);
1773
1774 builder.setInsertionPointToStart(outermost.getBody());
1775
1776 // 3. Remap induction variables. For each original loop, the value of the
1777 // induction variable can be obtained by dividing the induction variable of
1778 // the linearized loop by the total number of iterations of the loops nested
1779 // in it modulo the number of iterations in this loop (remove the values
1780 // related to the outer loops):
1781 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1782 // Compute these iteratively from the innermost loop by creating a "running
1783 // quotient" of division by the range.
1784 Value previous = outermost.getInductionVar();
1785 for (unsigned idx = loops.size(); idx > 0; --idx) {
1786 if (idx != loops.size()) {
1787 SmallVector<Value, 4> operands;
1788 operands.push_back(previous);
1789 operands.push_back(upperBoundSymbols[idx]);
1790 previous = builder.create<AffineApplyOp>(
1791 loc,
1792 AffineMap::get(
1793 /*numDims=*/1, /*numSymbols=*/1,
1794 builder.getAffineDimExpr(0).floorDiv(
1795 builder.getAffineSymbolExpr(0))),
1796 operands);
1797 }
1798 // Modified value of the induction variables of the nested loops after
1799 // coalescing.
1800 Value inductionVariable;
1801 if (idx == 1) {
1802 inductionVariable = previous;
1803 } else {
1804 SmallVector<Value, 4> applyOperands;
1805 applyOperands.push_back(previous);
1806 applyOperands.push_back(upperBoundSymbols[idx - 1]);
1807 inductionVariable = builder.create<AffineApplyOp>(
1808 loc,
1809 AffineMap::get(
1810 /*numDims=*/1, /*numSymbols=*/1,
1811 builder.getAffineDimExpr(0) % builder.getAffineSymbolExpr(0)),
1812 applyOperands);
1813 }
1814 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
1815 inductionVariable, loops.back().getRegion());
1816 }
1817
1818   // 4. Move the operations from the innermost loop's body to just before the
1819   // second-outermost loop, then delete the extra terminator and that loop.
1820 AffineForOp secondOutermostLoop = loops[1];
1821 innermost.getBody()->back().erase();
1822 outermost.getBody()->getOperations().splice(
1823 Block::iterator(secondOutermostLoop.getOperation()),
1824 innermost.getBody()->getOperations());
1825 secondOutermostLoop.erase();
1826 return success();
1827 }
1828
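// Distributes the iterations of `forOp` across a (possibly multi-dimensional)
// grid of processors in a cyclic fashion. As an illustrative sketch for a 1-d
// grid:
//   scf.for %i = %lb to %ub step %s
// becomes
//   scf.for %i = %lb + %pid * %s to %ub step %s * %nprocs
// so that each processor %pid visits an interleaved, disjoint subset of the
// original iterations.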
1829 void mlir::mapLoopToProcessorIds(scf::ForOp forOp, ArrayRef<Value> processorId,
1830 ArrayRef<Value> numProcessors) {
1831 assert(processorId.size() == numProcessors.size());
1832 if (processorId.empty())
1833 return;
1834
1835 OpBuilder b(forOp);
1836 Location loc(forOp.getLoc());
1837 AffineExpr lhs, rhs;
1838 bindSymbols(forOp.getContext(), lhs, rhs);
1839 auto mulMap = AffineMap::get(0, 2, lhs * rhs);
1840 auto addMap = AffineMap::get(0, 2, lhs + rhs);
1841
1842 Value linearIndex = processorId.front();
1843 for (unsigned i = 1, e = processorId.size(); i < e; ++i) {
1844 auto mulApplyOp = b.create<AffineApplyOp>(
1845 loc, mulMap, ValueRange{linearIndex, numProcessors[i]});
1846 linearIndex = b.create<AffineApplyOp>(
1847 loc, addMap, ValueRange{mulApplyOp, processorId[i]});
1848 }
1849
1850 auto mulApplyOp = b.create<AffineApplyOp>(
1851 loc, mulMap, ValueRange{linearIndex, forOp.getStep()});
1852 Value lb = b.create<AffineApplyOp>(
1853 loc, addMap, ValueRange{mulApplyOp, forOp.getLowerBound()});
1854 forOp.setLowerBound(lb);
1855
1856 Value step = forOp.getStep();
1857 for (auto numProcs : numProcessors)
1858 step = b.create<AffineApplyOp>(loc, mulMap, ValueRange{numProcs, step});
1859 forOp.setStep(step);
1860 }
1861
1862 /// Given a memref region, determine the lowest depth at which transfers can be
1863 /// placed for it, and return the corresponding block, start and end positions
1864 /// in the block for placing incoming (read) and outgoing (write) copies
1865 /// respectively. The lowest depth depends on whether the region being accessed
1866 /// is hoistable with respect to one or more immediately surrounding loops.
1867 static void
1868 findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
1869 Block::iterator &begin, Block::iterator &end,
1870 Block **copyPlacementBlock,
1871 Block::iterator *copyInPlacementStart,
1872 Block::iterator *copyOutPlacementStart) {
1873 const auto *cst = region.getConstraints();
1874 SmallVector<Value, 4> symbols;
1875 cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1876
1877 SmallVector<AffineForOp, 4> enclosingFors;
1878 getLoopIVs(*block.begin(), &enclosingFors);
1879 // Walk up loop parents till we find an IV on which this region is
1880 // symbolic/variant.
1881 auto it = enclosingFors.rbegin();
1882 for (auto e = enclosingFors.rend(); it != e; ++it) {
1883     // TODO: also need to be checking this for region symbols that
1884 // aren't loop IVs, whether we are within their resp. defs' dominance scope.
1885 if (llvm::is_contained(symbols, it->getInductionVar()))
1886 break;
1887 }
1888
1889 if (it != enclosingFors.rbegin()) {
1890 auto lastInvariantIV = *std::prev(it);
1891 *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation());
1892 *copyOutPlacementStart = std::next(*copyInPlacementStart);
1893 *copyPlacementBlock = lastInvariantIV->getBlock();
1894 } else {
1895 *copyInPlacementStart = begin;
1896 *copyOutPlacementStart = end;
1897 *copyPlacementBlock = █
1898 }
1899 }
1900
1901 // Info comprising stride and number of elements transferred every stride.
1902 struct StrideInfo {
1903 int64_t stride;
1904 int64_t numEltPerStride;
1905 };
1906
1907 /// Returns striding information for a copy/transfer of this region with
1908 /// potentially multiple striding levels from outermost to innermost. For an
1909 /// n-dimensional region, there can be at most n-1 levels of striding
1910 /// successively nested.
1911 // TODO: make this work with non-identity layout maps.
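// As an illustrative worked example with hypothetical shapes: for a 64x32
// region of a memref<64x128xf32>, the loop below records a single stride
// level {stride = 128, numEltPerStride = 32}, since the region is narrower
// than the memref along the innermost dimension (32 < 128) while its extent
// along the outer dimension (64) is greater than one.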
1912 static void getMultiLevelStrides(const MemRefRegion &region,
1913 ArrayRef<int64_t> bufferShape,
1914 SmallVectorImpl<StrideInfo> *strideInfos) {
1915 if (bufferShape.size() <= 1)
1916 return;
1917
1918 int64_t numEltPerStride = 1;
1919 int64_t stride = 1;
1920 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1921 int64_t dimSize = region.memref.getType().cast<MemRefType>().getDimSize(d);
1922 stride *= dimSize;
1923 numEltPerStride *= bufferShape[d];
1924 // A stride is needed only if the region has a shorter extent than the
1925 // memref along the dimension *and* has an extent greater than one along the
1926 // next major dimension.
1927 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1928 strideInfos->push_back({stride, numEltPerStride});
1929 }
1930 }
1931 }
1932
1933 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
1934 /// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
1935 /// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
1936 /// bound information for the copy loop nest. `fastBufOffsets` contain the
1937 /// expressions to be subtracted out from the respective copy loop iterators in
1938 /// order to index the fast buffer. If `isCopyOut' is true, generates a copy-out;
1939 /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is
1940 /// inserted.
1941 ///
1942 /// The copy-in nest is generated as follows as an example for a 2-d region:
1943 /// for x = ...
1944 /// for y = ...
1945 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
1946 ///
1947 static AffineForOp
1948 generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
1949 ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
1950 ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
1951 ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
1952 OpBuilder b) {
1953 assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
1954 return lbMap.getNumInputs() == lbOperands.size();
1955 }));
1956 assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
1957 return ubMap.getNumInputs() == ubOperands.size();
1958 }));
1959
1960 unsigned rank = memref.getType().cast<MemRefType>().getRank();
1961 assert(lbMaps.size() == rank && "wrong number of lb maps");
1962 assert(ubMaps.size() == rank && "wrong number of ub maps");
1963
1964 SmallVector<Value, 4> memIndices;
1965 SmallVector<AffineExpr, 4> fastBufExprs;
1966 SmallVector<Value, 4> fastBufMapOperands;
1967 AffineForOp copyNestRoot;
1968 SmallVector<AffineApplyOp, 4> mayBeDeadApplys;
1969 for (unsigned d = 0; d < rank; ++d) {
1970 auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
1971 ubOperands, ubMaps[d]);
1972 if (d == 0)
1973 copyNestRoot = forOp;
1974
1975 b = OpBuilder::atBlockTerminator(forOp.getBody());
1976
1977 auto fastBufOffsetMap =
1978 AffineMap::get(lbOperands.size(), 0, fastBufOffsets[d]);
1979 auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
1980
1981 // Construct the subscript for the fast memref being copied into/from:
1982 // x - offset_x.
1983 fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1984 b.getAffineDimExpr(2 * d));
1985 fastBufMapOperands.push_back(offset);
1986 fastBufMapOperands.push_back(forOp.getInductionVar());
1987 mayBeDeadApplys.push_back(offset);
1988
1989 // Subscript for the slow memref being copied.
1990 memIndices.push_back(forOp.getInductionVar());
1991 }
1992
1993 auto fastBufMap =
1994 AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs, b.getContext());
1995 fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
1996 fastBufMap = simplifyAffineMap(fastBufMap);
1997 canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
1998
1999 // Drop any dead affine.applys.
2000 for (auto applyOp : mayBeDeadApplys)
2001 if (applyOp.use_empty())
2002 applyOp.erase();
2003
2004 if (!isCopyOut) {
2005 // Copy in.
2006 auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
2007 b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
2008 fastBufMapOperands);
2009 return copyNestRoot;
2010 }
2011
2012 // Copy out.
2013 auto load =
2014 b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
2015 b.create<AffineStoreOp>(loc, load, memref, memIndices);
2016 return copyNestRoot;
2017 }
2018
2019 static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
2020 emitRemarkForBlock(Block &block) {
2021 return block.getParentOp()->emitRemark();
2022 }
2023
2024 /// Creates a buffer in the faster memory space for the specified memref region;
2025 /// generates a copy from the lower memory space to this one, and replaces all
2026 /// loads/stores in the block range [`begin', `end') of `block' to load/store
2027 /// from that buffer. Returns failure if copies could not be generated due to
2028 /// yet unimplemented cases. `copyInPlacementStart` and `copyOutPlacementStart`
2029 /// in copyPlacementBlock specify the insertion points where the incoming copies
2030 /// and outgoing copies, respectively, should be inserted (the insertion happens
2031 /// right before the insertion point). Since `begin` can itself be invalidated
2032 /// due to the memref rewriting done from this method, the output argument
2033 /// `nBegin` is set to its replacement (set to `begin` if no invalidation
2034 /// happens). Since outgoing copies could have been inserted at `end`, the
2035 /// output argument `nEnd` is set to the new end. `sizeInBytes` is set to the
2036 /// size of the fast buffer allocated.
2037 static LogicalResult generateCopy(
2038 const MemRefRegion ®ion, Block *block, Block::iterator begin,
2039 Block::iterator end, Block *copyPlacementBlock,
2040 Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
2041 AffineCopyOptions copyOptions, DenseMap<Value, Value> &fastBufferMap,
2042 DenseSet<Operation *> ©Nests, uint64_t *sizeInBytes,
2043 Block::iterator *nBegin, Block::iterator *nEnd) {
2044 *nBegin = begin;
2045 *nEnd = end;
2046
2047 func::FuncOp f = begin->getParentOfType<func::FuncOp>();
2048 OpBuilder topBuilder(f.getBody());
2049 Value zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);
2050
2051 if (begin == end)
2052 return success();
2053
2054   // Is the copy-out point at the end of the block where we are doing
2055   // explicit copying?
2056 bool isCopyOutAtEndOfBlock = (end == copyOutPlacementStart);
2057
2058 // Copies for read regions are going to be inserted at 'begin'.
2059 OpBuilder prologue(copyPlacementBlock, copyInPlacementStart);
2060 // Copies for write regions are going to be inserted at 'end'.
2061 OpBuilder epilogue(copyPlacementBlock, copyOutPlacementStart);
2062 OpBuilder &b = region.isWrite() ? epilogue : prologue;
2063
2064 // Builder to create constants at the top level.
2065 auto func = copyPlacementBlock->getParent()->getParentOfType<func::FuncOp>();
2066 OpBuilder top(func.getBody());
2067
2068 auto loc = region.loc;
2069 auto memref = region.memref;
2070 auto memRefType = memref.getType().cast<MemRefType>();
2071
2072 if (!memRefType.getLayout().isIdentity()) {
2073 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
2074 return failure();
2075 }
2076
2077 // Indices to use for the copying.
2078 // Indices for the original memref being copied from/to.
2079 SmallVector<Value, 4> memIndices;
2080 // Indices for the faster buffer being copied into/from.
2081 SmallVector<Value, 4> bufIndices;
2082
2083 unsigned rank = memRefType.getRank();
2084 SmallVector<int64_t, 4> fastBufferShape;
2085
2086 // Compute the extents of the buffer.
2087 std::vector<SmallVector<int64_t, 4>> lbs;
2088 SmallVector<int64_t, 8> lbDivisors;
2089 lbs.reserve(rank);
2090 Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape(
2091 &fastBufferShape, &lbs, &lbDivisors);
2092 if (!numElements) {
2093 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2094 return failure();
2095 }
2096
2097 if (*numElements == 0) {
2098 LLVM_DEBUG(llvm::dbgs() << "Nothing to copy\n");
2099 *sizeInBytes = 0;
2100 return success();
2101 }
2102
2103 SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
2104 for (unsigned i = 0; i < rank; ++i)
2105 region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
2106
2107 const FlatAffineValueConstraints *cst = region.getConstraints();
2108 // 'regionSymbols' hold values that this memory region is symbolic/parametric
2109 // on; these typically include loop IVs surrounding the level at which the
2110 // copy generation is being done or other valid symbols in MLIR.
2111 SmallVector<Value, 8> regionSymbols;
2112 cst->getValues(rank, cst->getNumVars(), ®ionSymbols);
2113
2114 // Construct the index expressions for the fast memory buffer. The index
2115 // expression for a particular dimension of the fast buffer is obtained by
2116 // subtracting out the lower bound on the original memref's data region
2117 // along the corresponding dimension.
2118
2119 // Index start offsets for faster memory buffer relative to the original.
2120 SmallVector<AffineExpr, 4> fastBufOffsets;
2121 fastBufOffsets.reserve(rank);
2122 for (unsigned d = 0; d < rank; d++) {
2123 assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2124
2125 AffineExpr offset = top.getAffineConstantExpr(0);
2126 for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2127 offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
2128 assert(lbDivisors[d] > 0);
2129 offset =
2130 (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2131
2132 // Set copy start location for this dimension in the lower memory space
2133 // memref.
2134 if (auto caf = offset.dyn_cast<AffineConstantExpr>()) {
2135 auto indexVal = caf.getValue();
2136 if (indexVal == 0) {
2137 memIndices.push_back(zeroIndex);
2138 } else {
2139 memIndices.push_back(
2140 top.create<arith::ConstantIndexOp>(loc, indexVal).getResult());
2141 }
2142 } else {
2143 // The coordinate for the start location is just the lower bound along the
2144 // corresponding dimension on the memory region (stored in 'offset').
2145 auto map = AffineMap::get(
2146 cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
2147 memIndices.push_back(b.create<AffineApplyOp>(loc, map, regionSymbols));
2148 }
2149 // The fast buffer is copied into at location zero; addressing is relative.
2150 bufIndices.push_back(zeroIndex);
2151
2152 // Record the offsets since they are needed to remap the memory accesses of
2153 // the original memref further below.
2154 fastBufOffsets.push_back(offset);
2155 }
2156
2157 // The faster memory space buffer.
2158 Value fastMemRef;
2159
2160 // Check if a buffer was already created.
2161 bool existingBuf = fastBufferMap.count(memref) > 0;
2162 if (!existingBuf) {
2163 AffineMap fastBufferLayout = b.getMultiDimIdentityMap(rank);
2164 auto fastMemRefType =
2165 MemRefType::get(fastBufferShape, memRefType.getElementType(),
2166 fastBufferLayout, copyOptions.fastMemorySpace);
2167
2168 // Create the fast memory space buffer just before the 'affine.for'
2169 // operation.
2170 fastMemRef =
2171 prologue.create<memref::AllocOp>(loc, fastMemRefType).getResult();
2172 // Record it.
2173 fastBufferMap[memref] = fastMemRef;
2174 // fastMemRefType is a constant shaped memref.
2175 *sizeInBytes = *getMemRefSizeInBytes(fastMemRefType);
2176 LLVM_DEBUG(emitRemarkForBlock(*block)
2177 << "Creating fast buffer of type " << fastMemRefType
2178 << " and size " << llvm::divideCeil(*sizeInBytes, 1024)
2179 << " KiB\n");
2180 } else {
2181 // Reuse the one already created.
2182 fastMemRef = fastBufferMap[memref];
2183 *sizeInBytes = 0;
2184 }
2185
2186 auto numElementsSSA = top.create<arith::ConstantIndexOp>(loc, *numElements);
2187
2188 Value dmaStride = nullptr;
2189 Value numEltPerDmaStride = nullptr;
2190 if (copyOptions.generateDma) {
2191 SmallVector<StrideInfo, 4> dmaStrideInfos;
2192 getMultiLevelStrides(region, fastBufferShape, &dmaStrideInfos);
2193
2194 // TODO: use all stride levels once DmaStartOp is extended for
2195 // multi-level strides.
2196 if (dmaStrideInfos.size() > 1) {
2197 LLVM_DEBUG(llvm::dbgs() << "Only up to one level of stride supported\n");
2198 return failure();
2199 }
2200
2201 if (!dmaStrideInfos.empty()) {
2202 dmaStride =
2203 top.create<arith::ConstantIndexOp>(loc, dmaStrideInfos[0].stride);
2204 numEltPerDmaStride = top.create<arith::ConstantIndexOp>(
2205 loc, dmaStrideInfos[0].numEltPerStride);
2206 }
2207 }
2208
2209 // Record the last operation where we want the memref replacement to end. We
2210 // later do the memref replacement only in [begin, postDomFilter] so
2211   // that the original memrefs used in the data movement code themselves don't
2212 // get replaced.
2213 auto postDomFilter = std::prev(end);
2214
2215 // Create fully composed affine maps for each memref.
2216 auto memAffineMap = b.getMultiDimIdentityMap(memIndices.size());
2217 fullyComposeAffineMapAndOperands(&memAffineMap, &memIndices);
2218 auto bufAffineMap = b.getMultiDimIdentityMap(bufIndices.size());
2219 fullyComposeAffineMapAndOperands(&bufAffineMap, &bufIndices);
2220
2221 if (!copyOptions.generateDma) {
2222 // Point-wise copy generation.
2223 auto copyNest =
2224 generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
2225 /*lbOperands=*/regionSymbols, ubMaps,
2226 /*ubOperands=*/regionSymbols, fastBufOffsets,
2227 /*isCopyOut=*/region.isWrite(), b);
2228
2229 // Record this so that we can skip it from yet another copy.
2230 copyNests.insert(copyNest);
2231
2232     // Since new ops are being appended (for copy-outs), adjust the end to
2233 // mark end of block range being processed if necessary.
2234 if (region.isWrite() && isCopyOutAtEndOfBlock)
2235 *nEnd = Block::iterator(copyNest.getOperation());
2236 } else {
2237 // DMA generation.
2238 // Create a tag (single element 1-d memref) for the DMA.
2239 auto tagMemRefType = MemRefType::get({1}, top.getIntegerType(32), {},
2240 copyOptions.tagMemorySpace);
2241 auto tagMemRef = prologue.create<memref::AllocOp>(loc, tagMemRefType);
2242
2243 SmallVector<Value, 4> tagIndices({zeroIndex});
2244 auto tagAffineMap = b.getMultiDimIdentityMap(tagIndices.size());
2245 fullyComposeAffineMapAndOperands(&tagAffineMap, &tagIndices);
2246 if (!region.isWrite()) {
2247 // DMA non-blocking read from original buffer to fast buffer.
2248 b.create<AffineDmaStartOp>(loc, memref, memAffineMap, memIndices,
2249 fastMemRef, bufAffineMap, bufIndices,
2250 tagMemRef, tagAffineMap, tagIndices,
2251 numElementsSSA, dmaStride, numEltPerDmaStride);
2252 } else {
2253 // DMA non-blocking write from fast buffer to the original memref.
2254 auto op = b.create<AffineDmaStartOp>(
2255 loc, fastMemRef, bufAffineMap, bufIndices, memref, memAffineMap,
2256 memIndices, tagMemRef, tagAffineMap, tagIndices, numElementsSSA,
2257 dmaStride, numEltPerDmaStride);
2258 // Since new ops may be appended at 'end' (for outgoing DMAs), adjust the
2259 // end to mark end of block range being processed.
2260 if (isCopyOutAtEndOfBlock)
2261 *nEnd = Block::iterator(op.getOperation());
2262 }
2263
2264 // Matching DMA wait to block on completion; tag always has a 0 index.
2265 b.create<AffineDmaWaitOp>(loc, tagMemRef, tagAffineMap, zeroIndex,
2266 numElementsSSA);
2267
2268 // Generate dealloc for the tag.
2269 auto tagDeallocOp = epilogue.create<memref::DeallocOp>(loc, tagMemRef);
2270 if (*nEnd == end && isCopyOutAtEndOfBlock)
2271 // Since new ops are being appended (for outgoing DMAs), adjust the end to
2272 // mark end of range of the original.
2273 *nEnd = Block::iterator(tagDeallocOp.getOperation());
2274 }
2275
2276 // Generate dealloc for the buffer.
2277 if (!existingBuf) {
2278 auto bufDeallocOp = epilogue.create<memref::DeallocOp>(loc, fastMemRef);
2279 // When generating pointwise copies, `nEnd' has to be set to deallocOp on
2280 // the fast buffer (since it marks the new end insertion point).
2281 if (!copyOptions.generateDma && *nEnd == end && isCopyOutAtEndOfBlock)
2282 *nEnd = Block::iterator(bufDeallocOp.getOperation());
2283 }
2284
2285 // Replace all uses of the old memref with the faster one while remapping
2286 // access indices (subtracting out lower bound offsets for each dimension).
2287 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2288 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2289 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2290 // and (%iT, %jT) will be the 'extraOperands' for 'rep all memref uses with'.
2291 // d2, d3 correspond to the original indices (%i, %j).
2292 SmallVector<AffineExpr, 4> remapExprs;
2293 remapExprs.reserve(rank);
2294 for (unsigned i = 0; i < rank; i++) {
2295 // The starting operands of indexRemap will be regionSymbols (the symbols on
2296 // which the memref region is parametric); then those corresponding to
2297 // the memref's original indices follow.
2298 auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
2299 remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2300 }
2301 auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs,
2302 b.getContext());
2303
2304 // Record the begin since it may be invalidated by memref replacement.
2305 Block::iterator prevOfBegin;
2306 bool isBeginAtStartOfBlock = (begin == block->begin());
2307 if (!isBeginAtStartOfBlock)
2308 prevOfBegin = std::prev(begin);
2309
2310 // *Only* those uses within the range [begin, end) of 'block' are replaced.
2311 (void)replaceAllMemRefUsesWith(memref, fastMemRef,
2312 /*extraIndices=*/{}, indexRemap,
2313 /*extraOperands=*/regionSymbols,
2314 /*symbolOperands=*/{},
2315 /*domOpFilter=*/&*begin,
2316 /*postDomOpFilter=*/&*postDomFilter);
2317
2318 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2319
2320 return success();
2321 }
2322
2323 /// Constructs the memref region to include the entire memref. Returns false
2324 /// for dynamically shaped memrefs for now. `numParamLoopIVs` is the number of
2325 /// enclosing loop IVs of `op` (starting from the outermost) that the region
2326 /// is parametric on.
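/// As an illustrative sketch, for an access to a memref<64x128xf32> this adds
/// the constraints 0 <= d0 <= 63 and 0 <= d1 <= 127, i.e., the region covers
/// the entire memref.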
2327 static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
2328 MemRefRegion *region) {
2329 unsigned rank;
2330 if (auto loadOp = dyn_cast<AffineLoadOp>(op)) {
2331 rank = loadOp.getMemRefType().getRank();
2332 region->memref = loadOp.getMemRef();
2333 region->setWrite(false);
2334 } else if (auto storeOp = dyn_cast<AffineStoreOp>(op)) {
2335 rank = storeOp.getMemRefType().getRank();
2336 region->memref = storeOp.getMemRef();
2337 region->setWrite(true);
2338 } else {
2339 assert(false && "expected load or store op");
2340 return false;
2341 }
2342 auto memRefType = region->memref.getType().cast<MemRefType>();
2343 if (!memRefType.hasStaticShape())
2344 return false;
2345
2346 auto *regionCst = region->getConstraints();
2347
2348   // Just get the first `numParamLoopIVs` IVs, which the memref region is
2349   // parametric on.
2350 SmallVector<AffineForOp, 4> ivs;
2351 getLoopIVs(*op, &ivs);
2352 ivs.resize(numParamLoopIVs);
2353 SmallVector<Value, 4> symbols;
2354 extractForInductionVars(ivs, &symbols);
2355 regionCst->reset(rank, numParamLoopIVs, 0);
2356 regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2357
2358 // Memref dim sizes provide the bounds.
2359 for (unsigned d = 0; d < rank; d++) {
2360 auto dimSize = memRefType.getDimSize(d);
2361 assert(dimSize > 0 && "filtered dynamic shapes above");
2362 regionCst->addBound(IntegerPolyhedron::LB, d, 0);
2363 regionCst->addBound(IntegerPolyhedron::UB, d, dimSize - 1);
2364 }
2365 return true;
2366 }
2367
2368 LogicalResult mlir::affineDataCopyGenerate(Block::iterator begin,
2369 Block::iterator end,
2370 const AffineCopyOptions ©Options,
2371 Optional<Value> filterMemRef,
2372 DenseSet<Operation *> ©Nests) {
2373 if (begin == end)
2374 return success();
2375
2376 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2377 "Inconsistent block begin/end args");
2378 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2379
2380 Block *block = begin->getBlock();
2381
2382 // Copies will be generated for this depth, i.e., symbolic in all loops
2383   // surrounding this block range.
2384 unsigned copyDepth = getNestingDepth(&*begin);
2385
2386 LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2387 << "\n");
2388 LLVM_DEBUG(llvm::dbgs() << "from begin: " << *begin << "\n");
2389 LLVM_DEBUG(llvm::dbgs() << "to inclusive end: " << *std::prev(end) << "\n");
2390
2391 // List of memory regions to copy for. We need a map vector to have a
2392 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2393   // since the allocs, for example, are identical except for the SSA id.
2394 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> readRegions;
2395 SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4> writeRegions;
2396
2397 // Map from original memref's to the fast buffers that their accesses are
2398 // replaced with.
2399 DenseMap<Value, Value> fastBufferMap;
2400
2401 // To check for errors when walking the block.
2402 bool error = false;
2403
2404 // Walk this range of operations to gather all memory regions.
2405 block->walk(begin, end, [&](Operation *opInst) {
2406 // Gather regions to allocate to buffers in faster memory space.
2407 if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) {
2408 if ((filterMemRef.has_value() && filterMemRef != loadOp.getMemRef()) ||
2409 (loadOp.getMemRefType().getMemorySpaceAsInt() !=
2410 copyOptions.slowMemorySpace))
2411 return;
2412 } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) {
2413 if ((filterMemRef.has_value() && filterMemRef != storeOp.getMemRef()) ||
2414 storeOp.getMemRefType().getMemorySpaceAsInt() !=
2415 copyOptions.slowMemorySpace)
2416 return;
2417 } else {
2418       // Neither a load nor a store op.
2419 return;
2420 }
2421
2422 // Compute the MemRefRegion accessed.
2423 auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2424 if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2425 /*addMemRefDimBounds=*/false))) {
2426 LLVM_DEBUG(llvm::dbgs()
2427 << "Error obtaining memory region: semi-affine maps?\n");
2428 LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2429 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2430 LLVM_DEBUG(
2431 opInst->emitError("non-constant memref sizes not yet supported"));
2432 error = true;
2433 return;
2434 }
2435 }
2436
2437 // Each memref has a single buffer associated with it irrespective of how
2438     // many loads and stores happen on it.
2439 // TODO: in the future, when regions don't intersect and satisfy
2440 // other properties (based on load/store regions), we could consider
2441 // multiple buffers per memref.
2442
2443 // Add to the appropriate region if it's not already in it, or take a
2444 // bounding box union with the existing one if it's already in there.
2445 // Note that a memref may have both read and write regions - so update the
2446 // region in the other list if one exists (write in case of read and vice
2447 // versa) since there is a single bounding box for a memref across all reads
2448 // and writes that happen on it.
2449
2450 // Attempts to update; returns true if 'region' exists in targetRegions.
2451 auto updateRegion =
2452 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2453 &targetRegions) {
2454 const auto *const it = targetRegions.find(region->memref);
2455 if (it == targetRegions.end())
2456 return false;
2457
2458 // Perform a union with the existing region.
2459 if (failed(it->second->unionBoundingBox(*region))) {
2460 LLVM_DEBUG(llvm::dbgs()
2461 << "Memory region bounding box failed; "
2462 "over-approximating to the entire memref\n");
2463 // If the union fails, we will overapproximate.
2464 if (!getFullMemRefAsRegion(opInst, copyDepth, region.get())) {
2465 LLVM_DEBUG(opInst->emitError(
2466 "non-constant memref sizes not yet supported"));
2467 error = true;
2468 return true;
2469 }
2470 it->second->getConstraints()->clearAndCopyFrom(
2471 *region->getConstraints());
2472 } else {
2473 // Union was computed and stored in 'it->second': copy to 'region'.
2474 region->getConstraints()->clearAndCopyFrom(
2475 *it->second->getConstraints());
2476 }
2477 return true;
2478 };
2479
2480 bool existsInRead = updateRegion(readRegions);
2481 if (error)
2482 return;
2483 bool existsInWrite = updateRegion(writeRegions);
2484 if (error)
2485 return;
2486
2487 // Finally add it to the region list.
2488 if (region->isWrite() && !existsInWrite) {
2489 writeRegions[region->memref] = std::move(region);
2490 } else if (!region->isWrite() && !existsInRead) {
2491 readRegions[region->memref] = std::move(region);
2492 }
2493 });
2494
2495 if (error) {
2496 LLVM_DEBUG(begin->emitError(
2497 "copy generation failed for one or more memref's in this block\n"));
2498 return failure();
2499 }
2500
2501 uint64_t totalCopyBuffersSizeInBytes = 0;
2502 bool ret = true;
2503 auto processRegions =
2504 [&](const SmallMapVector<Value, std::unique_ptr<MemRefRegion>, 4>
2505 ®ions) {
2506 for (const auto ®ionEntry : regions) {
2507 // For each region, hoist copy in/out past all hoistable
2508 // 'affine.for's.
2509 Block::iterator copyInPlacementStart, copyOutPlacementStart;
2510 Block *copyPlacementBlock;
2511 findHighestBlockForPlacement(
2512 *regionEntry.second, *block, begin, end, ©PlacementBlock,
2513 ©InPlacementStart, ©OutPlacementStart);
2514
2515 uint64_t sizeInBytes;
2516 Block::iterator nBegin, nEnd;
2517 LogicalResult iRet = generateCopy(
2518 *regionEntry.second, block, begin, end, copyPlacementBlock,
2519 copyInPlacementStart, copyOutPlacementStart, copyOptions,
2520 fastBufferMap, copyNests, &sizeInBytes, &nBegin, &nEnd);
2521 if (succeeded(iRet)) {
2522 // begin/end could have been invalidated, and need update.
2523 begin = nBegin;
2524 end = nEnd;
2525 totalCopyBuffersSizeInBytes += sizeInBytes;
2526 }
2527 ret = ret & succeeded(iRet);
2528 }
2529 };
2530 processRegions(readRegions);
2531 processRegions(writeRegions);
2532
2533 if (!ret) {
2534 LLVM_DEBUG(begin->emitError(
2535 "copy generation failed for one or more memref's in this block\n"));
2536 return failure();
2537 }
2538
2539 // For a range of operations, a note will be emitted at the caller.
2540 AffineForOp forOp;
2541 if (llvm::DebugFlag && (forOp = dyn_cast<AffineForOp>(&*begin))) {
2542 LLVM_DEBUG(forOp.emitRemark()
2543 << llvm::divideCeil(totalCopyBuffersSizeInBytes, 1024)
2544 << " KiB of copy buffers in fast memory space for this block\n");
2545 }
2546
2547 if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
2548     StringRef str = "Total size of all copy buffers for this block "
2549 "exceeds fast memory capacity\n";
2550 block->getParentOp()->emitWarning(str);
2551 }
2552
2553 return success();
2554 }
2555
2556 // A convenience version of affineDataCopyGenerate for all ops in the body of
2557 // an AffineForOp.
2558 LogicalResult mlir::affineDataCopyGenerate(AffineForOp forOp,
2559 const AffineCopyOptions ©Options,
2560 Optional<Value> filterMemRef,
2561 DenseSet<Operation *> ©Nests) {
2562 return affineDataCopyGenerate(forOp.getBody()->begin(),
2563 std::prev(forOp.getBody()->end()), copyOptions,
2564 filterMemRef, copyNests);
2565 }
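// Illustrative usage sketch (not part of the implementation; `forOp` and
// `copyOptions` are assumed to be supplied by the caller):
//   DenseSet<Operation *> copyNests;
//   if (failed(affineDataCopyGenerate(forOp, copyOptions,
//                                     /*filterMemRef=*/llvm::None, copyNests)))
//     ...; // copy generation could not be completed for this loop body.
// The generated copy nests, if any, are collected in `copyNests` for whatever
// follow-up processing the caller needs.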
2566
2567 LogicalResult mlir::generateCopyForMemRegion(
2568 const MemRefRegion &memrefRegion, Operation *analyzedOp,
2569 const AffineCopyOptions ©Options, CopyGenerateResult &result) {
2570 Block *block = analyzedOp->getBlock();
2571 auto begin = analyzedOp->getIterator();
2572 auto end = std::next(begin);
2573 DenseMap<Value, Value> fastBufferMap;
2574 DenseSet<Operation *> copyNests;
2575
2576 auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
2577 copyOptions, fastBufferMap, copyNests,
2578 &result.sizeInBytes, &begin, &end);
2579 if (failed(err))
2580 return err;
2581
2582 const auto &en = fastBufferMap.find(memrefRegion.memref);
2583 // In some cases (empty loops), no copy generation would have happened.
2584 if (en == fastBufferMap.end())
2585 return failure();
2586 result.alloc = en->second.getDefiningOp();
2587 assert(result.alloc && "fast buffer expected to be locally allocated");
2588 assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
2589 result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
2590 return success();
2591 }
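// Illustrative usage sketch (assumptions: `memrefRegion` is a MemRefRegion
// already computed for `analyzedOp`, and `copyOptions` is caller-provided):
//   CopyGenerateResult result;
//   if (succeeded(generateCopyForMemRegion(memrefRegion, analyzedOp,
//                                          copyOptions, result))) {
//     // result.alloc defines the fast buffer, result.copyNest is the root of
//     // the generated copy nest (possibly null), and result.sizeInBytes is
//     // the size of the fast buffer.
//   }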
2592
2593 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
2594 static void
2595 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
2596 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2597 // Add a new empty level to the output if it doesn't already exist.
2598 assert(currLoopDepth <= depthToLoops.size() && "Unexpected currLoopDepth");
2599 if (currLoopDepth == depthToLoops.size())
2600 depthToLoops.emplace_back();
2601
2602 for (auto &op : *block) {
2603 if (auto forOp = dyn_cast<AffineForOp>(op)) {
2604 depthToLoops[currLoopDepth].push_back(forOp);
2605 gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops);
2606 }
2607 }
2608 }
2609
2610 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2611 void mlir::gatherLoops(func::FuncOp func,
2612 std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) {
2613 for (auto &block : func)
2614 gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops);
2615
2616 // Remove last loop level from output since it's empty.
2617 if (!depthToLoops.empty()) {
2618 assert(depthToLoops.back().empty() && "Last loop level is not empty?");
2619 depthToLoops.pop_back();
2620 }
2621 }
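// Illustrative usage sketch (assuming `func` is a func::FuncOp):
//   std::vector<SmallVector<AffineForOp, 2>> depthToLoops;
//   gatherLoops(func, depthToLoops);
//   // depthToLoops[0] now holds the outermost affine.for ops in `func`,
//   // depthToLoops[1] those nested one level deeper, and so on.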
2622
2623 // TODO: if necessary, this can be extended to also compose in any
2624 // affine.applys, fold to constant if all result dimensions of the map are
2625 // constant (canonicalizeMapAndOperands below already does this for single
2626 // result bound maps), and use simplifyMap to perform algebraic simplification.
2627 AffineForOp mlir::createCanonicalizedAffineForOp(
2628 OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
2629 ValueRange ubOperands, AffineMap ubMap, int64_t step) {
2630 SmallVector<Value, 4> lowerOperands(lbOperands);
2631 SmallVector<Value, 4> upperOperands(ubOperands);
2632
2633 fullyComposeAffineMapAndOperands(&lbMap, &lowerOperands);
2634 canonicalizeMapAndOperands(&lbMap, &lowerOperands);
2635 lbMap = removeDuplicateExprs(lbMap);
2636 fullyComposeAffineMapAndOperands(&ubMap, &upperOperands);
2637 canonicalizeMapAndOperands(&ubMap, &upperOperands);
2638 ubMap = removeDuplicateExprs(ubMap);
2639
2640 return b.create<AffineForOp>(loc, lowerOperands, lbMap, upperOperands, ubMap,
2641 step);
2642 }
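// Illustrative usage sketch (assumptions: `b` and `loc` are a caller-provided
// OpBuilder and Location; `lbVmap`/`ubVmap` are AffineValueMaps describing the
// desired bounds, as in createFullTiles below):
//   AffineForOp loop = createCanonicalizedAffineForOp(
//       b, loc, lbVmap.getOperands(), lbVmap.getAffineMap(),
//       ubVmap.getOperands(), ubVmap.getAffineMap(), /*step=*/1);
// The bound maps and operands are composed, canonicalized, and de-duplicated
// before the affine.for is created, so callers need not pre-simplify them.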
2643
2644 /// Creates an AffineIfOp that encodes the conditional to choose between
2645 /// the constant trip count version and an unknown trip count version of this
2646 /// nest of loops. This is used to separate partial and full tiles if `loops`
2647 /// has the intra-tile loops. The affine.if op is inserted at the builder
2648 /// insertion point of `b`.
2649 static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
2650 OpBuilder b) {
2651 if (loops.empty())
2652 return nullptr;
2653
2654 auto *context = loops[0].getContext();
2655
2656 FlatAffineValueConstraints cst;
2657 SmallVector<Operation *, 8> ops;
2658 llvm::append_range(ops, loops);
2659 (void)getIndexSet(ops, &cst);
2660
2661 // Remove constraints that are independent of these loop IVs.
2662 cst.removeIndependentConstraints(/*pos=*/0, /*num=*/loops.size());
2663
2664 // Construct the constraint set representing the guard for full tiles. The
2665 // lower bound (and upper bound) corresponding to the full tile should be
2666 // larger (and resp. smaller) than any other lower (or upper bound).
2667 SmallVector<int64_t, 8> fullTileLb, fullTileUb;
2668 for (auto loop : loops) {
2669 (void)loop;
2670 // TODO: straightforward to generalize to a non-unit stride.
2671 assert(loop.getStep() == 1 && "point loop step expected to be one");
2672 // Mark everything as symbols for the purpose of finding a constant diff pair.
2673 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2674 1);
2675 unsigned fullTileLbPos, fullTileUbPos;
2676 if (!cst.getConstantBoundOnDimSize(0, /*lb=*/nullptr,
2677 /*boundFloorDivisor=*/nullptr,
2678 /*ub=*/nullptr, &fullTileLbPos,
2679 &fullTileUbPos)) {
2680 LLVM_DEBUG(llvm::dbgs() << "Can't get constant diff pair for a loop\n");
2681 return nullptr;
2682 }
2683
2684 SmallVector<unsigned, 4> lbIndices, ubIndices;
2685 cst.getLowerAndUpperBoundIndices(/*pos=*/0, &lbIndices, &ubIndices);
2686
2687 auto fLb = cst.getInequality(fullTileLbPos);
2688 auto fUb = cst.getInequality(fullTileUbPos);
2689 fullTileLb.assign(fLb.begin(), fLb.end());
2690 fullTileUb.assign(fUb.begin(), fUb.end());
2691
2692 // Full tile lower bound should be >= any other lower bound.
2693 for (auto lbIndex : lbIndices)
2694 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2695 cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2696
2697 // Full tile upper bound should be <= any other upper bound.
2698 for (auto ubIndex : ubIndices)
2699 for (unsigned i = 0, e = cst.getNumCols(); i < e; ++i)
2700 cst.atIneq(ubIndex, i) -= fullTileUb[i];
2701
2702 cst.removeVar(0);
2703 }
2704
2705 // The previous step leads to all zeros for the full tile lb and ub position
2706 // itself; remove those and any other duplicates / trivial redundancies.
2707 cst.removeTrivialRedundancy();
2708
2709 // Turn everything into dims conservatively since we earlier turned all
2710 // trailing ids past point loop IV into symbols. Some of these could be outer
2711 // loop IVs; we'll canonicalize anyway.
2712 cst.setDimSymbolSeparation(0);
2713
2714 IntegerSet ifCondSet = cst.getAsIntegerSet(context);
2715 // ifCondSet can be null if cst was empty -- this can happen if all loops
2716 // in the nest have constant trip counts.
2717 if (!ifCondSet)
2718 return nullptr;
2719
2720 SmallVector<Value, 4> setOperands;
2721 cst.getValues(0, cst.getNumDimAndSymbolVars(), &setOperands);
2722 canonicalizeSetAndOperands(&ifCondSet, &setOperands);
2723 return b.create<AffineIfOp>(loops[0].getLoc(), ifCondSet, setOperands,
2724 /*withElseRegion=*/true);
2725 }
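// Conceptually, the op created here and later populated by separateFullTiles
// has the following shape (a hedged sketch; the actual integer set and its
// operands depend on the loop bounds, and #separation_set is a placeholder
// name):
//   affine.if #separation_set(%outer_iv, ..., %sym, ...) {
//     // full-tile version: constant trip count nest.
//   } else {
//     // partial-tile version: the original nest.
//   }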
2726
2727 /// Create the full tile loop nest (along with its body).
2728 static LogicalResult
2729 createFullTiles(MutableArrayRef<AffineForOp> inputNest,
2730 SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) {
2731 fullTileLoops.reserve(inputNest.size());
2732
2733 // For each loop in the original nest, identify a lower/upper bound pair such
2734 // that their difference is a constant.
2735 FlatAffineValueConstraints cst;
2736 for (auto loop : inputNest) {
2737 // TODO: straightforward to generalize to a non-unit stride.
2738 if (loop.getStep() != 1) {
2739 LLVM_DEBUG(llvm::dbgs()
2740 << "[tile separation] non-unit stride not implemented\n");
2741 return failure();
2742 }
2743 SmallVector<Operation *, 1> loopOp{loop.getOperation()};
2744 (void)getIndexSet(loopOp, &cst);
2745 // We will mark everything other than this loop IV as symbols for getting a
2746 // pair of <lb, ub> with a constant difference.
2747 cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2748 unsigned lbPos, ubPos;
2749 if (!cst.getConstantBoundOnDimSize(/*pos=*/0, /*lb=*/nullptr,
2750 /*lbDivisor=*/nullptr, /*ub=*/nullptr,
2751 &lbPos, &ubPos) ||
2752 lbPos == ubPos) {
2753 LLVM_DEBUG(llvm::dbgs() << "[tile separation] Can't get constant diff / "
2754 "equalities not yet handled\n");
2755 return failure();
2756 }
2757
2758 // Set all variables as dimensions uniformly since some of those marked as
2759 // symbols above could be outer loop IVs (corresponding tile space IVs).
2760 cst.setDimSymbolSeparation(/*newSymbolCount=*/0);
2761
2762 AffineValueMap lbVmap, ubVmap;
2763 cst.getIneqAsAffineValueMap(/*pos=*/0, lbPos, lbVmap, b.getContext());
2764 cst.getIneqAsAffineValueMap(/*pos=*/0, ubPos, ubVmap, b.getContext());
2765 AffineForOp fullTileLoop = createCanonicalizedAffineForOp(
2766 b, loop.getLoc(), lbVmap.getOperands(), lbVmap.getAffineMap(),
2767 ubVmap.getOperands(), ubVmap.getAffineMap());
2768 b = OpBuilder::atBlockTerminator(fullTileLoop.getBody());
2769 fullTileLoops.push_back(fullTileLoop);
2770 }
2771
2772 // Add the body for the full tile loop nest.
2773 BlockAndValueMapping operandMap;
2774 for (const auto &loopEn : llvm::enumerate(inputNest))
2775 operandMap.map(loopEn.value().getInductionVar(),
2776 fullTileLoops[loopEn.index()].getInductionVar());
2777 b = OpBuilder::atBlockTerminator(fullTileLoops.back().getBody());
2778 for (auto &op : inputNest.back().getBody()->without_terminator())
2779 b.clone(op, operandMap);
2780 return success();
2781 }
2782
2783 LogicalResult
2784 mlir::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
2785 SmallVectorImpl<AffineForOp> *fullTileNest) {
2786 if (inputNest.empty())
2787 return success();
2788
2789 auto firstLoop = inputNest[0];
2790
2791 // Each successive for op has to be nested in the other.
2792 auto prevLoop = firstLoop;
2793 for (auto loop : inputNest.drop_front(1)) {
2794 assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2795 prevLoop = loop;
2796 }
2797
2798 // Create the full tile loop nest.
2799 SmallVector<AffineForOp, 4> fullTileLoops;
2800 OpBuilder b(firstLoop);
2801 if (failed(createFullTiles(inputNest, fullTileLoops, b))) {
2802 if (!fullTileLoops.empty())
2803 fullTileLoops.front().erase();
2804 return failure();
2805 }
2806
2807 // Create and insert the version select right before the root of the nest.
2808 b = OpBuilder(firstLoop);
2809 AffineIfOp ifOp = createSeparationCondition(inputNest, b);
2810 if (!ifOp) {
2811 fullTileLoops.front().erase();
2812 LLVM_DEBUG(llvm::dbgs() << "All tiles are full tiles, or failure creating "
2813 "separation condition\n");
2814 return failure();
2815 }
2816
2817 // Move the full tile into the then block.
2818 Block *thenBlock = ifOp.getThenBlock();
2819 AffineForOp outermostFullTileLoop = fullTileLoops[0];
2820 thenBlock->getOperations().splice(
2821 std::prev(thenBlock->end()),
2822 outermostFullTileLoop->getBlock()->getOperations(),
2823 Block::iterator(outermostFullTileLoop));
2824
2825 // Move the partial tile into the else block. The partial tile is the same as
2826 // the original loop nest.
2827 Block *elseBlock = ifOp.getElseBlock();
2828 elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2829 firstLoop->getBlock()->getOperations(),
2830 Block::iterator(firstLoop));
2831
2832 if (fullTileNest)
2833 *fullTileNest = std::move(fullTileLoops);
2834
2835 return success();
2836 }
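// Illustrative usage sketch (assumptions: `intraTileLoops` is a contiguously
// nested band of intra-tile loops, e.g. produced by an earlier tiling step,
// held in a SmallVector<AffineForOp, 4>):
//   SmallVector<AffineForOp, 4> fullTileNest;
//   if (succeeded(separateFullTiles(intraTileLoops, &fullTileNest))) {
//     // The band is now guarded by an affine.if: its "then" region holds the
//     // full-tile copy (also returned via `fullTileNest`), and its "else"
//     // region holds the original (partial-tile) nest.
//   }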
2837