//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses, as
// well as reduced use of hardware prefetch streams and reduced false sharing.
// It is also necessary for hardware with explicitly managed levels in the
// memory hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//
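
// For example (a schematic sketch, not actual pass output; memory space 0 is
// assumed slow, space 1 fast, and the loop body and the definition of %A are
// elided):
//
//   affine.for %i = 0 to 256 {
//     %v = affine.load %A[%i] : memref<256xf32>
//     ...
//   }
//
// is rewritten, with point-wise copies, to roughly:
//
//   %fastA = memref.alloc() : memref<256xf32, 1>
//   <copy loop nest from %A into %fastA>
//   affine.for %i = 0 to 256 {
//     %v = affine.load %fastA[%i] : memref<256xf32, 1>
//     ...
//   }
//   memref.dealloc %fastA : memref<256xf32, 1>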

#include "PassDetail.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses the nesting structure, recursing into inner
/// levels if necessary to determine at what depth copies need to be placed so
/// that the allocated buffers fit within the memory capacity provided.
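/// The pass can also be exercised via mlir-opt; the following invocation is
/// an illustrative sketch (the option values are arbitrary):
///   mlir-opt input.mlir \
///     -affine-data-copy-generate="generate-dma=false fast-mem-capacity=1024"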
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public AffineDataCopyGenerationBase<AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
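    // The 'fastMemoryCapacity' pass option is expressed in KiB, so convert
    // the byte capacity given here (it is converted back to bytes in
    // runOnBlock).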
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's handled for now.
/// TODO: extend this to store op's.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass(unsigned slowMemorySpace,
                                         unsigned fastMemorySpace,
                                         unsigned tagMemorySpace,
                                         int minDmaTransferSize,
                                         uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
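
// A typical way to schedule this pass from C++ (an illustrative sketch; the
// argument values below are arbitrary):
//
//   PassManager pm(&context);
//   pm.addNestedPass<func::FuncOp>(createAffineDataCopyGenerationPass(
//       /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/2,
//       /*minDmaTransferSize=*/1024, /*fastMemCapacityBytes=*/32 * 1024));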

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine for op's nested within).
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

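  // A capacity left at the uint64_t max sentinel (the option's default) means
  // "unlimited"; otherwise the option value is in KiB and is converted to
  // bytes here.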
  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO: An 'affine.if' operation is currently treated like any other
  // operation. 'affine.if''s could have 'affine.for's in them;
  // treat them separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we will split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/llvm::None, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        Optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(footprint.value()) >
                    fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which footprint fits in fast mem capacity. If
      // the footprint can't be calculated, we assume for now it fits. Recurse
      // inside if footprint for 'forOp' exceeds capacity, or when
      // skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope, so we don't update consumed
        // capacity here. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/llvm::None, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/llvm::None, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained load's/store's, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
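  // Strict mode keeps the rewrite driver focused on the collected copy-nest
  // load/store ops rather than revisiting the entire function.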
  (void)applyOpPatternsAndFold(copyOps, frozenPatterns, /*strict=*/true);
}