1 //===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a pass to automatically promote accessed memref regions
10 // to buffers in a faster memory space that is explicitly managed, with the
11 // necessary data movement operations performed through either regular
12 // point-wise load/store's or DMAs. Such explicit copying (also referred to as
13 // array packing/unpacking in the literature), when done on arrays that exhibit
14 // reuse, results in near elimination of conflict misses, TLB misses, reduced
15 // use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware that explicitly manages levels in the memory
17 // hierarchy, and where DMAs may have to be used. This optimization is often
18 // performed on already tiled code.
19 //
20 //===----------------------------------------------------------------------===//
21 
22 #include "mlir/Analysis/Utils.h"
23 #include "mlir/Dialect/Affine/IR/AffineOps.h"
24 #include "mlir/Dialect/StandardOps/IR/Ops.h"
25 #include "mlir/Dialect/Affine/Passes.h"
26 #include "mlir/Pass/Pass.h"
27 #include "mlir/Transforms/LoopUtils.h"
28 #include "llvm/ADT/MapVector.h"
29 #include "llvm/Support/CommandLine.h"
30 #include "llvm/Support/Debug.h"
31 #include <algorithm>
32 
33 #define DEBUG_TYPE "affine-data-copy-generate"
34 
35 using namespace mlir;
36 
// Category grouping all command-line options of this pass under one heading
// in `-help` output.
static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");

// Fast memory capacity in KiB. When not provided on the command line, the
// pass constructor treats capacity as unlimited (see the default argument of
// AffineDataCopyGeneration's constructor below).
static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
    "affine-data-copy-generate-fast-mem-capacity",
    llvm::cl::desc(
        "Set fast memory space capacity in KiB (default: unlimited)"),
    llvm::cl::cat(clOptionsCategory));

// Whether to generate DMA operations for the copies (default) instead of
// point-wise load/store copy loops.
static llvm::cl::opt<bool>
    clDma("affine-data-copy-generate-dma",
          llvm::cl::desc("Generate DMA instead of point-wise copy"),
          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));

// Numeric identifier of the fast memory space that generated buffers are
// placed in.
static llvm::cl::opt<unsigned> clFastMemorySpace(
    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
    llvm::cl::desc(
        "Fast memory space identifier for copy generation (default: 1)"),
    llvm::cl::cat(clOptionsCategory));

// Testing-only knob: when set, loops with non-unit step are skipped as copy
// placement depths and copies are placed at an inner level instead.
static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
    llvm::cl::init(false),
    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
                   "for copy placement"),
    llvm::cl::cat(clOptionsCategory));
62 
namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO(bondhugula): We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public FunctionPass<AffineDataCopyGeneration> {
/// Include the generated pass utilities.
#define GEN_PASS_AffineDataCopyGeneration
#include "mlir/Dialect/Affine/Passes.h.inc"

  // Note: the default arguments read the file-static cl::opt globals above,
  // so command-line values are captured at pass-construction time.
  explicit AffineDataCopyGeneration(
      unsigned slowMemorySpace = 0,
      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
      int minDmaTransferSize = 1024,
      uint64_t fastMemCapacityBytes =
          (clFastMemoryCapacity.getNumOccurrences() > 0
               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
               : std::numeric_limits<uint64_t>::max()),
      bool generateDma = clDma,
      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}

  // Copy constructor: the pass infrastructure clones passes; copy every
  // configuration member (zeroIndex is intentionally not copied — it is
  // re-created per function in runOnFunction).
  explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
      : slowMemorySpace(other.slowMemorySpace),
        fastMemorySpace(other.fastMemorySpace),
        tagMemorySpace(other.tagMemorySpace),
        minDmaTransferSize(other.minDmaTransferSize),
        fastMemCapacityBytes(other.fastMemCapacityBytes),
        generateDma(other.generateDma),
        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}

  void runOnFunction() override;
  LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Slow memory space associated with copies.
  const unsigned slowMemorySpace;
  // Fast memory space associated with copies.
  unsigned fastMemorySpace;
  // Memory space associated with DMA tags.
  unsigned tagMemorySpace;
  // Minimum DMA transfer size supported by the target in bytes.
  const int minDmaTransferSize;
  // Capacity of the faster memory space.
  uint64_t fastMemCapacityBytes;

  // If set, generate DMA operations instead of read/write.
  bool generateDma;

  // If set, ignore loops with steps other than 1.
  bool skipNonUnitStrideLoops;

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // end anonymous namespace
131 
/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Both affine load and store op's are handled (see
/// AffineDataCopyGeneration::runOnBlock).
std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
/// Overload taking no arguments: all options default from the command-line
/// flags (or their built-in defaults) via the pass constructor.
std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
146 
147 /// Generate copies for this block. The block is partitioned into separate
148 /// ranges: each range is either a sequence of one or more operations starting
149 /// and ending with an affine load or store op, or just an affine.forop (which
150 /// could have other affine for op's nested within).
151 LogicalResult
152 AffineDataCopyGeneration::runOnBlock(Block *block,
153                                      DenseSet<Operation *> &copyNests) {
154   if (block->empty())
155     return success();
156 
157   AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
158                                    fastMemorySpace, tagMemorySpace,
159                                    fastMemCapacityBytes};
160 
161   // Every affine.forop in the block starts and ends a block range for copying;
162   // in addition, a contiguous sequence of operations starting with a
163   // load/store op but not including any copy nests themselves is also
164   // identified as a copy block range. Straightline code (a contiguous chunk of
165   // operations excluding AffineForOp's) are always assumed to not exhaust
166   // memory. As a result, this approach is conservative in some cases at the
167   // moment; we do a check later and report an error with location info.
168   // TODO(bondhugula): An 'affine.if' operation is being treated similar to an
169   // operation. 'affine.if''s could have 'affine.for's in them;
170   // treat them separately.
171 
172   // Get to the first load, store, or for op (that is not a copy nest itself).
173   auto curBegin =
174       std::find_if(block->begin(), block->end(), [&](Operation &op) {
175         return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
176                 isa<AffineForOp>(op)) &&
177                copyNests.count(&op) == 0;
178       });
179 
180   // Create [begin, end) ranges.
181   auto it = curBegin;
182   while (it != block->end()) {
183     AffineForOp forOp;
184     // If you hit a non-copy for loop, we will split there.
185     if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
186       // Perform the copying up unti this 'for' op first.
187       affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
188                              /*filterMemRef=*/llvm::None, copyNests);
189 
190       // Returns true if the footprint is known to exceed capacity.
191       auto exceedsCapacity = [&](AffineForOp forOp) {
192         Optional<int64_t> footprint =
193             getMemoryFootprintBytes(forOp,
194                                     /*memorySpace=*/0);
195         return (footprint.hasValue() &&
196                 static_cast<uint64_t>(footprint.getValue()) >
197                     fastMemCapacityBytes);
198       };
199 
200       // If the memory footprint of the 'affine.for' loop is higher than fast
201       // memory capacity (when provided), we recurse to copy at an inner level
202       // until we find a depth at which footprint fits in fast mem capacity. If
203       // the footprint can't be calculated, we assume for now it fits. Recurse
204       // inside if footprint for 'forOp' exceeds capacity, or when
205       // skipNonUnitStrideLoops is set and the step size is not one.
206       bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
207                                                  : exceedsCapacity(forOp);
208       if (recurseInner) {
209         // We'll recurse and do the copies at an inner level for 'forInst'.
210         // Recurse onto the body of this loop.
211         runOnBlock(forOp.getBody(), copyNests);
212       } else {
213         // We have enough capacity, i.e., copies will be computed for the
214         // portion of the block until 'it', and for 'it', which is 'forOp'. Note
215         // that for the latter, the copies are placed just before this loop (for
216         // incoming copies) and right after (for outgoing ones).
217 
218         // Inner loop copies have their own scope - we don't thus update
219         // consumed capacity. The footprint check above guarantees this inner
220         // loop's footprint fits.
221         affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
222                                /*filterMemRef=*/llvm::None, copyNests);
223       }
224       // Get to the next load or store op after 'forOp'.
225       curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
226         return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
227                 isa<AffineForOp>(op)) &&
228                copyNests.count(&op) == 0;
229       });
230       it = curBegin;
231     } else {
232       assert(copyNests.count(&*it) == 0 &&
233              "all copy nests generated should have been skipped above");
234       // We simply include this op in the current range and continue for more.
235       ++it;
236     }
237   }
238 
239   // Generate the copy for the final block range.
240   if (curBegin != block->end()) {
241     // Can't be a terminator because it would have been skipped above.
242     assert(!curBegin->isKnownTerminator() && "can't be a terminator");
243     // Exclude the affine terminator - hence, the std::prev.
244     affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
245                            copyOptions, /*filterMemRef=*/llvm::None, copyNests);
246   }
247 
248   return success();
249 }
250 
251 void AffineDataCopyGeneration::runOnFunction() {
252   FuncOp f = getFunction();
253   OpBuilder topBuilder(f.getBody());
254   zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);
255 
256   // Nests that are copy-in's or copy-out's; the root AffineForOps of those
257   // nests are stored herein.
258   DenseSet<Operation *> copyNests;
259 
260   // Clear recorded copy nests.
261   copyNests.clear();
262 
263   for (auto &block : f)
264     runOnBlock(&block, copyNests);
265 
266   // Promote any single iteration loops in the copy nests.
267   for (auto nest : copyNests) {
268     nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
269   }
270 }
271