//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise loads/stores or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in a near elimination of conflict misses and TLB misses, as
// well as reduced use of hardware prefetch streams and reduced false sharing.
// It is also necessary for hardware with explicitly managed levels in the
// memory hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
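//
// For instance, a loop nest reading a memref in the slow (default) memory
// space could conceptually be rewritten along the lines below (an illustrative
// sketch only; the exact IR generated depends on the options, the accessed
// region, and the loop structure):
//
//   affine.for %i = 0 to 256 {
//     %v = affine.load %A[%i] : memref<256xf32>
//     "use"(%v) : (f32) -> ()
//   }
//
// becomes, with point-wise copies into fast memory space 1:
//
//   %fastA = alloc() : memref<256xf32, 1>
//   affine.for %i = 0 to 256 {   // copy-in nest
//     %v = affine.load %A[%i] : memref<256xf32>
//     affine.store %v, %fastA[%i] : memref<256xf32, 1>
//   }
//   affine.for %i = 0 to 256 {
//     %v = affine.load %fastA[%i] : memref<256xf32, 1>
//     "use"(%v) : (f32) -> ()
//   }
//   dealloc %fastA : memref<256xf32, 1>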
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");

static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
    "affine-data-copy-generate-fast-mem-capacity",
    llvm::cl::desc(
        "Set fast memory space capacity in KiB (default: unlimited)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool>
    clDma("affine-data-copy-generate-dma",
          llvm::cl::desc("Generate DMA instead of point-wise copy"),
          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));

static llvm::cl::opt<unsigned> clFastMemorySpace(
    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
    llvm::cl::desc(
        "Fast memory space identifier for copy generation (default: 1)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
    llvm::cl::init(false),
    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
                   "for copy placement"),
    llvm::cl::cat(clOptionsCategory));
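
// These flags are intended to be used from the command line together with the
// pass registered at the bottom of this file, for example roughly as follows
// (an illustrative invocation only):
//
//   mlir-opt -affine-data-copy-generate \
//     -affine-data-copy-generate-dma=false \
//     -affine-data-copy-generate-fast-mem-space=2 \
//     -affine-data-copy-generate-fast-mem-capacity=16 input.mlir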

namespace {

/// Replaces all loads and stores on memrefs living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into 'fastMemorySpace' and
/// rewriting the original loads/stores to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO(bondhugula): We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public FunctionPass<AffineDataCopyGeneration> {
  explicit AffineDataCopyGeneration(
      unsigned slowMemorySpace = 0,
      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
      int minDmaTransferSize = 1024,
      uint64_t fastMemCapacityBytes =
          (clFastMemoryCapacity.getNumOccurrences() > 0
               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
               : std::numeric_limits<uint64_t>::max()),
      bool generateDma = clDma,
      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}

  explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
      : slowMemorySpace(other.slowMemorySpace),
        fastMemorySpace(other.fastMemorySpace),
        tagMemorySpace(other.tagMemorySpace),
        minDmaTransferSize(other.minDmaTransferSize),
        fastMemCapacityBytes(other.fastMemCapacityBytes),
        generateDma(other.generateDma),
        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}

  void runOnFunction() override;
  LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Slow memory space associated with copies.
  const unsigned slowMemorySpace;
  // Fast memory space associated with copies.
  unsigned fastMemorySpace;
  // Memory space associated with DMA tags.
  unsigned tagMemorySpace;
  // Minimum DMA transfer size supported by the target in bytes.
  const int minDmaTransferSize;
  // Capacity of the faster memory space.
  uint64_t fastMemCapacityBytes;

  // If set, generate DMA operations instead of read/write.
  bool generateDma;

  // If set, ignore loops with steps other than 1.
  bool skipNonUnitStrideLoops;

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // end anonymous namespace

/// Generates copies for memrefs living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Both affine loads and stores are handled; however, copies
/// are not yet generated correctly for strided stores (see the TODO on the
/// pass struct above).
std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
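
// A client pipeline would typically create and add this pass programmatically,
// for example roughly as follows (an illustrative sketch; the exact pass
// manager API depends on the MLIR revision in use):
//
//   pm.addPass(createAffineDataCopyGenerationPass(
//       /*slowMemorySpace=*/0, /*fastMemorySpace=*/1));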

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for ops nested within).
LogicalResult
AffineDataCopyGeneration::runOnBlock(Block *block,
                                     DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return success();

  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straight-line code (a contiguous chunk
  // of operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
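  //
  // For example, a block of the form (a sketch only):
  //
  //   affine.load ...; affine.store ...; affine.for ...; affine.load ...
  //
  // is partitioned into three ranges: the leading load/store sequence, the
  // 'affine.for' op by itself, and the trailing load (everything up to, but
  // not including, the block terminator).
  //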
  // TODO(bondhugula): An 'affine.if' operation is currently treated like any
  // other operation; 'affine.if' ops could have 'affine.for' ops nested within
  // them, so treat them separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'affine.for' op, we will split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                             /*filterMemRef=*/llvm::None, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        Optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.hasValue() &&
                static_cast<uint64_t>(footprint.getValue()) >
                    fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which footprint fits in fast mem capacity. If
      // the footprint can't be calculated, we assume for now that it fits. If
      // 'skipNonUnitStrideLoops' is set, we instead recurse inside whenever the
      // step size of this loop is not one, without checking the footprint.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update the
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
                               /*filterMemRef=*/llvm::None, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->isKnownTerminator() && "can't be a terminator");
    // Exclude the affine terminator - hence, the std::prev.
    affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
  }

  return success();
}

void AffineDataCopyGeneration::runOnFunction() {
  FuncOp f = getFunction();
  OpBuilder topBuilder(f.getBody());
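  // Note: this materializes a single constant index 0 at the top of the
  // function so that duplicates need not be created later (see the 'zeroIndex'
  // member above).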
  zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests.
  for (auto nest : copyNests) {
    nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
  }
}

static PassRegistration<AffineDataCopyGeneration>
    pass("affine-data-copy-generate",
         "Generate explicit copying for memory operations");