//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses, TLB misses, reduced
// use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware with explicitly managed levels in the memory
// hierarchy, and where DMAs may have to be used. This optimization is often
// performed on already tiled code.
19 // 20 //===----------------------------------------------------------------------===// 21 22 #include "mlir/Analysis/Utils.h" 23 #include "mlir/Dialect/Affine/IR/AffineOps.h" 24 #include "mlir/Dialect/StandardOps/IR/Ops.h" 25 #include "mlir/Dialect/Affine/Passes.h" 26 #include "mlir/Pass/Pass.h" 27 #include "mlir/Transforms/LoopUtils.h" 28 #include "llvm/ADT/MapVector.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/Debug.h" 31 #include <algorithm> 32 33 #define DEBUG_TYPE "affine-data-copy-generate" 34 35 using namespace mlir; 36 37 static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); 38 39 static llvm::cl::opt<unsigned long long> clFastMemoryCapacity( 40 "affine-data-copy-generate-fast-mem-capacity", 41 llvm::cl::desc( 42 "Set fast memory space capacity in KiB (default: unlimited)"), 43 llvm::cl::cat(clOptionsCategory)); 44 45 static llvm::cl::opt<bool> 46 clDma("affine-data-copy-generate-dma", 47 llvm::cl::desc("Generate DMA instead of point-wise copy"), 48 llvm::cl::cat(clOptionsCategory), llvm::cl::init(true)); 49 50 static llvm::cl::opt<unsigned> clFastMemorySpace( 51 "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1), 52 llvm::cl::desc( 53 "Fast memory space identifier for copy generation (default: 1)"), 54 llvm::cl::cat(clOptionsCategory)); 55 56 static llvm::cl::opt<bool> clSkipNonUnitStrideLoop( 57 "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden, 58 llvm::cl::init(false), 59 llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths " 60 "for copy placement"), 61 llvm::cl::cat(clOptionsCategory)); 62 63 namespace { 64 65 /// Replaces all loads and stores on memref's living in 'slowMemorySpace' by 66 /// introducing copy operations to transfer data into `fastMemorySpace` and 67 /// rewriting the original load's/store's to instead load/store from the 68 /// allocated fast memory buffers. 
Additional options specify the identifier 69 /// corresponding to the fast memory space and the amount of fast memory space 70 /// available. The pass traverses through the nesting structure, recursing to 71 /// inner levels if necessary to determine at what depth copies need to be 72 /// placed so that the allocated buffers fit within the memory capacity 73 /// provided. 74 // TODO(bondhugula): We currently can't generate copies correctly when stores 75 // are strided. Check for strided stores. 76 struct AffineDataCopyGeneration 77 : public FunctionPass<AffineDataCopyGeneration> { 78 /// Include the generated pass utilities. 79 #define GEN_PASS_AffineDataCopyGeneration 80 #include "mlir/Dialect/Affine/Passes.h.inc" 81 82 explicit AffineDataCopyGeneration( 83 unsigned slowMemorySpace = 0, 84 unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0, 85 int minDmaTransferSize = 1024, 86 uint64_t fastMemCapacityBytes = 87 (clFastMemoryCapacity.getNumOccurrences() > 0 88 ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB 89 : std::numeric_limits<uint64_t>::max()), 90 bool generateDma = clDma, 91 bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop) 92 : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace), 93 tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize), 94 fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma), 95 skipNonUnitStrideLoops(skipNonUnitStrideLoops) {} 96 97 explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other) 98 : slowMemorySpace(other.slowMemorySpace), 99 fastMemorySpace(other.fastMemorySpace), 100 tagMemorySpace(other.tagMemorySpace), 101 minDmaTransferSize(other.minDmaTransferSize), 102 fastMemCapacityBytes(other.fastMemCapacityBytes), 103 generateDma(other.generateDma), 104 skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {} 105 106 void runOnFunction() override; 107 LogicalResult runOnBlock(Block *block, DenseSet<Operation *> ©Nests); 108 109 // 
Slow memory space associated with copies. 110 const unsigned slowMemorySpace; 111 // Fast memory space associated with copies. 112 unsigned fastMemorySpace; 113 // Memory space associated with DMA tags. 114 unsigned tagMemorySpace; 115 // Minimum DMA transfer size supported by the target in bytes. 116 const int minDmaTransferSize; 117 // Capacity of the faster memory space. 118 uint64_t fastMemCapacityBytes; 119 120 // If set, generate DMA operations instead of read/write. 121 bool generateDma; 122 123 // If set, ignore loops with steps other than 1. 124 bool skipNonUnitStrideLoops; 125 126 // Constant zero index to avoid too many duplicates. 127 Value zeroIndex = nullptr; 128 }; 129 130 } // end anonymous namespace 131 132 /// Generates copies for memref's living in 'slowMemorySpace' into newly created 133 /// buffers in 'fastMemorySpace', and replaces memory operations to the former 134 /// by the latter. Only load op's handled for now. 135 /// TODO(bondhugula): extend this to store op's. 136 std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass( 137 unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace, 138 int minDmaTransferSize, uint64_t fastMemCapacityBytes) { 139 return std::make_unique<AffineDataCopyGeneration>( 140 slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize, 141 fastMemCapacityBytes); 142 } 143 std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass() { 144 return std::make_unique<AffineDataCopyGeneration>(); 145 } 146 147 /// Generate copies for this block. The block is partitioned into separate 148 /// ranges: each range is either a sequence of one or more operations starting 149 /// and ending with an affine load or store op, or just an affine.forop (which 150 /// could have other affine for op's nested within). 
151 LogicalResult 152 AffineDataCopyGeneration::runOnBlock(Block *block, 153 DenseSet<Operation *> ©Nests) { 154 if (block->empty()) 155 return success(); 156 157 AffineCopyOptions copyOptions = {generateDma, slowMemorySpace, 158 fastMemorySpace, tagMemorySpace, 159 fastMemCapacityBytes}; 160 161 // Every affine.forop in the block starts and ends a block range for copying; 162 // in addition, a contiguous sequence of operations starting with a 163 // load/store op but not including any copy nests themselves is also 164 // identified as a copy block range. Straightline code (a contiguous chunk of 165 // operations excluding AffineForOp's) are always assumed to not exhaust 166 // memory. As a result, this approach is conservative in some cases at the 167 // moment; we do a check later and report an error with location info. 168 // TODO(bondhugula): An 'affine.if' operation is being treated similar to an 169 // operation. 'affine.if''s could have 'affine.for's in them; 170 // treat them separately. 171 172 // Get to the first load, store, or for op (that is not a copy nest itself). 173 auto curBegin = 174 std::find_if(block->begin(), block->end(), [&](Operation &op) { 175 return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) || 176 isa<AffineForOp>(op)) && 177 copyNests.count(&op) == 0; 178 }); 179 180 // Create [begin, end) ranges. 181 auto it = curBegin; 182 while (it != block->end()) { 183 AffineForOp forOp; 184 // If you hit a non-copy for loop, we will split there. 185 if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) { 186 // Perform the copying up unti this 'for' op first. 187 affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions, 188 /*filterMemRef=*/llvm::None, copyNests); 189 190 // Returns true if the footprint is known to exceed capacity. 
191 auto exceedsCapacity = [&](AffineForOp forOp) { 192 Optional<int64_t> footprint = 193 getMemoryFootprintBytes(forOp, 194 /*memorySpace=*/0); 195 return (footprint.hasValue() && 196 static_cast<uint64_t>(footprint.getValue()) > 197 fastMemCapacityBytes); 198 }; 199 200 // If the memory footprint of the 'affine.for' loop is higher than fast 201 // memory capacity (when provided), we recurse to copy at an inner level 202 // until we find a depth at which footprint fits in fast mem capacity. If 203 // the footprint can't be calculated, we assume for now it fits. Recurse 204 // inside if footprint for 'forOp' exceeds capacity, or when 205 // skipNonUnitStrideLoops is set and the step size is not one. 206 bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1 207 : exceedsCapacity(forOp); 208 if (recurseInner) { 209 // We'll recurse and do the copies at an inner level for 'forInst'. 210 // Recurse onto the body of this loop. 211 runOnBlock(forOp.getBody(), copyNests); 212 } else { 213 // We have enough capacity, i.e., copies will be computed for the 214 // portion of the block until 'it', and for 'it', which is 'forOp'. Note 215 // that for the latter, the copies are placed just before this loop (for 216 // incoming copies) and right after (for outgoing ones). 217 218 // Inner loop copies have their own scope - we don't thus update 219 // consumed capacity. The footprint check above guarantees this inner 220 // loop's footprint fits. 221 affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions, 222 /*filterMemRef=*/llvm::None, copyNests); 223 } 224 // Get to the next load or store op after 'forOp'. 
225 curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) { 226 return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) || 227 isa<AffineForOp>(op)) && 228 copyNests.count(&op) == 0; 229 }); 230 it = curBegin; 231 } else { 232 assert(copyNests.count(&*it) == 0 && 233 "all copy nests generated should have been skipped above"); 234 // We simply include this op in the current range and continue for more. 235 ++it; 236 } 237 } 238 239 // Generate the copy for the final block range. 240 if (curBegin != block->end()) { 241 // Can't be a terminator because it would have been skipped above. 242 assert(!curBegin->isKnownTerminator() && "can't be a terminator"); 243 // Exclude the affine terminator - hence, the std::prev. 244 affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()), 245 copyOptions, /*filterMemRef=*/llvm::None, copyNests); 246 } 247 248 return success(); 249 } 250 251 void AffineDataCopyGeneration::runOnFunction() { 252 FuncOp f = getFunction(); 253 OpBuilder topBuilder(f.getBody()); 254 zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0); 255 256 // Nests that are copy-in's or copy-out's; the root AffineForOps of those 257 // nests are stored herein. 258 DenseSet<Operation *> copyNests; 259 260 // Clear recorded copy nests. 261 copyNests.clear(); 262 263 for (auto &block : f) 264 runOnBlock(&block, copyNests); 265 266 // Promote any single iteration loops in the copy nests. 267 for (auto nest : copyNests) { 268 nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); 269 } 270 } 271