//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses, TLB misses, reduced
// use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware that explicitly manages levels in the memory
// hierarchy, and where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");

static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
    "affine-data-copy-generate-fast-mem-capacity",
    llvm::cl::desc(
        "Set fast memory space capacity in KiB (default: unlimited)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool>
    clDma("affine-data-copy-generate-dma",
          llvm::cl::desc("Generate DMA instead of point-wise copy"),
          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));

static llvm::cl::opt<unsigned> clFastMemorySpace(
    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
    llvm::cl::desc(
        "Fast memory space identifier for copy generation (default: 1)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
    llvm::cl::init(false),
    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
                   "for copy placement"),
    llvm::cl::cat(clOptionsCategory));

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO(bondhugula): We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
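//
// For illustration only (a rough sketch, not verbatim output of this pass):
// with the fast memory space set to 2, a nest such as
//
//   affine.for %i = 0 to 256 {
//     %v = affine.load %A[%i] : memref<1024xf32>
//     "use"(%v) : (f32) -> ()
//   }
//
// gets a buffer allocated in memory space 2 for the region of %A it accesses,
// a copy-in loop nest (or a DMA, depending on the 'generateDma' option) placed
// before the loop, and the affine.load rewritten to read from that buffer.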
struct AffineDataCopyGeneration
    : public FunctionPass<AffineDataCopyGeneration> {
  explicit AffineDataCopyGeneration(
      unsigned slowMemorySpace = 0,
      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
      int minDmaTransferSize = 1024,
      uint64_t fastMemCapacityBytes =
          (clFastMemoryCapacity.getNumOccurrences() > 0
               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
               : std::numeric_limits<uint64_t>::max()),
      bool generateDma = clDma,
      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}

  explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
      : slowMemorySpace(other.slowMemorySpace),
        fastMemorySpace(other.fastMemorySpace),
        tagMemorySpace(other.tagMemorySpace),
        minDmaTransferSize(other.minDmaTransferSize),
        fastMemCapacityBytes(other.fastMemCapacityBytes),
        generateDma(other.generateDma),
        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}

  void runOnFunction() override;
  LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Slow memory space associated with copies.
  const unsigned slowMemorySpace;
  // Fast memory space associated with copies.
  unsigned fastMemorySpace;
  // Memory space associated with DMA tags.
  unsigned tagMemorySpace;
  // Minimum DMA transfer size supported by the target in bytes.
  const int minDmaTransferSize;
  // Capacity of the fast memory space.
  uint64_t fastMemCapacityBytes;

  // If set, generate DMA operations instead of point-wise loads/stores.
  bool generateDma;

  // If set, ignore loops with steps other than 1.
  bool skipNonUnitStrideLoops;

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // end anonymous namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly
/// created buffers in 'fastMemorySpace', and replaces memory operations to
/// the former by the latter. Only load op's are handled for now.
/// TODO(bondhugula): extend this to store op's.
std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for op's nested within).
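/// For example (an illustrative sketch), a block with the operation sequence
///
///   affine.load, affine.store, affine.for {...}, affine.load
///
/// is partitioned into three ranges: the leading load/store pair, the
/// affine.for nest by itself, and the trailing load; copies are then generated
/// for each range independently.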
LogicalResult
AffineDataCopyGeneration::runOnBlock(Block *block,
                                     DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return success();

  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straight-line code (a contiguous chunk
  // of operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO(bondhugula): An 'affine.if' operation is being treated similarly to
  // any other operation. 'affine.if''s could have 'affine.for's in them;
  // treat them separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying for the range up until this 'for' op first.
      affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                             /*filterMemRef=*/llvm::None, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        Optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.hasValue() &&
                static_cast<uint64_t>(footprint.getValue()) >
                    fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than the
      // fast memory capacity (when provided), we recurse to copy at an inner
      // level until we find a depth at which the footprint fits in fast memory
      // capacity. If the footprint can't be calculated, we assume for now that
      // it fits. Recurse inside if the footprint for 'forOp' exceeds capacity,
      // or when skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope; we thus don't update the
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
                               /*filterMemRef=*/llvm::None, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->isKnownTerminator() && "can't be a terminator");
    // Exclude the affine terminator - hence, the std::prev.
    affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
  }

  return success();
}

void AffineDataCopyGeneration::runOnFunction() {
  FuncOp f = getFunction();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests.
  for (auto nest : copyNests) {
    nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
  }
}

static PassRegistration<AffineDataCopyGeneration>
    pass("affine-data-copy-generate",
         "Generate explicit copying for memory operations");
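
// Example invocation (illustrative only; option values are placeholders, the
// flag names are the ones registered above):
//   mlir-opt -affine-data-copy-generate \
//            -affine-data-copy-generate-fast-mem-space=2 \
//            -affine-data-copy-generate-fast-mem-capacity=16 \
//            input.mlir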