//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses,
// reduced use of hardware prefetch streams, and reduced false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <limits>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");

static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
    "affine-data-copy-generate-fast-mem-capacity",
    llvm::cl::desc(
        "Set fast memory space capacity in KiB (default: unlimited)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool>
    clDma("affine-data-copy-generate-dma",
          llvm::cl::desc("Generate DMA instead of point-wise copy"),
          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));

static llvm::cl::opt<unsigned> clFastMemorySpace(
    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
    llvm::cl::desc(
        "Fast memory space identifier for copy generation (default: 1)"),
    llvm::cl::cat(clOptionsCategory));

static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
    llvm::cl::init(false),
    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
                   "for copy placement"),
    llvm::cl::cat(clOptionsCategory));

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into 'fastMemorySpace' and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
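///
/// As an illustrative sketch (a hypothetical input; the exact generated IR
/// depends on the input and the options used), for a nest such as:
///
///   affine.for %i = 0 to 256 {
///     %v = affine.load %A[%i] : memref<256xf32>
///   }
///
/// the pass would allocate a fast buffer ('alloc() : memref<256xf32, 1>'),
/// place a copy into it before the loop (a point-wise copy loop nest, or an
/// affine.dma_start/affine.dma_wait pair when DMA generation is enabled), and
/// rewrite the affine.load above to read from the fast buffer instead.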
// TODO(bondhugula): We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public FunctionPass<AffineDataCopyGeneration> {
  explicit AffineDataCopyGeneration(
      unsigned slowMemorySpace = 0,
      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
      int minDmaTransferSize = 1024,
      uint64_t fastMemCapacityBytes =
          (clFastMemoryCapacity.getNumOccurrences() > 0
               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
               : std::numeric_limits<uint64_t>::max()),
      bool generateDma = clDma,
      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}

  AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
      : slowMemorySpace(other.slowMemorySpace),
        fastMemorySpace(other.fastMemorySpace),
        tagMemorySpace(other.tagMemorySpace),
        minDmaTransferSize(other.minDmaTransferSize),
        fastMemCapacityBytes(other.fastMemCapacityBytes),
        generateDma(other.generateDma),
        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}

  void runOnFunction() override;
  LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Slow memory space associated with copies.
  const unsigned slowMemorySpace;
  // Fast memory space associated with copies.
  unsigned fastMemorySpace;
  // Memory space associated with DMA tags.
  unsigned tagMemorySpace;
  // Minimum DMA transfer size supported by the target in bytes.
  const int minDmaTransferSize;
  // Capacity of the fast memory space in bytes.
  uint64_t fastMemCapacityBytes;

  // If set, generate DMA operations instead of point-wise load's/store's.
  bool generateDma;

  // If set, ignore loops with steps other than 1.
  bool skipNonUnitStrideLoops;

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // end anonymous namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly
/// created buffers in 'fastMemorySpace', and replaces memory operations on the
/// former with operations on the latter. Only load op's are handled for now.
/// TODO(bondhugula): extend this to store op's.
std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}

/// Generates copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for op's nested within).
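///
/// For example, a block whose non-terminator operations are:
///   {load, load, store, for#1, load, for#2, store}
/// is partitioned into the copy ranges {load, load, store}, {for#1}, {load},
/// {for#2}, and {store}, with copies generated separately for each range.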
LogicalResult
AffineDataCopyGeneration::runOnBlock(Block *block,
                                     DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return success();

  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straight-line code (a contiguous chunk
  // of operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO(bondhugula): An 'affine.if' operation is currently treated like any
  // other operation; 'affine.if''s could have 'affine.for's in them and
  // should be handled separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If a non-copy-nest 'affine.for' op is hit, the current range ends there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform copying for the block range leading up to this 'for' op first.
      affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                             /*filterMemRef=*/llvm::None, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        Optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.hasValue() &&
                static_cast<uint64_t>(footprint.getValue()) >
                    fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than the
      // fast memory capacity (when provided), we recurse to copy at an inner
      // level until we find a depth at which the footprint fits in the fast
      // memory capacity. If the footprint can't be calculated, we assume for
      // now that it fits. Recurse inside if the footprint for 'forOp' exceeds
      // capacity, or when skipNonUnitStrideLoops is set and the step size is
      // not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'.
        // Note that for the latter, the copies are placed just before this
        // loop (for incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope, so we don't update consumed
        // capacity here. The footprint check above guarantees this inner
        // loop's footprint fits.
        affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
                               /*filterMemRef=*/llvm::None, copyNests);
      }
      // Get to the next load, store, or for op after 'forOp'.
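      // As above, operations that are themselves generated copy nests are
      // skipped so that copies aren't generated for the copy loops.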
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
                isa<AffineForOp>(op)) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->isKnownTerminator() && "can't be a terminator");
    // Exclude the affine terminator - hence, the std::prev.
    affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
  }

  return success();
}

void AffineDataCopyGeneration::runOnFunction() {
  FuncOp f = getFunction();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOp's of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests.
  for (auto nest : copyNests)
    nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
}

static PassRegistration<AffineDataCopyGeneration>
    pass("affine-data-copy-generate",
         "Generate explicit copying for memory operations");
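
// An illustrative invocation of this pass via mlir-opt, using the flag names
// registered above ('input.mlir' is a placeholder; capacity is in KiB):
//
//   mlir-opt -affine-data-copy-generate \
//            -affine-data-copy-generate-fast-mem-space=1 \
//            -affine-data-copy-generate-fast-mem-capacity=4096 input.mlir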