//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = from.getType().cast<MemRefType>();
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(
        from, b.create<arith::ConstantIndexOp>(idx)));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
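  // The innermost (fastest-varying) loop is enumerated first and therefore
  // mapped to thread dimension x, so consecutive threads copy consecutive
  // elements and the accesses can coalesce.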
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate
/// barriers in cases where a value is only used by the thread that copies it.
/// Both copies are inserted unconditionally; an analysis would be required to
/// only copy live-in and live-out values when necessary. This copies the
/// entire memref pointed to by "from". In case a smaller block would be
/// sufficient, the caller can create a subview of the memref and promote it
/// instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
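
// A rough sketch of the expected effect of promoting argument 0 of a gpu.func,
// assuming a memref<4x8xf32> argument and assuming
// GPUDialect::getWorkgroupAddressSpace() returns 3; the attribution name %ws
// is illustrative, the printer chooses its own names:
//
//   gpu.func @kernel(%arg0: memref<4x8xf32>)
//       workgroup(%ws: memref<4x8xf32, 3>) {
//     <copy loops: %arg0 -> %ws, mapped to threads>
//     gpu.barrier
//     <original body, with uses of %arg0 replaced by %ws>
//     gpu.barrier
//     <copy loops: %ws -> %arg0, mapped to threads>
//     gpu.return
//   }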