//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using namespace mlir::gpu;

/// Returns the textual name of a GPU dimension.
static StringRef getDimName(unsigned dim) {
  if (dim == 0)
    return "x";
  if (dim == 1)
    return "y";
  if (dim == 2)
    return "z";

  llvm_unreachable("dimension ID overflow");
}

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
static void insertCopyLoops(OpBuilder &builder, Location loc,
                            MemRefBoundsCapture &bounds, Value from,
                            Value to) {
  // Create EDSC handles for bounds.
  unsigned rank = bounds.rank();
  SmallVector<Value, 4> lbs, ubs, steps;

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and are therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    Value zero = std_constant_index(0);
    Value one = std_constant_index(1);
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());

  // Emit constant operations for steps.
  steps.reserve(lbs.size());
  llvm::transform(bounds.getSteps(), std::back_inserter(steps),
                  [](int64_t step) { return std_constant_index(step); });

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = builder.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (unsigned i = 0; i < 3; ++i) {
    auto dimName = builder.getStringAttr(getDimName(i));
    threadIds.push_back(
        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
    blockDims.push_back(
        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    ivs.assign(loopIvs.begin(), loopIvs.end());
    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
    StdIndexedValue fromHandle(from), toHandle(to);
    toHandle(activeIvs) = fromHandle(activeIvs);
  });

  // Map the innermost loops to threads in reverse order.
  for (auto en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and back from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// The barriers are inserted unconditionally since a value may be copied by
/// one thread and read by another. An analysis would be required to eliminate
/// barriers in the case where a value is only used by the thread that copies
/// it. Both copies are also inserted unconditionally; an analysis would be
/// required to copy only live-in and live-out values when necessary. This
/// copies the entire memref pointed to by "from". If a smaller block would be
/// sufficient, the caller can create a subview of the memref and promote the
/// subview instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  OpBuilder builder(region.getContext());
  builder.setInsertionPointToStart(&region.front());

  ScopedContext edscContext(builder, loc);
  MemRefBoundsCapture fromBoundsCapture(from);
  insertCopyLoops(builder, loc, fromBoundsCapture, from, to);
  builder.create<gpu::BarrierOp>(loc);

  builder.setInsertionPoint(&region.front().back());
  builder.create<gpu::BarrierOp>(loc);
  insertCopyLoops(builder, loc, fromBoundsCapture, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() &&
         "can only promote statically shaped memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);

  Value attribution = op.addWorkgroupAttribution(bufferType);

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
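
// The sketch below illustrates one way this utility could be driven from a
// pass. It is an illustrative example only, not part of the upstream API: the
// pass name and the "promote the first memref argument" policy are assumptions
// made for the sake of the example.
namespace {
struct PromoteFirstMemRefArgPass
    : public PassWrapper<PromoteFirstMemRefArgPass,
                         OperationPass<gpu::GPUFuncOp>> {
  void runOnOperation() override {
    gpu::GPUFuncOp func = getOperation();
    if (func.getNumArguments() == 0)
      return;
    // Only statically shaped memref arguments can be promoted; see the
    // assertion in promoteToWorkgroupMemory above.
    auto memrefType = func.getArgument(0).getType().dyn_cast<MemRefType>();
    if (memrefType && memrefType.hasStaticShape())
      promoteToWorkgroupMemory(func, /*arg=*/0);
  }
};
} // namespace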