//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
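///
/// For illustration (a sketch of the shape of the result, not verbatim
/// output), copying a rank-1 memref with three workgroup dimensions yields two
/// single-iteration outer loops so that all thread dimensions are consumed.
/// After mapping, with %tx/%bx standing for the x thread id and block
/// dimension (and similarly for y and z), the nest looks roughly like:
///
///   for %i0 = %tz to 1 step %bz {
///     for %i1 = %ty to 1 step %by {
///       for %i2 = %tx to <dim> step %bx {
///         %0 = load %from[%i2]
///         store %0, %to[%i2]
///       }
///     }
///   }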
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = from.getType().cast<MemRefType>();
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and are therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(
        from, b.create<arith::ConstantIndexOp>(idx)));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
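        // Only the last `rank` induction variables index the memref; the
        // leading single-iteration loops exist solely to consume the remaining
        // thread dimensions.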
        auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location in the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in cases where a value is only used by the thread that copies it. Both
/// copies are inserted unconditionally; an analysis would be required to copy
/// only live-in and live-out values when necessary. This copies the entire
/// memref pointed to by "from". In case a smaller block would be sufficient,
/// the caller can create a subview of the memref and promote it instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
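///
/// For illustration (a sketch of the shape of the result, not verbatim
/// output), promoting argument 0 of
///
///   gpu.func @kernel(%arg0: memref<32xf32>) kernel { ... }
///
/// adds a workgroup attribution in the workgroup address space (written as 3
/// below, standing for gpu::GPUDialect::getWorkgroupAddressSpace()), redirects
/// all uses of %arg0 to it, and wraps the original body in copies and
/// barriers:
///
///   gpu.func @kernel(%arg0: memref<32xf32>)
///       workgroup(%attr: memref<32xf32, 3>) kernel {
///     <copy %arg0 into %attr>
///     gpu.barrier
///     <... original body, now using %attr ...>
///     gpu.barrier
///     <copy %attr back into %arg0>
///   }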
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
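
// A minimal usage sketch (hypothetical, not part of this file): a pass that
// promotes every statically shaped memref argument of a gpu.func. The pass
// name and structure below are illustrative only.
//
//   struct TestPromotionPass
//       : public PassWrapper<TestPromotionPass, OperationPass<gpu::GPUFuncOp>> {
//     void runOnOperation() override {
//       gpu::GPUFuncOp func = getOperation();
//       for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
//         auto type = func.getArgument(i).getType().dyn_cast<MemRefType>();
//         if (type && type.hasStaticShape())
//           promoteToWorkgroupMemory(func, i);
//       }
//     }
//   };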