//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::gpu;

/// Returns the textual name of a GPU dimension.
static StringRef getDimName(unsigned dim) {
  if (dim == 0)
    return "x";
  if (dim == 1)
    return "y";
  if (dim == 2)
    return "z";

  llvm_unreachable("dimension ID overflow");
}

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
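///
/// For example (illustrative, not verbatim IR): copying a rank-2 memref
/// produces three loops, a single-iteration outermost loop mapped to thread
/// "z", a loop over dimension 0 mapped to thread "y", and an innermost loop
/// over dimension 1 mapped to thread "x", so that consecutive threads access
/// consecutive elements.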
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = from.getType().cast<MemRefType>();
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and are therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(
        from, b.create<arith::ConstantIndexOp>(idx)));
    steps.push_back(one);
  }
  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (unsigned i = 0; i < 3; ++i) {
    auto dimName = b.getStringAttr(getDimName(i));
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dimName));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dimName));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (auto en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the region's first block. The region is expected
/// to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %0 = load %from[%arg0, ..., %argN]
///           store %0, %to[%arg0, ..., %argN]
///         }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %1 = load %to[%arg0, ..., %argN]
///           store %1, %from[%arg0, ..., %argN]
///         }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate
/// barriers in cases where a value is only used by the thread that copies it.
/// Both copies are inserted unconditionally; an analysis would be required to
/// only copy live-in and live-out values when necessary. This copies the
/// entire memref pointed to by "from". If a smaller block would be sufficient,
/// the caller can create a subview of the memref and promote it instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  // Copy from the original memref into the promoted buffer at the top of the
  // region...
  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  // ...and copy the (possibly updated) contents back immediately before the
  // terminator.
  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and at the end of the function.
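///
/// For example (illustrative IR, assuming a statically shaped memref
/// argument), promoting argument 0 of
///
///   gpu.func @kernel(%arg0: memref<32xf32>) kernel { ... }
///
/// adds a workgroup attribution in the workgroup address space (3) and
/// redirects all uses of %arg0 to it:
///
///   gpu.func @kernel(%arg0: memref<32xf32>)
///       workgroup(%promoted: memref<32xf32, 3>) kernel {
///     <copy loops from %arg0 to %promoted>
///     gpu.barrier
///     <... original body, now using %promoted ...>
///     gpu.barrier
///     <copy loops from %promoted to %arg0>
///   }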
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() &&
         "can only promote statically shaped memrefs");

  // Get the type of the buffer in the workgroup memory.
  int workgroupMemoryAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(), {},
                                    workgroupMemoryAddressSpace);

  Value attribution = op.addWorkgroupAttribution(bufferType);

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
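
// A minimal usage sketch (the surrounding pass and the `module` variable are
// hypothetical, not part of this file): promote every statically shaped memref
// argument of each GPU function in a module.
//
//   module.walk([](gpu::GPUFuncOp func) {
//     for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
//       auto type = func.getArgument(i).getType().dyn_cast<MemRefType>();
//       if (type && type.hasStaticShape())
//         promoteToWorkgroupMemory(func, i);
//     }
//   });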