//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/MemoryPromotion.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LoopOps/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/Functional.h"
#include "mlir/Transforms/LoopUtils.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using namespace mlir::gpu;

/// Returns the textual name of a GPU dimension.
static StringRef getDimName(unsigned dim) {
  if (dim == 0)
    return "x";
  if (dim == 1)
    return "y";
  if (dim == 2)
    return "z";

  llvm_unreachable("dimension ID overflow");
}

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
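///
/// For instance, for a 1-D "from" memref, two single-iteration loops are
/// prepended so that all three thread dimensions can be mapped; after the
/// mapping, the nest looks roughly like:
///
///   for %i0 = <thread-id-z> to 1 step <block-dim-z> {
///     for %i1 = <thread-id-y> to 1 step <block-dim-y> {
///       for %i2 = <thread-id-x> to <dim-0-size> step <block-dim-x> {
///         <copy element %i2 from "from" to "to">
///       }
///     }
///   }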
static void insertCopyLoops(OpBuilder &builder, Location loc,
                            MemRefBoundsCapture &bounds, Value from, Value to) {
  // Create EDSC handles for bounds.
  unsigned rank = bounds.rank();
  SmallVector<ValueHandle, 4> lbs, ubs, steps;

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and are therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    ValueHandle zero = std_constant_index(0);
    ValueHandle one = std_constant_index(1);
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());

  // Emit constant operations for steps.
  steps.reserve(lbs.size());
  llvm::transform(bounds.getSteps(), std::back_inserter(steps),
                  [](int64_t step) { return std_constant_index(step); });

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = builder.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (unsigned i = 0; i < 3; ++i) {
    auto dimName = builder.getStringAttr(getDimName(i));
    threadIds.push_back(
        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
    blockDims.push_back(
        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
  }

  // Produce the loop nest with copies.
  SmallVector<ValueHandle, 8> ivs(lbs.size(), ValueHandle(indexType));
  auto ivPtrs = makeHandlePointers(MutableArrayRef<ValueHandle>(ivs));
  LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
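    // Only the innermost "rank" induction variables index into the memrefs;
    // the extra outermost loops are single-iteration padding used only for
    // thread mapping.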
    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
    StdIndexedValue fromHandle(from), toHandle(to);
    toHandle(activeIvs) = fromHandle(activeIvs);
  });

  // Map the innermost loops to threads in reverse order.
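  // mapLoopToProcessorIds rewrites each loop so that its iterations are
  // distributed across the matching thread dimension, using the block
  // dimension as the step.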
  for (auto en :
       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    auto loop = cast<loop::ForOp>(
        en.value().getValue().getParentRegion()->getParentOp());
    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                          {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and back from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %0 = load %from[%arg0, ..., %argN]
///           store %0, %to[%arg0, ..., %argN]
///         }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %1 = load %to[%arg0, ..., %argN]
///           store %1, %from[%arg0, ..., %argN]
///         }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in cases where a value is only used by the thread that copies it. Both
/// copies are inserted unconditionally; an analysis would be required to copy
/// only live-in and live-out values when necessary. This copies the entire
/// memref pointed to by "from". In case a smaller block would be sufficient,
/// the caller can create a subview of the memref and promote it instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = from.getType().cast<MemRefType>();
  auto toType = to.getType().cast<MemRefType>();
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(has_single_element(region) &&
         "unstructured control flow not supported");

  OpBuilder builder(region.getContext());
  builder.setInsertionPointToStart(&region.front());

  ScopedContext edscContext(builder, loc);
  MemRefBoundsCapture fromBoundsCapture(from);
  insertCopyLoops(builder, loc, fromBoundsCapture, from, to);
  builder.create<gpu::BarrierOp>(loc);

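  // Insert the second barrier and the loops copying the data back right before
  // the block terminator.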
  builder.setInsertionPoint(&region.front().back());
  builder.create<gpu::BarrierOp>(loc);
  insertCopyLoops(builder, loc, fromBoundsCapture, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and the end of the function.
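///
/// For example, promoting the memref argument of a kernel function rewrites it
/// roughly as follows (schematic; names are illustrative, and the promoted
/// buffer becomes a workgroup attribution in the workgroup address space):
///
///   gpu.func @foo(%arg0: memref<4x4xf32>)
///       workgroup(%promoted: memref<4x4xf32, 3>) {
///     <copy %arg0 into %promoted>
///     gpu.barrier
///     <original body, now using %promoted instead of %arg0>
///     gpu.barrier
///     <copy %promoted back into %arg0>
///     gpu.return
///   }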
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = value.getType().dyn_cast<MemRefType>();
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  Value attribution =
      op.addWorkgroupAttribution(type.getShape(), type.getElementType());

  // Replace the uses first, while only the original uses are present, so that
  // the uses introduced by the copy loops below are not replaced as well.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}