1 //===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This file implements the GPU dialect pattern rewriters that make GPU ops
// within a region execute asynchronously.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "PassDetail.h"
15 #include "mlir/Dialect/Async/IR/Async.h"
16 #include "mlir/Dialect/GPU/GPUDialect.h"
17 #include "mlir/Dialect/GPU/Passes.h"
18 #include "mlir/Dialect/GPU/Utils.h"
19 #include "mlir/Dialect/StandardOps/IR/Ops.h"
20 #include "mlir/IR/BlockAndValueMapping.h"
21 #include "mlir/IR/Builders.h"
22 #include "mlir/IR/PatternMatch.h"
23 #include "mlir/IR/SymbolTable.h"
24 #include "mlir/Support/LLVM.h"
25 #include "mlir/Transforms/RegionUtils.h"
26 #include "llvm/ADT/TypeSwitch.h"
27 
28 using namespace mlir;
29 namespace {
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
  // Walk callbacks implementing the three rewrite stages applied in order by
  // runOnFunction(); see the struct definitions below for details.
  struct ThreadTokenCallback;
  struct DeferWaitCallback;
  struct SingleTokenUseCallback;
  void runOnFunction() override;
};
36 } // namespace
37 
// Returns whether `op` may be a block terminator ("might" so that unregistered
// ops are treated conservatively).
static bool isTerminator(Operation *op) {
  return op->mightHaveTrait<OpTrait::IsTerminator>();
}
// Returns whether `op` has memory side effects; conservatively true for ops
// that do not implement MemoryEffectOpInterface.
static bool hasSideEffects(Operation *op) {
  return !MemoryEffectOpInterface::hasNoEffect(op);
}
44 
// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {
  ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

  // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
  // create a current token (unless it already exists), and 'thread' that token
  // through the `op` so that it executes asynchronously.
  //
  // If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
  // host-synchronize execution. A `!gpu.async.token` will therefore only be
  // used inside of its block and GPU execution will always synchronize with
  // the host at block boundaries.
  WalkResult operator()(Operation *op) {
    // gpu.launch is expected to have been outlined to gpu.launch_func before
    // this pass runs, and the input is expected to contain no gpu.wait ops
    // yet (this pass introduces all of them itself).
    if (isa<gpu::LaunchOp>(op))
      return op->emitOpError("replace with gpu.launch_func first");
    if (isa<gpu::WaitOp>(op))
      return op->emitOpError("unexpected pre-existing gpu.wait");
    builder.setInsertionPoint(op);
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
      return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
    if (!currentToken)
      return success();
    // Insert host synchronization before terminator or op with side effects.
    if (isTerminator(op) || hasSideEffects(op))
      currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
    return success();
  }

private:
  // Replaces asyncOp with a clone that returns a token.
  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
    auto *op = asyncOp.getOperation();
    if (asyncOp.getAsyncToken())
      // TODO: Support ops that are already async.
      return op->emitOpError("is already async");
    if (op->getNumRegions() > 0)
      return op->emitOpError("regions are not supported");

    auto tokenType = builder.getType<gpu::AsyncTokenType>();

    // If there is no current token, insert a `gpu.wait async` without
    // dependencies to create one.
    if (!currentToken)
      currentToken = createWaitOp(op->getLoc(), tokenType, {});
    asyncOp.addAsyncDependency(currentToken);

    // Clone the op to return a token in addition to the other results. The
    // token is appended as the last result.
    SmallVector<Type, 1> resultTypes;
    resultTypes.reserve(1 + op->getNumResults());
    copy(op->getResultTypes(), std::back_inserter(resultTypes));
    resultTypes.push_back(tokenType);
    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
                                    op->getOperands(), op->getAttrDictionary(),
                                    op->getSuccessors());

    // Replace the op with the async clone. The clone's token result becomes
    // the new current token.
    auto results = newOp->getResults();
    currentToken = results.back();
    builder.insert(newOp);
    op->replaceAllUsesWith(results.drop_back());
    op->erase();

    return success();
  }

  // Creates a gpu.wait op with the given async dependencies; returns its
  // !gpu.async.token result (null when `resultType` is null, i.e. for a
  // host-synchronous wait).
  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
    return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
  }

  OpBuilder builder;

  // The token that represents the current asynchronous dependency. Its valid
  // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
  // In between, each gpu::AsyncOpInterface depends on the current token and
  // produces the new one.
  Value currentToken = {};
};
123 
124 /// Erases `executeOp` and returns a clone with additional `results`.
125 async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
126                                    ValueRange results) {
127   // Add values to async.yield op.
128   Operation *yieldOp = executeOp.getBody()->getTerminator();
129   yieldOp->insertOperands(yieldOp->getNumOperands(), results);
130 
131   // Construct new result type list with additional types.
132   SmallVector<Type, 2> resultTypes;
133   resultTypes.reserve(executeOp.getNumResults() + results.size());
134   transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
135             [](Type type) {
136               // Extract value type from !async.value.
137               if (auto valueType = type.dyn_cast<async::ValueType>())
138                 return valueType.getValueType();
139               assert(type.isa<async::TokenType>() && "expected token type");
140               return type;
141             });
142   transform(results, std::back_inserter(resultTypes),
143             [](Value value) { return value.getType(); });
144 
145   // Clone executeOp with the extra results.
146   OpBuilder builder(executeOp);
147   auto newOp = builder.create<async::ExecuteOp>(
148       executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
149       executeOp.dependencies(), executeOp.operands());
150   BlockAndValueMapping mapper;
151   newOp.getRegion().getBlocks().clear();
152   executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
153 
154   // Replace executeOp with cloned one.
155   executeOp.getOperation()->replaceAllUsesWith(
156       newOp.getResults().drop_back(results.size()));
157   executeOp.erase();
158 
159   return newOp;
160 }
161 
// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
struct GpuAsyncRegionPass::DeferWaitCallback {
  // If the `executeOp`s token is used only in `async.execute` or `async.await`
  // ops, add the region's last `gpu.wait` op to the worklist if it is
  // synchronous and is the last op with side effects.
  void operator()(async::ExecuteOp executeOp) {
    if (!areAllUsersExecuteOrAwait(executeOp.token()))
      return;
    // async.execute's region is currently restricted to one block.
    for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
      if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
        // Only a synchronous (token-less) gpu.wait can be deferred.
        if (!waitOp.asyncToken())
          worklist.push_back(waitOp);
        return;
      }
      // Stop at the first op with side effects; a gpu.wait before it cannot
      // be moved out.
      if (hasSideEffects(&op))
        return;
    }
  }

  // The destructor performs the actual rewrite work.
  ~DeferWaitCallback() {
    // Index-based loop: addAsyncDependencyAfter() may append new entries to
    // the worklist while it is being processed.
    for (size_t i = 0; i < worklist.size(); ++i) {
      auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();

      // Erase `gpu.wait` and return async dependencies from execute op instead.
      SmallVector<Value, 4> dependencies = waitOp.asyncDependencies();
      waitOp.erase();
      executeOp = addExecuteResults(executeOp, dependencies);

      // Add the async dependency to each user of the `async.execute` token.
      auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
      for (Operation *user : executeOp.token().getUsers())
        addAsyncDependencyAfter(asyncTokens, user);
    }
  }

private:
  // Returns whether all token users are either 'async.execute' or 'async.await'
  // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
  // 'async.execute' body to its users. Specifically, we do not allow
  // terminator users, because it could mean that the `async.execute` is inside
  // control flow code.
  static bool areAllUsersExecuteOrAwait(Value token) {
    return !token.use_empty() &&
           llvm::all_of(token.getUsers(), [](Operation *user) {
             return isa<async::ExecuteOp, async::AwaitOp>(user);
           });
  }

  // Add the `asyncToken` as dependency as needed after `op`.
  void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
    OpBuilder builder(op->getContext());
    auto loc = op->getLoc();

    // After the TypeSwitch below, `it` points to the first op from which to
    // search for a consumer of `tokens`, and `tokens` holds the
    // !gpu.async.token values usable at that point.
    Block::iterator it;
    SmallVector<Value, 1> tokens;
    tokens.reserve(asyncTokens.size());
    TypeSwitch<Operation *>(op)
        .Case<async::AwaitOp>([&](auto awaitOp) {
          // Add async.await ops to wait for the !gpu.async.tokens.
          builder.setInsertionPointAfter(op);
          for (auto asyncToken : asyncTokens)
            tokens.push_back(
                builder.create<async::AwaitOp>(loc, asyncToken).result());
          // Set `it` after the inserted async.await ops.
          it = builder.getInsertionPoint();
        })
        .Case<async::ExecuteOp>([&](auto executeOp) {
          // Set `it` to the beginning of the region and add asyncTokens to the
          // async.execute operands; the tokens become block arguments of the
          // region.
          it = executeOp.getBody()->begin();
          executeOp.operandsMutable().append(asyncTokens);
          SmallVector<Type, 1> tokenTypes(
              asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
          copy(executeOp.getBody()->addArguments(tokenTypes),
               std::back_inserter(tokens));
        });

    // Advance `it` to terminator or op with side-effects.
    it = std::find_if(it, Block::iterator(), [](Operation &op) {
      return isTerminator(&op) || hasSideEffects(&op);
    });

    // If `op` implements the AsyncOpInterface, add `token` to the list of async
    // dependencies.
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
      for (auto token : tokens)
        asyncOp.addAsyncDependency(token);
      return;
    }

    // Otherwise, insert a gpu.wait before 'it'.
    builder.setInsertionPoint(it->getBlock(), it);
    auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

    // If the new waitOp is at the end of an async.execute region, add it to the
    // worklist. 'operator()(executeOp)' would do the same, but this is faster.
    auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
    if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
        !it->getNextNode())
      worklist.push_back(waitOp);
  }

  SmallVector<gpu::WaitOp, 8> worklist;
};
270 
// Callback for `async.execute` ops which repeats !gpu.async.token results
// so that each of them is only used once.
struct GpuAsyncRegionPass::SingleTokenUseCallback {
  void operator()(async::ExecuteOp executeOp) {
    // Extract !gpu.async.token results which have multiple uses.
    auto multiUseResults =
        llvm::make_filter_range(executeOp.results(), [](OpResult result) {
          if (result.use_empty() || result.hasOneUse())
            return false;
          auto valueType = result.getType().dyn_cast<async::ValueType>();
          return valueType &&
                 valueType.getValueType().isa<gpu::AsyncTokenType>();
        });
    if (multiUseResults.empty())
      return;

    // Indices within !async.execute results (i.e. without the async.token).
    // Collected up front because the loop below replaces `executeOp`.
    SmallVector<int, 4> indices;
    transform(multiUseResults, std::back_inserter(indices),
              [](OpResult result) {
                return result.getResultNumber() - 1; // Index without token.
              });

    for (auto index : indices) {
      assert(!executeOp.results()[index].getUses().empty());
      // Repeat async.yield token result, one for each use after the first one.
      auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
      auto count = std::distance(uses.begin(), uses.end());
      auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
      SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
      // addExecuteResults() erases the old op; continue with the clone.
      executeOp = addExecuteResults(executeOp, operands);
      // Update 'uses' to refer to the new executeOp.
      uses = llvm::drop_begin(executeOp.results()[index].getUses());
      auto results = executeOp.results().take_back(count);
      // Rewire each extra use to one of the repeated results.
      for (auto pair : llvm::zip(uses, results))
        std::get<0>(pair).set(std::get<1>(pair));
    }
  }
};
310 
311 // Replaces synchronous GPU ops in the op's region with asynchronous ones and
312 // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
313 // execution semantics and that no GPU ops are asynchronous yet.
314 void GpuAsyncRegionPass::runOnFunction() {
315   if (getFunction()
316           .getRegion()
317           .walk(ThreadTokenCallback(getContext()))
318           .wasInterrupted())
319     return signalPassFailure();
320 
321   // Collect gpu.wait ops that we can move out of async.execute regions.
322   getFunction().getRegion().walk(DeferWaitCallback());
323   // Makes each !gpu.async.token returned from async.execute op have single use.
324   getFunction().getRegion().walk(SingleTokenUseCallback());
325 }
326 
// Public factory for the pass defined above; the pass class itself lives in
// an anonymous namespace.
std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
  return std::make_unique<GpuAsyncRegionPass>();
}
330