//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect pattern rewriters that make GPU op
// within a region execute asynchronously.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;
namespace {
// Function pass that rewrites synchronous GPU ops into their asynchronous
// form and inserts `gpu.wait` ops for the required host synchronization.
// The work is done by the two walk callbacks declared below.
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
  struct ThreadTokenCallback;
  struct DeferWaitCallback;
  void runOnFunction() override;
};
} // namespace

// Returns true if `op` could be a terminator (conservatively true for any op
// that is not known to be a non-terminator).
static bool isTerminator(Operation *op) { return !op->isKnownNonTerminator(); }
// Returns true if `op` may have memory side effects (conservatively true for
// ops that do not implement/report MemoryEffectOpInterface).
static bool hasSideEffects(Operation *op) {
  return !MemoryEffectOpInterface::hasNoEffect(op);
}

// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {
  ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

  // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
  // create a current token (unless it already exists), and 'thread' that token
  // through the `op` so that it executes asynchronously.
  //
  // If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
  // host-synchronize execution. A `!gpu.async.token` will therefore only be
  // used inside of its block and GPU execution will always synchronize with
  // the host at block boundaries.
  WalkResult operator()(Operation *op) {
    // gpu.launch must have been outlined to gpu.launch_func already; this
    // rewriter only handles ops implementing the AsyncOpInterface.
    if (isa<gpu::LaunchOp>(op))
      return op->emitOpError("replace with gpu.launch_func first");
    // The pass assumes it creates all gpu.wait ops itself.
    if (isa<gpu::WaitOp>(op))
      return op->emitOpError("unexpected pre-existing gpu.wait");
    builder.setInsertionPoint(op);
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
      return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
    if (!currentToken)
      return success();
    // Insert host synchronization before terminator or op with side effects.
    if (isTerminator(op) || hasSideEffects(op))
      currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
    return success();
  }

private:
  // Replaces asyncOp with a clone that returns a token.
  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
    auto *op = asyncOp.getOperation();
    if (asyncOp.getAsyncToken())
      // TODO: Support ops that are already async.
      return op->emitOpError("is already async");
    if (op->getNumRegions() > 0)
      return op->emitOpError("regions are not supported");

    // If there is no current token, insert a `gpu.wait async` without
    // dependencies to create one.
    if (!currentToken)
      currentToken = createWaitOp(op->getLoc(), tokenType, {});
    asyncOp.addAsyncDependency(currentToken);

    // Clone the op to return a token in addition to the other results.
    // The token type is appended last so `results.back()` below is the token.
    SmallVector<Type, 1> resultTypes;
    resultTypes.reserve(1 + op->getNumResults());
    copy(op->getResultTypes(), std::back_inserter(resultTypes));
    resultTypes.push_back(tokenType);
    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
                                    op->getOperands(), op->getAttrDictionary(),
                                    op->getSuccessors());

    // Replace the op with the async clone.
    auto results = newOp->getResults();
    currentToken = results.back();
    builder.insert(newOp);
    // Only the original results are replaced; the trailing token result is
    // new and threaded through `currentToken`.
    op->replaceAllUsesWith(results.drop_back());
    op->erase();

    return success();
  }

  // Creates a gpu.wait op at `loc`. With a null `resultType` it is a
  // synchronous wait; with the token type it is `gpu.wait async` and the
  // returned value is the produced token (null otherwise).
  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
    return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
  }

  OpBuilder builder;
  const Type tokenType = builder.getType<gpu::AsyncTokenType>();
  // The token that represents the current asynchronous dependency. Its valid
  // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
  // In between, each gpu::AsyncOpInterface depends on the current token and
  // produces the new one.
  Value currentToken = {};
};

// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
struct GpuAsyncRegionPass::DeferWaitCallback {
  // If the `executeOp`s token is used only in `async.execute` or `async.await`
  // ops, add the region's last `gpu.wait` op to the worklist if it is
  // synchronous and is the last op with side effects.
  void operator()(async::ExecuteOp executeOp) {
    if (!areAllUsersExecuteOrAwait(executeOp.token()))
      return;
    // async.execute's region is currently restricted to one block.
    // Scan backwards from the terminator: the wait is only deferrable if no
    // op with side effects follows it.
    for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
      if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
        if (!waitOp.asyncToken())
          worklist.push_back(waitOp);
        return;
      }
      if (hasSideEffects(&op))
        return;
    }
  }

  // The destructor performs the actual rewrite work.
  ~DeferWaitCallback() {
    // Index-based loop on purpose: addAsyncDependencyAfter() may append new
    // waitOps to the worklist, and size() is re-read each iteration so they
    // are processed as well.
    for (size_t i = 0; i < worklist.size(); ++i) {
      auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();
      auto numDependencies = waitOp.asyncDependencies().size();

      // Erase `gpu.wait` and return async dependencies from region instead.
      auto &yieldOp = executeOp.getBody()->getOperations().back();
      yieldOp.insertOperands(yieldOp.getNumOperands(),
                             waitOp.asyncDependencies());
      waitOp.erase();
      auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);

      // Add the async dependency to each user of the `async.execute` token.
      for (Operation *user : executeOp.token().getUsers())
        addAsyncDependencyAfter(asyncTokens, user);
    }
  }

private:
  // Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
  // `executeOp` is an in-out reference: it is replaced with the new clone.
  static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
                                         unsigned count) {
    auto numResults = executeOp.getNumResults() + count;

    // Construct new result type list with `count` additional types.
    SmallVector<Type, 2> resultTypes;
    resultTypes.reserve(numResults);
    transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
              [](Type type) {
                // Extract value type from !async.value (the ExecuteOp builder
                // re-wraps it below).
                if (auto valueType = type.dyn_cast<async::ValueType>())
                  return valueType.getValueType();
                assert(type.isa<async::TokenType>() && "expected token type");
                return type;
              });
    OpBuilder builder(executeOp);
    auto tokenType = builder.getType<gpu::AsyncTokenType>();
    resultTypes.resize(numResults, tokenType);

    // Clone executeOp with the extra `!gpu.async.token` results.
    auto newOp = builder.create<async::ExecuteOp>(
        executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
        executeOp.dependencies(), executeOp.operands());
    BlockAndValueMapping mapper;
    // Discard the entry block created by the builder before cloning the
    // original region into the new op.
    newOp.getRegion().getBlocks().clear();
    executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);

    // Replace executeOp with cloned one. Only the original results are
    // replaced; the appended token results have no users yet.
    executeOp.getOperation()->replaceAllUsesWith(
        newOp.getResults().drop_back(count));
    executeOp.erase();
    executeOp = newOp;

    // Return the new result values.
    return executeOp.getResults().take_back(count);
  }

  // Returns whether all token users are either 'async.execute' or 'async.await'
  // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
  // 'async.execute' body to it's users. Specifically, we do not allow
  // terminator users, because it could mean that the `async.execute` is inside
  // control flow code.
  static bool areAllUsersExecuteOrAwait(Value token) {
    return llvm::all_of(token.getUsers(), [](Operation *user) {
      return isa<async::ExecuteOp, async::AwaitOp>(user);
    });
  }

  // Add the `asyncToken` as dependency as needed after `op`.
  void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
    OpBuilder builder(op->getContext());
    auto loc = op->getLoc();

    Block::iterator it;
    SmallVector<Value, 1> tokens;
    tokens.reserve(asyncTokens.size());
    TypeSwitch<Operation *>(op)
        .Case<async::AwaitOp>([&](auto awaitOp) {
          // Add async.await ops to wait for the !gpu.async.tokens.
          builder.setInsertionPointAfter(op);
          for (auto asyncToken : asyncTokens)
            tokens.push_back(
                builder.create<async::AwaitOp>(loc, asyncToken).result());
          // Set `it` after the inserted async.await ops.
          it = builder.getInsertionPoint();
        })
        .Case<async::ExecuteOp>([&](auto executeOp) {
          // Set `it` to the beginning of the region and add asyncTokens to the
          // async.execute operands.
          it = executeOp.getBody()->begin();
          executeOp.operandsMutable().append(asyncTokens);
          SmallVector<Type, 1> tokenTypes(
              asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
          copy(executeOp.getBody()->addArguments(tokenTypes),
               std::back_inserter(tokens));
        });

    // Advance `it` to terminator or op with side-effects.
    // NOTE(review): the end sentinel is a default-constructed iterator; the
    // search relies on every block ending in a terminator (for which the
    // predicate is true), so the sentinel is never actually reached.
    it = std::find_if(it, Block::iterator(), [](Operation &op) {
      return isTerminator(&op) || hasSideEffects(&op);
    });

    // If `op` implements the AsyncOpInterface, add `token` to the list of async
    // dependencies.
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
      for (auto token : tokens)
        asyncOp.addAsyncDependency(token);
      return;
    }

    // Otherwise, insert a gpu.wait before 'it'.
    builder.setInsertionPoint(it->getBlock(), it);
    auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

    // If the new waitOp is at the end of an async.execute region, add it to the
    // worklist. 'operator()(executeOp)' would do the same, but this is faster.
    auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
    if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
        !it->getNextNode())
      worklist.push_back(waitOp);
  }

  SmallVector<gpu::WaitOp, 8> worklist;
};

// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet.
270 void GpuAsyncRegionPass::runOnFunction() { 271 if (getFunction() 272 .getRegion() 273 .walk(ThreadTokenCallback(getContext())) 274 .wasInterrupted()) 275 return signalPassFailure(); 276 277 // Collect gpu.wait ops that we can move out of async.execute regions. 278 getFunction().getRegion().walk(DeferWaitCallback()); 279 } 280 281 std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() { 282 return std::make_unique<GpuAsyncRegionPass>(); 283 } 284