//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect pattern rewriters that make GPU op
// within a region execute asynchronously.
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/GPU/Utils.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"

using namespace mlir;
namespace {
// Function pass that rewrites synchronous GPU ops within a function body into
// their asynchronous form, threading !gpu.async.token values between them.
// The two nested callback structs implement the two walk phases (see
// runOnFunction).
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
  struct ThreadTokenCallback;
  struct DeferWaitCallback;
  void runOnFunction() override;
};
} // namespace

// Returns true if `op` might be a block terminator. Uses mightHaveTrait so
// unregistered ops conservatively count as possible terminators.
static bool isTerminator(Operation *op) {
  return op->mightHaveTrait<OpTrait::IsTerminator>();
}

// Returns true unless `op` is provably free of memory effects. Ops without a
// MemoryEffectOpInterface are conservatively treated as having side effects.
static bool hasSideEffects(Operation *op) {
  return !MemoryEffectOpInterface::hasNoEffect(op);
}

// Region walk callback which makes GPU ops implementing the AsyncOpInterface
// execute asynchronously.
struct GpuAsyncRegionPass::ThreadTokenCallback {
  ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

  // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
  // create a current token (unless it already exists), and 'thread' that token
  // through the `op` so that it executes asynchronously.
  //
  // If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
  // host-synchronize execution. A `!gpu.async.token` will therefore only be
  // used inside of its block and GPU execution will always synchronize with
  // the host at block boundaries.
  WalkResult operator()(Operation *op) {
    // gpu.launch must have been outlined to gpu.launch_func before this pass;
    // it has a region and cannot be cloned by rewriteAsyncOp below.
    if (isa<gpu::LaunchOp>(op))
      return op->emitOpError("replace with gpu.launch_func first");
    // This pass assumes it creates all gpu.wait ops itself; a pre-existing one
    // would break the single-current-token invariant.
    if (isa<gpu::WaitOp>(op))
      return op->emitOpError("unexpected pre-existing gpu.wait");
    builder.setInsertionPoint(op);
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
      return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
    if (!currentToken)
      return success();
    // Insert host synchronization before terminator or op with side effects.
    if (isTerminator(op) || hasSideEffects(op))
      currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
    return success();
  }

private:
  // Replaces asyncOp with a clone that returns a token.
  LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
    auto *op = asyncOp.getOperation();
    if (asyncOp.getAsyncToken())
      // TODO: Support ops that are already async.
      return op->emitOpError("is already async");
    // Operation::create below does not clone regions, so ops with regions are
    // rejected rather than silently dropped.
    if (op->getNumRegions() > 0)
      return op->emitOpError("regions are not supported");

    auto tokenType = builder.getType<gpu::AsyncTokenType>();

    // If there is no current token, insert a `gpu.wait async` without
    // dependencies to create one.
    if (!currentToken)
      currentToken = createWaitOp(op->getLoc(), tokenType, {});
    asyncOp.addAsyncDependency(currentToken);

    // Clone the op to return a token in addition to the other results. The
    // token is appended as the trailing result.
    SmallVector<Type, 1> resultTypes;
    resultTypes.reserve(1 + op->getNumResults());
    copy(op->getResultTypes(), std::back_inserter(resultTypes));
    resultTypes.push_back(tokenType);
    auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
                                    op->getOperands(), op->getAttrDictionary(),
                                    op->getSuccessors());

    // Replace the op with the async clone. The clone's trailing result becomes
    // the new current token; all other results stand in for the originals.
    auto results = newOp->getResults();
    currentToken = results.back();
    builder.insert(newOp);
    op->replaceAllUsesWith(results.drop_back());
    op->erase();

    return success();
  }

  // Creates a gpu.wait op; `resultType` is the token type for the async
  // variant, or a null Type for the host-synchronizing variant.
  Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
    return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
  }

  OpBuilder builder;

  // The token that represents the current asynchronous dependency. Its valid
  // range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
  // In between, each gpu::AsyncOpInterface depends on the current token and
  // produces the new one.
  Value currentToken = {};
};

// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
struct GpuAsyncRegionPass::DeferWaitCallback {
  // If the `executeOp`s token is used only in `async.execute` or `async.await`
  // ops, add the region's last `gpu.wait` op to the worklist if it is
  // synchronous and is the last op with side effects.
  void operator()(async::ExecuteOp executeOp) {
    if (!areAllUsersExecuteOrAwait(executeOp.token()))
      return;
    // async.execute's region is currently restricted to one block.
    // Walk backwards from the terminator: the first side-effecting op must be
    // a synchronous gpu.wait for the deferral to be safe.
    for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
      if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
        // Only the synchronous form (no token result) is deferrable.
        if (!waitOp.asyncToken())
          worklist.push_back(waitOp);
        return;
      }
      if (hasSideEffects(&op))
        return;
    }
  }

  // The destructor performs the actual rewrite work.
  ~DeferWaitCallback() {
    // Index-based loop: addAsyncDependencyAfter may append to the worklist
    // while we iterate, so iterators must not be cached.
    for (size_t i = 0; i < worklist.size(); ++i) {
      auto waitOp = worklist[i];
      auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();
      auto numDependencies = waitOp.asyncDependencies().size();

      // Erase `gpu.wait` and return async dependencies from region instead.
      auto &yieldOp = executeOp.getBody()->getOperations().back();
      yieldOp.insertOperands(yieldOp.getNumOperands(),
                             waitOp.asyncDependencies());
      waitOp.erase();
      auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);

      // Add the async dependency to each user of the `async.execute` token.
      for (Operation *user : executeOp.token().getUsers())
        addAsyncDependencyAfter(asyncTokens, user);
    }
  }

private:
  // Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
  // `executeOp` is taken by reference and updated to point at the clone.
  static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
                                         unsigned count) {
    auto numResults = executeOp.getNumResults() + count;

    // Construct new result type list with `count` additional types.
    SmallVector<Type, 2> resultTypes;
    resultTypes.reserve(numResults);
    transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
              [](Type type) {
                // Extract value type from !async.value (the ExecuteOp builder
                // re-wraps plain types into !async.value itself).
                if (auto valueType = type.dyn_cast<async::ValueType>())
                  return valueType.getValueType();
                assert(type.isa<async::TokenType>() && "expected token type");
                return type;
              });
    OpBuilder builder(executeOp);
    auto tokenType = builder.getType<gpu::AsyncTokenType>();
    resultTypes.resize(numResults, tokenType);

    // Clone executeOp with the extra `!gpu.async.token` results. The leading
    // !async.token result is implicit and must not be passed to the builder.
    auto newOp = builder.create<async::ExecuteOp>(
        executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
        executeOp.dependencies(), executeOp.operands());
    BlockAndValueMapping mapper;
    // Drop the builder-created entry block before cloning the original region.
    newOp.getRegion().getBlocks().clear();
    executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);

    // Replace executeOp with cloned one. Only the original results are
    // replaced; the `count` trailing results are new and have no users yet.
    executeOp.getOperation()->replaceAllUsesWith(
        newOp.getResults().drop_back(count));
    executeOp.erase();
    executeOp = newOp;

    // Return the new result values.
    return executeOp.getResults().take_back(count);
  }

  // Returns whether all token users are either 'async.execute' or 'async.await'
  // ops. This is used as a requirement for pushing 'gpu.wait' ops from a
  // 'async.execute' body to its users. Specifically, we do not allow
  // terminator users, because it could mean that the `async.execute` is inside
  // control flow code.
  static bool areAllUsersExecuteOrAwait(Value token) {
    return llvm::all_of(token.getUsers(), [](Operation *user) {
      return isa<async::ExecuteOp, async::AwaitOp>(user);
    });
  }

  // Add the `asyncToken` as dependency as needed after `op`.
  void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
    OpBuilder builder(op->getContext());
    auto loc = op->getLoc();

    // `it` marks where the dependency needs to be materialized; `tokens` holds
    // the !gpu.async.token values visible at that point.
    Block::iterator it;
    SmallVector<Value, 1> tokens;
    tokens.reserve(asyncTokens.size());
    TypeSwitch<Operation *>(op)
        .Case<async::AwaitOp>([&](auto awaitOp) {
          // Add async.await ops to wait for the !gpu.async.tokens.
          builder.setInsertionPointAfter(op);
          for (auto asyncToken : asyncTokens)
            tokens.push_back(
                builder.create<async::AwaitOp>(loc, asyncToken).result());
          // Set `it` after the inserted async.await ops.
          it = builder.getInsertionPoint();
        })
        .Case<async::ExecuteOp>([&](auto executeOp) {
          // Set `it` to the beginning of the region and add asyncTokens to the
          // async.execute operands; the tokens become block arguments of the
          // region's entry block.
          it = executeOp.getBody()->begin();
          executeOp.operandsMutable().append(asyncTokens);
          SmallVector<Type, 1> tokenTypes(
              asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
          copy(executeOp.getBody()->addArguments(tokenTypes),
               std::back_inserter(tokens));
        });

    // Advance `it` to terminator or op with side-effects. The block always has
    // a terminator, so the search cannot run off the end.
    it = std::find_if(it, Block::iterator(), [](Operation &op) {
      return isTerminator(&op) || hasSideEffects(&op);
    });

    // If `op` implements the AsyncOpInterface, add `token` to the list of async
    // dependencies.
    if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
      for (auto token : tokens)
        asyncOp.addAsyncDependency(token);
      return;
    }

    // Otherwise, insert a gpu.wait before 'it'.
    builder.setInsertionPoint(it->getBlock(), it);
    auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);

    // If the new waitOp is at the end of an async.execute region, add it to the
    // worklist. 'operator()(executeOp)' would do the same, but this is faster.
    auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
    if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
        !it->getNextNode())
      worklist.push_back(waitOp);
  }

  SmallVector<gpu::WaitOp, 8> worklist;
};

// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet.
274 void GpuAsyncRegionPass::runOnFunction() { 275 if (getFunction() 276 .getRegion() 277 .walk(ThreadTokenCallback(getContext())) 278 .wasInterrupted()) 279 return signalPassFailure(); 280 281 // Collect gpu.wait ops that we can move out of async.execute regions. 282 getFunction().getRegion().walk(DeferWaitCallback()); 283 } 284 285 std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() { 286 return std::make_unique<GpuAsyncRegionPass>(); 287 } 288