//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert the gpu.launch_func op into a
// sequence of GPU runtime calls. As most GPU runtimes do not have a stable
// published ABI, this pass uses a slim runtime layer that builds on top of
// the public API from GPU runtime headers.
//
//===----------------------------------------------------------------------===//
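// For illustration, the slim runtime layer referred to above consists of C
// wrappers around the driver API. The signatures below are a sketch derived
// from the call builders in this file, not normative declarations:
//
//   extern "C" void *mgpuModuleLoad(void *data);
//   extern "C" void *mgpuModuleGetFunction(void *module, const char *name);
//   extern "C" void *mgpuStreamCreate();
//   extern "C" void mgpuLaunchKernel(void *function, intptr_t gridX, ...);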
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"
#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

namespace {

class GpuToLLVMConversionPass
    : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
  GpuToLLVMConversionPass() = default;

  GpuToLLVMConversionPass(const GpuToLLVMConversionPass &other)
      : GpuToLLVMConversionPassBase(other) {}

  // Run the dialect converter on the module.
  void runOnOperation() override;

private:
  Option<std::string> gpuBinaryAnnotation{
      *this, "gpu-binary-annotation",
      llvm::cl::desc("Annotation attribute string for GPU binary"),
      llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
};
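// Usage sketch (the pass name and option spelling are assumed from the pass
// registration and may differ across revisions):
//
//   mlir-opt --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" in.mlir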
struct FunctionCallBuilder {
  FunctionCallBuilder(StringRef functionName, Type returnType,
                      ArrayRef<Type> argumentTypes)
      : functionName(functionName),
        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
  LLVM::CallOp create(Location loc, OpBuilder &builder,
                      ArrayRef<Value> arguments) const;

  StringRef functionName;
  LLVM::LLVMFunctionType functionType;
};

template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

protected:
  MLIRContext *context = &this->getTypeConverter()->getContext();

  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
  Type llvmPointerType =
      LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
  Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
  Type llvmInt8Type = IntegerType::get(context, 8);
  Type llvmInt32Type = IntegerType::get(context, 32);
  Type llvmInt64Type = IntegerType::get(context, 64);
  Type llvmIntPtrType = IntegerType::get(
      context, this->getTypeConverter()->getPointerBitwidth(0));

  FunctionCallBuilder moduleLoadCallBuilder = {
      "mgpuModuleLoad",
      llvmPointerType /* void *module */,
      {llvmPointerType /* void *cubin */}};
  FunctionCallBuilder moduleUnloadCallBuilder = {
      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
  FunctionCallBuilder moduleGetFunctionCallBuilder = {
      "mgpuModuleGetFunction",
      llvmPointerType /* void *function */,
      {
          llvmPointerType, /* void *module */
          llvmPointerType  /* char *name */
      }};
  FunctionCallBuilder launchKernelCallBuilder = {
      "mgpuLaunchKernel",
      llvmVoidType,
      {
          llvmPointerType,        /* void* f */
          llvmIntPtrType,         /* intptr_t gridXDim */
          llvmIntPtrType,         /* intptr_t gridYDim */
          llvmIntPtrType,         /* intptr_t gridZDim */
          llvmIntPtrType,         /* intptr_t blockXDim */
          llvmIntPtrType,         /* intptr_t blockYDim */
          llvmIntPtrType,         /* intptr_t blockZDim */
          llvmInt32Type,          /* unsigned int sharedMemBytes */
          llvmPointerType,        /* void *hstream */
          llvmPointerPointerType, /* void **kernelParams */
          llvmPointerPointerType  /* void **extra */
      }};
  FunctionCallBuilder streamCreateCallBuilder = {
      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
  FunctionCallBuilder streamDestroyCallBuilder = {
      "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamSynchronizeCallBuilder = {
      "mgpuStreamSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamWaitEventCallBuilder = {
      "mgpuStreamWaitEvent",
      llvmVoidType,
      {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
  FunctionCallBuilder eventCreateCallBuilder = {
      "mgpuEventCreate", llvmPointerType /* void *event */, {}};
  FunctionCallBuilder eventDestroyCallBuilder = {
      "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventSynchronizeCallBuilder = {
      "mgpuEventSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventRecordCallBuilder = {
      "mgpuEventRecord",
      llvmVoidType,
      {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder hostRegisterCallBuilder = {
      "mgpuMemHostRegisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder allocCallBuilder = {
      "mgpuMemAlloc",
      llvmPointerType /* void * */,
      {llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder deallocCallBuilder = {
      "mgpuMemFree",
      llvmVoidType,
      {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder memcpyCallBuilder = {
      "mgpuMemcpy",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
       llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
};
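// For illustration, a FunctionCallBuilder such as streamCreateCallBuilder
// above emits (declaring the function on first use) LLVM dialect IR along
// the lines of the following sketch; the SSA name is made up:
//
//   llvm.func @mgpuStreamCreate() -> !llvm.ptr<i8>
//   ...
//   %stream = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr<i8>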
/// A rewrite pattern to convert gpu.host_register operations into a GPU
/// runtime call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:
  ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertAllocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
public:
  ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertDeallocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
public:
  ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertAsyncYieldToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
public:
  ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * streamCreate -- creates a new compute stream for the launch
/// * launchKernel -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
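///
/// For illustration (the module and kernel names below are made up), a
/// typical source-level launch that this pattern rewrites looks like:
///
///   gpu.launch_func @kernels::@vecadd
///       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
///       args(%buf : memref<?xf32>)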
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp,
                            ArrayRef<Value> operands, OpBuilder &builder) const;
  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                   Location loc, OpBuilder &builder) const;

  LogicalResult
  matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;

  llvm::SmallString<32> gpuBinaryAnnotation;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
                                PatternRewriter &rewriter) const override {
    // GPU kernel modules are no longer necessary since we have a global
    // constant with the CUBIN or HSACO data.
    rewriter.eraseOp(op);
    return success();
  }
};

/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemcpyOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
public:
  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};
} // namespace

void GpuToLLVMConversionPass::runOnOperation() {
  LLVMTypeConverter converter(&getContext());
  RewritePatternSet patterns(&getContext());
  LLVMConversionTarget target(getContext());

  populateVectorToLLVMConversionPatterns(converter, patterns);
  populateStdToLLVMConversionPatterns(converter, patterns);
  populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                    target);

  converter.addConversion(
      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
      });
  patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
               ConvertDeallocOpToGpuRuntimeCallPattern,
               ConvertHostRegisterOpToGpuRuntimeCallPattern,
               ConvertMemcpyOpToGpuRuntimeCallPattern,
               ConvertWaitAsyncOpToGpuRuntimeCallPattern,
               ConvertWaitOpToGpuRuntimeCallPattern,
               ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(converter,
                                                           gpuBinaryAnnotation);
  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());

  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    signalPassFailure();
}
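// A minimal usage sketch of this pass from C++ driver code (the surrounding
// pipeline is assumed, not mandated by this file):
//
//   PassManager pm(&context);
//   pm.addPass(createGpuToLLVMConversionPass());
//   if (failed(pm.run(module)))
//     /* handle the error */;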
LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
                                         ArrayRef<Value> arguments) const {
  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
  auto function = [&] {
    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
      return function;
    return OpBuilder::atBlockEnd(module.getBody())
        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
  }();
  return builder.create<LLVM::CallOp>(
      loc, const_cast<LLVM::LLVMFunctionType &>(functionType).getReturnType(),
      builder.getSymbolRefAttr(function), arguments);
}

// Returns whether all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                     ConversionPatternRewriter &rewriter) {
  if (!llvm::all_of(operands, [](Value value) {
        return LLVM::isCompatibleType(value.getType());
      }))
    return rewriter.notifyMatchFailure(
        op, "Cannot convert if operands aren't of LLVM type.");
  return success();
}

static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
                         gpu::AsyncOpInterface op) {
  if (op.getAsyncDependencies().size() != 1)
    return rewriter.notifyMatchFailure(
        op, "Can only convert with exactly one async dependency.");

  if (!op.getAsyncToken())
    return rewriter.notifyMatchFailure(op, "Can convert only async version.");

  return success();
}
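// For illustration (SSA names made up), isAsyncWithOneDependency accepts the
// async form with a single dependency and rejects everything else:
//
//   %token = gpu.dealloc async [%dep] %memref : memref<?xf32>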
LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto *op = hostRegisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, operands, rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostRegisterOp.value().getType();
  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

  auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(),
                                                       operands, rewriter);
  arguments.push_back(elementSize);
  hostRegisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::AllocOp allocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  MemRefType memRefType = allocOp.getType();

  if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, allocOp)))
    return failure();

  auto loc = allocOp.getLoc();
  auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());

  // Get shape of the memref as values: static sizes are constant
  // values and dynamic sizes are passed to 'alloc' as operands.
  SmallVector<Value, 4> shape;
  SmallVector<Value, 4> strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
                           shape, strides, sizeBytes);

  // Allocate the underlying buffer and store a pointer to it in the MemRef
  // descriptor.
  Type elementPtrType = this->getElementPtrType(memRefType);
  auto stream = adaptor.asyncDependencies().front();
  Value allocatedPtr =
      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
  allocatedPtr =
      rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

  // No alignment.
  Value alignedPtr = allocatedPtr;

  // Create the MemRef descriptor.
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});

  return success();
}
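// For illustration (SSA names made up), the pattern above rewrites
//
//   %memref, %token = gpu.alloc async [%dep] (%n) : memref<?xf32>
//
// into %ptr = llvm.call @mgpuMemAlloc(%sizeBytes, %stream) plus the LLVM IR
// that populates the memref descriptor; the stream of the single dependency
// doubles as the returned async token.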
LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
    return failure();

  Location loc = deallocOp.getLoc();

  auto adaptor =
      gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary());
  Value pointer =
      MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
  auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
  Value stream = adaptor.asyncDependencies().front();
  deallocCallBuilder.create(loc, rewriter, {casted, stream});

  rewriter.replaceOp(deallocOp, {stream});
  return success();
}

static bool isGpuAsyncTokenType(Value value) {
  return value.getType().isa<gpu::AsyncTokenType>();
}

// Converts !gpu.async.token operands of `async.yield` to runtime calls. The
// !gpu.async.token operands are lowered to streams within the async.execute
// region, but are passed as events between regions. For each !gpu.async.token
// operand, we create an event and record it on the stream.
LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
    async::YieldOp yieldOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");

  Location loc = yieldOp.getLoc();
  SmallVector<Value, 4> newOperands(operands.begin(), operands.end());
  llvm::SmallDenseSet<Value> streams;
  for (auto &operand : yieldOp->getOpOperands()) {
    if (!isGpuAsyncTokenType(operand.get()))
      continue;
    auto idx = operand.getOperandNumber();
    auto stream = operands[idx];
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    newOperands[idx] = event;
    streams.insert(stream);
  }
  for (auto stream : streams)
    streamDestroyCallBuilder.create(loc, rewriter, {stream});

  rewriter.updateRootInPlace(yieldOp,
                             [&] { yieldOp->setOperands(newOperands); });
  return success();
}

// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
static bool isDefinedByCallTo(Value value, StringRef functionName) {
  assert(value.getType().isa<LLVM::LLVMPointerType>());
  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
    return defOp.callee()->equals(functionName);
  return false;
}

// Converts `gpu.wait` to runtime calls. The converted op synchronizes the
// host with the stream/event operands, which are then destroyed. That is, the
// lowering assumes the operands are not used afterwards or elsewhere;
// otherwise we will get a runtime error. Eventually, we should guarantee this
// property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");

  Location loc = waitOp.getLoc();

  for (auto operand : operands) {
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream.
      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
      streamDestroyCallBuilder.create(loc, rewriter, {operand});
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
      eventDestroyCallBuilder.create(loc, rewriter, {operand});
    }
  }

  rewriter.eraseOp(waitOp);
  return success();
}
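// For illustration (SSA names made up), a host-synchronous
//
//   gpu.wait [%token]
//
// becomes, when the lowered %token is a stream:
//
//   llvm.call @mgpuStreamSynchronize(%stream) : (!llvm.ptr<i8>) -> ()
//   llvm.call @mgpuStreamDestroy(%stream) : (!llvm.ptr<i8>) -> ()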
// Converts `gpu.wait async` to runtime calls. The converted op creates a new
// stream that is synchronized with the stream/event operands, which are then
// destroyed. That is, the lowering assumes the operands are not used
// afterwards or elsewhere; otherwise we will get a runtime error. Eventually,
// we should guarantee this property.
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (!waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");

  Location loc = waitOp.getLoc();

  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value, 1> events;
  for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
    auto operand = std::get<1>(pair);
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream. Insert an event
      // into the stream just after the last use of the original token operand.
      auto *defOp = std::get<0>(pair).getDefiningOp();
      rewriter.setInsertionPointAfter(defOp);
      auto event =
          eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
      events.push_back(event);
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      events.push_back(operand);
    }
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(waitOp, {stream});

  return success();
}

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    OpBuilder &builder) const {
  auto loc = launchOp.getLoc();
  auto numKernelOperands = launchOp.getNumKernelOperands();
  auto arguments = getTypeConverter()->promoteOperands(
      loc, launchOp.getOperands().take_back(numKernelOperands),
      operands.take_back(numKernelOperands), builder);
  auto numArguments = arguments.size();
  SmallVector<Type, 4> argumentTypes;
  argumentTypes.reserve(numArguments);
  for (auto argument : arguments)
    argumentTypes.push_back(argument.getType());
  auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
                                                           argumentTypes);
  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              builder.getI32IntegerAttr(1));
  auto structPtr = builder.create<LLVM::AllocaOp>(
      loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
  auto arraySize = builder.create<LLVM::ConstantOp>(
      loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
  auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
                                                 arraySize, /*alignment=*/0);
  auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               builder.getI32IntegerAttr(0));
  for (auto en : llvm::enumerate(arguments)) {
    auto index = builder.create<LLVM::ConstantOp>(
        loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
    auto fieldPtr = builder.create<LLVM::GEPOp>(
        loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
        ArrayRef<Value>{zero, index.getResult()});
    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
    auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
                                                  arrayPtr, index.getResult());
    auto casted =
        builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
    builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
  }
  return arrayPtr;
}

// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
//   %0 = llvm.addressof @kernel_name
//   %1 = llvm.constant (0 : index)
//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
    StringRef moduleName, StringRef name, Location loc,
    OpBuilder &builder) const {
  // Make sure the trailing zero is included in the constant.
  std::vector<char> kernelName(name.begin(), name.end());
  kernelName.push_back('\0');

  std::string globalName =
      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
  return LLVM::createGlobalString(
      loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
      LLVM::Linkage::Internal);
}
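// For illustration, for a kernel @vecadd inside a gpu.module @kernels (names
// made up), the helper above emits:
//
//   llvm.mlir.global internal constant @kernels_vecadd_kernel_name("vecadd\00")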
// Emits LLVM IR to launch a kernel function. Expects the kernel module that
// contains the compiled kernel function to carry the CUBIN in its
// 'nvvm.cubin' attribute, or the HSACO in its 'rocdl.hsaco' attribute.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(launchOp, operands, rewriter)))
    return failure();

  if (launchOp.asyncDependencies().size() > 1)
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert with more than one async dependency.");

  // Fail when the synchronous version of the op has async dependencies. The
  // lowering destroys the stream, and we do not want to check that there is
  // no use of the stream after this op.
  if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert non-async op with async dependencies.");

  Location loc = launchOp.getLoc();

  // Create an LLVM global with CUBIN extracted from the kernel annotation and
  // obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
      launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
    kernelModule.emitOpError()
        << "missing " << gpuBinaryAnnotation << " attribute";
    return failure();
  }

  SmallString<128> nameBuffer(kernelModule.getName());
  nameBuffer.append(kGpuBinaryStorageSuffix);
  Value data =
      LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
                               binaryAttr.getValue(), LLVM::Linkage::Internal);

  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
  // Get the function from the module. The name corresponds to the name of
  // the kernel function.
  auto kernelName = generateKernelNameConstant(
      launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter);
  auto function = moduleGetFunctionCallBuilder.create(
      loc, rewriter, {module.getResult(0), kernelName});
  auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                                rewriter.getI32IntegerAttr(0));
  auto adaptor =
      gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary());
  Value stream =
      adaptor.asyncDependencies().empty()
          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
          : adaptor.asyncDependencies().front();
  // Create array of pointers to kernel arguments.
  auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
  auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
  launchKernelCallBuilder.create(loc, rewriter,
                                 {function.getResult(0), launchOp.gridSizeX(),
                                  launchOp.gridSizeY(), launchOp.gridSizeZ(),
                                  launchOp.blockSizeX(), launchOp.blockSizeY(),
                                  launchOp.blockSizeZ(),
                                  /*sharedMemBytes=*/zero, stream, kernelParams,
                                  /*extra=*/nullpointer});

  if (launchOp.asyncToken()) {
    // Async launch: make dependent ops use the same stream.
    rewriter.replaceOp(launchOp, {stream});
  } else {
    // Synchronize with host and destroy stream. This must be the stream
    // created above (with no other uses) because we check that the
    // synchronous version does not have any async dependencies.
    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
    streamDestroyCallBuilder.create(loc, rewriter, stream);
    rewriter.eraseOp(launchOp);
  }
  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));

  return success();
}

LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memcpyOp.src().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
    return failure();

  auto loc = memcpyOp.getLoc();
  auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());

  MemRefDescriptor srcDesc(adaptor.src());

  Value numElements =
      memRefType.hasStaticShape()
          ? createIndexConstant(rewriter, loc, memRefType.getNumElements())
          // For identity layouts (verified above), the number of elements is
          // stride[0] * size[0].
          : rewriter.create<LLVM::MulOp>(loc, srcDesc.stride(rewriter, loc, 0),
                                         srcDesc.size(rewriter, loc, 0));

  Type elementPtrType = getElementPtrType(memRefType);
  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
  Value gepPtr = rewriter.create<LLVM::GEPOp>(
      loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
  auto sizeBytes =
      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

  auto src = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType,
      MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

  rewriter.replaceOp(memcpyOp, {stream});

  return success();
}
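// For illustration (SSA names made up), an async copy such as
//
//   %token = gpu.memcpy async [%dep] %dst, %src : memref<?xf32>, memref<?xf32>
//
// computes the byte size with a null-based GEP plus ptrtoint, and is then
// rewritten to llvm.call @mgpuMemcpy(%dst, %src, %sizeBytes, %stream).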
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass() {
  return std::make_unique<GpuToLLVMConversionPass>();
}