//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert the gpu.launch_func op into a
// sequence of GPU runtime calls. As most GPU runtimes do not have a stable
// published ABI, this pass uses a slim runtime layer that builds on top of
// the public API from GPU runtime headers.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"
#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

namespace {

class GpuToLLVMConversionPass
    : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
  GpuToLLVMConversionPass() = default;

  GpuToLLVMConversionPass(const GpuToLLVMConversionPass &other)
      : GpuToLLVMConversionPassBase(other) {}

  // Run the dialect converter on the module.
  void runOnOperation() override;

private:
  Option<std::string> gpuBinaryAnnotation{
      *this, "gpu-binary-annotation",
      llvm::cl::desc("Annotation attribute string for GPU binary"),
      llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
};

struct FunctionCallBuilder {
  FunctionCallBuilder(StringRef functionName, Type returnType,
                      ArrayRef<Type> argumentTypes)
      : functionName(functionName),
        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
  LLVM::CallOp create(Location loc, OpBuilder &builder,
                      ArrayRef<Value> arguments) const;

  StringRef functionName;
  LLVM::LLVMFunctionType functionType;
};

template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

protected:
  Value getNumElements(ConversionPatternRewriter &rewriter, Location loc,
                       MemRefType type, MemRefDescriptor desc) const {
    return type.hasStaticShape()
               ? ConvertToLLVMPattern::createIndexConstant(
                     rewriter, loc, type.getNumElements())
               // For identity maps (verified by caller), the number of
               // elements is stride[0] * size[0].
               : rewriter.create<LLVM::MulOp>(loc,
                                              desc.stride(rewriter, loc, 0),
                                              desc.size(rewriter, loc, 0));
  }

  MLIRContext *context = &this->getTypeConverter()->getContext();

  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
  Type llvmPointerType =
      LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
  Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
  Type llvmInt8Type = IntegerType::get(context, 8);
  Type llvmInt32Type = IntegerType::get(context, 32);
  Type llvmInt64Type = IntegerType::get(context, 64);
  Type llvmIntPtrType = IntegerType::get(
      context, this->getTypeConverter()->getPointerBitwidth(0));

  FunctionCallBuilder moduleLoadCallBuilder = {
      "mgpuModuleLoad",
      llvmPointerType /* void *module */,
      {llvmPointerType /* void *cubin */}};
  FunctionCallBuilder moduleUnloadCallBuilder = {
      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
  FunctionCallBuilder moduleGetFunctionCallBuilder = {
      "mgpuModuleGetFunction",
      llvmPointerType /* void *function */,
      {
          llvmPointerType, /* void *module */
          llvmPointerType  /* char *name */
      }};
  FunctionCallBuilder launchKernelCallBuilder = {
      "mgpuLaunchKernel",
      llvmVoidType,
      {
          llvmPointerType,        /* void *f */
          llvmIntPtrType,         /* intptr_t gridXDim */
          llvmIntPtrType,         /* intptr_t gridYDim */
          llvmIntPtrType,         /* intptr_t gridZDim */
          llvmIntPtrType,         /* intptr_t blockXDim */
          llvmIntPtrType,         /* intptr_t blockYDim */
          llvmIntPtrType,         /* intptr_t blockZDim */
          llvmInt32Type,          /* unsigned int sharedMemBytes */
          llvmPointerType,        /* void *hstream */
          llvmPointerPointerType, /* void **kernelParams */
          llvmPointerPointerType  /* void **extra */
      }};
  FunctionCallBuilder streamCreateCallBuilder = {
      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
  FunctionCallBuilder streamDestroyCallBuilder = {
      "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamSynchronizeCallBuilder = {
      "mgpuStreamSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamWaitEventCallBuilder = {
      "mgpuStreamWaitEvent",
      llvmVoidType,
      {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
  FunctionCallBuilder eventCreateCallBuilder = {
      "mgpuEventCreate", llvmPointerType /* void *event */, {}};
  FunctionCallBuilder eventDestroyCallBuilder = {
      "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventSynchronizeCallBuilder = {
      "mgpuEventSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventRecordCallBuilder = {
      "mgpuEventRecord",
      llvmVoidType,
      {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder hostRegisterCallBuilder = {
      "mgpuMemHostRegisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder allocCallBuilder = {
      "mgpuMemAlloc",
      llvmPointerType /* void * */,
      {llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder deallocCallBuilder = {
      "mgpuMemFree",
      llvmVoidType,
      {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder memcpyCallBuilder = {
175 "mgpuMemcpy", 176 llvmVoidType, 177 {llvmPointerType /* void *dst */, llvmPointerType /* void *src */, 178 llvmIntPtrType /* intptr_t sizeBytes */, 179 llvmPointerType /* void *stream */}}; 180 FunctionCallBuilder memsetCallBuilder = { 181 "mgpuMemset32", 182 llvmVoidType, 183 {llvmPointerType /* void *dst */, llvmInt32Type /* unsigned int value */, 184 llvmIntPtrType /* intptr_t sizeBytes */, 185 llvmPointerType /* void *stream */}}; 186 }; 187 188 /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime 189 /// call. Currently it supports CUDA and ROCm (HIP). 190 class ConvertHostRegisterOpToGpuRuntimeCallPattern 191 : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> { 192 public: 193 ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 194 : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {} 195 196 private: 197 LogicalResult 198 matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands, 199 ConversionPatternRewriter &rewriter) const override; 200 }; 201 202 /// A rewrite pattern to convert gpu.alloc operations into a GPU runtime 203 /// call. Currently it supports CUDA and ROCm (HIP). 204 class ConvertAllocOpToGpuRuntimeCallPattern 205 : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> { 206 public: 207 ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 208 : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {} 209 210 private: 211 LogicalResult 212 matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands, 213 ConversionPatternRewriter &rewriter) const override; 214 }; 215 216 /// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime 217 /// call. Currently it supports CUDA and ROCm (HIP). 218 class ConvertDeallocOpToGpuRuntimeCallPattern 219 : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> { 220 public: 221 ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 222 : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {} 223 224 private: 225 LogicalResult 226 matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands, 227 ConversionPatternRewriter &rewriter) const override; 228 }; 229 230 class ConvertAsyncYieldToGpuRuntimeCallPattern 231 : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> { 232 public: 233 ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 234 : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {} 235 236 private: 237 LogicalResult 238 matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands, 239 ConversionPatternRewriter &rewriter) const override; 240 }; 241 242 /// A rewrite pattern to convert gpu.wait operations into a GPU runtime 243 /// call. Currently it supports CUDA and ROCm (HIP). 244 class ConvertWaitOpToGpuRuntimeCallPattern 245 : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> { 246 public: 247 ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 248 : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {} 249 250 private: 251 LogicalResult 252 matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands, 253 ConversionPatternRewriter &rewriter) const override; 254 }; 255 256 /// A rewrite pattern to convert gpu.wait async operations into a GPU runtime 257 /// call. Currently it supports CUDA and ROCm (HIP). 
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad        -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * getStreamHelper   -- initializes a new compute stream on GPU
/// * launchKernel      -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp,
                            ArrayRef<Value> operands, OpBuilder &builder) const;
  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                   Location loc, OpBuilder &builder) const;

  LogicalResult
  matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;

  llvm::SmallString<32> gpuBinaryAnnotation;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
                                PatternRewriter &rewriter) const override {
    // GPU kernel modules are no longer necessary since we have a global
    // constant with the CUBIN or HSACO data.
    rewriter.eraseOp(op);
    return success();
  }
};

/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemcpyOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
public:
  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.memset operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertMemsetOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp> {
public:
  ConvertMemsetOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemsetOp memsetOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};
} // namespace

void GpuToLLVMConversionPass::runOnOperation() {
  LLVMTypeConverter converter(&getContext());
  RewritePatternSet patterns(&getContext());
  LLVMConversionTarget target(getContext());

  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<UnrealizedConversionCastOp>();

  populateVectorToLLVMConversionPatterns(converter, patterns);
  populateMemRefToLLVMConversionPatterns(converter, patterns);
  populateStdToLLVMConversionPatterns(converter, patterns);
  populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                    target);
  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    signalPassFailure();
}

LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
                                         ArrayRef<Value> arguments) const {
  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
  auto function = [&] {
    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
      return function;
    return OpBuilder::atBlockEnd(module.getBody())
        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
  }();
  return builder.create<LLVM::CallOp>(loc, function, arguments);
}

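// For illustration only (the exact IR depends on the surrounding module): a
// builder such as `streamCreateCallBuilder` above expands to roughly
//
//   llvm.func @mgpuStreamCreate() -> !llvm.ptr<i8>
//   ...
//   %stream = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr<i8>
//
// where the function declaration is only created if it is not already present
// in the module.
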
// Returns whether all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                     ConversionPatternRewriter &rewriter) {
  if (!llvm::all_of(operands, [](Value value) {
        return LLVM::isCompatibleType(value.getType());
      }))
    return rewriter.notifyMatchFailure(
        op, "Cannot convert if operands aren't of LLVM type.");
  return success();
}

static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
                         gpu::AsyncOpInterface op) {
  if (op.getAsyncDependencies().size() != 1)
    return rewriter.notifyMatchFailure(
        op, "Can only convert with exactly one async dependency.");

  if (!op.getAsyncToken())
    return rewriter.notifyMatchFailure(op, "Can convert only async version.");

  return success();
}

LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto *op = hostRegisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, operands, rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostRegisterOp.value().getType();
  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

  auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(),
                                                       operands, rewriter);
  arguments.push_back(elementSize);
  hostRegisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

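// As a rough sketch of the gpu.alloc lowering below (types, names, and exact
// IR are only indicative):
//
//   %memref, %t = gpu.alloc async [%dep] (%size) : memref<?xf32>
//
// becomes a call on the stream carried by the single async dependency,
//
//   %ptr = llvm.call @mgpuMemAlloc(%sizeBytes, %stream)
//       : (i64, !llvm.ptr<i8>) -> !llvm.ptr<i8>
//
// followed by the construction of a memref descriptor around %ptr; the stream
// itself is forwarded as the replacement of the async token.
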
LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::AllocOp allocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  MemRefType memRefType = allocOp.getType();

  if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, allocOp)))
    return failure();

  auto loc = allocOp.getLoc();
  auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());

  // Get shape of the memref as values: static sizes are constant
  // values and dynamic sizes are passed to 'alloc' as operands.
  SmallVector<Value, 4> shape;
  SmallVector<Value, 4> strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
                           shape, strides, sizeBytes);

  // Allocate the underlying buffer and store a pointer to it in the MemRef
  // descriptor.
  Type elementPtrType = this->getElementPtrType(memRefType);
  auto stream = adaptor.asyncDependencies().front();
  Value allocatedPtr =
      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
  allocatedPtr =
      rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

  // No alignment.
  Value alignedPtr = allocatedPtr;

  // Create the MemRef descriptor.
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});

  return success();
}

LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
    return failure();

  Location loc = deallocOp.getLoc();

  auto adaptor =
      gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary());
  Value pointer =
      MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
  auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
  Value stream = adaptor.asyncDependencies().front();
  deallocCallBuilder.create(loc, rewriter, {casted, stream});

  rewriter.replaceOp(deallocOp, {stream});
  return success();
}

static bool isGpuAsyncTokenType(Value value) {
  return value.getType().isa<gpu::AsyncTokenType>();
}

// Converts !gpu.async.token operands of `async.yield` to runtime calls. The
// !gpu.async.token values are lowered to streams within the async.execute
// region, but are passed between regions as events. For each !gpu.async.token
// operand, we create an event and record it on the stream.
LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
    async::YieldOp yieldOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");

  Location loc = yieldOp.getLoc();
  SmallVector<Value, 4> newOperands(operands.begin(), operands.end());
  llvm::SmallDenseSet<Value> streams;
  for (auto &operand : yieldOp->getOpOperands()) {
    if (!isGpuAsyncTokenType(operand.get()))
      continue;
    auto idx = operand.getOperandNumber();
    auto stream = operands[idx];
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    newOperands[idx] = event;
    streams.insert(stream);
  }
  for (auto stream : streams)
    streamDestroyCallBuilder.create(loc, rewriter, {stream});

  rewriter.updateRootInPlace(yieldOp,
                             [&] { yieldOp->setOperands(newOperands); });
  return success();
}

// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
static bool isDefinedByCallTo(Value value, StringRef functionName) {
  assert(value.getType().isa<LLVM::LLVMPointerType>());
  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
    return defOp.callee()->equals(functionName);
  return false;
}

// Converts `gpu.wait` to runtime calls. The converted op synchronizes the host
// with the stream/event operands. The operands are destroyed, i.e. it is
// assumed that they are not used afterwards or anywhere else. Otherwise we
// will get a runtime error. Eventually, we should guarantee this property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");

  Location loc = waitOp.getLoc();

  for (auto operand : operands) {
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream.
      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
      streamDestroyCallBuilder.create(loc, rewriter, {operand});
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
      eventDestroyCallBuilder.create(loc, rewriter, {operand});
    }
  }

  rewriter.eraseOp(waitOp);
  return success();
}

// Converts `gpu.wait async` to runtime calls. The converted op creates a new
// stream that is synchronized with the stream/event operands. The operands are
// destroyed, i.e. it is assumed that they are not used afterwards or anywhere
// else. Otherwise we will get a runtime error. Eventually, we should guarantee
// this property.
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (!waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");

  Location loc = waitOp.getLoc();

  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value, 1> events;
  for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
    auto operand = std::get<1>(pair);
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream. Insert an event
      // into the stream just after the last use of the original token operand.
      auto *defOp = std::get<0>(pair).getDefiningOp();
      rewriter.setInsertionPointAfter(defOp);
      auto event =
          eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
      events.push_back(event);
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      events.push_back(operand);
    }
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(waitOp, {stream});

  return success();
}

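// As a rough illustration of the two gpu.wait lowerings above (the exact calls
// depend on whether each operand was produced by mgpuStreamCreate or by
// mgpuEventCreate):
//
//   gpu.wait [%t]             // host-synchronous: mgpuStreamSynchronize +
//                             // mgpuStreamDestroy (or the event equivalents)
//   %t2 = gpu.wait async [%t] // asynchronous: record an event on the operand
//                             // stream, create a new stream, and make it wait
//                             // on that event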
// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    OpBuilder &builder) const {
  auto loc = launchOp.getLoc();
  auto numKernelOperands = launchOp.getNumKernelOperands();
  auto arguments = getTypeConverter()->promoteOperands(
      loc, launchOp.getOperands().take_back(numKernelOperands),
      operands.take_back(numKernelOperands), builder);
  auto numArguments = arguments.size();
  SmallVector<Type, 4> argumentTypes;
  argumentTypes.reserve(numArguments);
  for (auto argument : arguments)
    argumentTypes.push_back(argument.getType());
  auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
                                                           argumentTypes);
  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              builder.getI32IntegerAttr(1));
  auto structPtr = builder.create<LLVM::AllocaOp>(
      loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
  auto arraySize = builder.create<LLVM::ConstantOp>(
      loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
  auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
                                                 arraySize, /*alignment=*/0);
  auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               builder.getI32IntegerAttr(0));
  for (auto en : llvm::enumerate(arguments)) {
    auto index = builder.create<LLVM::ConstantOp>(
        loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
    auto fieldPtr = builder.create<LLVM::GEPOp>(
        loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
        ArrayRef<Value>{zero, index.getResult()});
    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
    auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
                                                  arrayPtr, index.getResult());
    auto casted =
        builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
    builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
  }
  return arrayPtr;
}

// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
//   %0 = llvm.addressof @kernel_name
//   %1 = llvm.constant (0 : index)
//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
    StringRef moduleName, StringRef name, Location loc,
    OpBuilder &builder) const {
  // Make sure the trailing zero is included in the constant.
  std::vector<char> kernelName(name.begin(), name.end());
  kernelName.push_back('\0');

  std::string globalName =
      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
  return LLVM::createGlobalString(
      loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
      LLVM::Linkage::Internal);
}

// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(launchOp, operands, rewriter)))
    return failure();

  if (launchOp.asyncDependencies().size() > 1)
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert with more than one async dependency.");

  // Fail when the synchronous version of the op has async dependencies. The
  // lowering destroys the stream, and we do not want to check that there is no
  // use of the stream after this op.
  if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert non-async op with async dependencies.");

  Location loc = launchOp.getLoc();

  // Create an LLVM global with CUBIN extracted from the kernel annotation and
  // obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
      launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
    kernelModule.emitOpError()
        << "missing " << gpuBinaryAnnotation << " attribute";
    return failure();
  }

  SmallString<128> nameBuffer(kernelModule.getName());
  nameBuffer.append(kGpuBinaryStorageSuffix);
  Value data =
      LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
                               binaryAttr.getValue(), LLVM::Linkage::Internal);

  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
  // Get the function from the module. The name corresponds to the name of
  // the kernel function.
  auto kernelName = generateKernelNameConstant(
      launchOp.getKernelModuleName().getValue(),
      launchOp.getKernelName().getValue(), loc, rewriter);
  auto function = moduleGetFunctionCallBuilder.create(
      loc, rewriter, {module.getResult(0), kernelName});
  auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                                rewriter.getI32IntegerAttr(0));
  auto adaptor =
      gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary());
  Value stream =
      adaptor.asyncDependencies().empty()
          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
          : adaptor.asyncDependencies().front();
  // Create array of pointers to kernel arguments.
  auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
  auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
  launchKernelCallBuilder.create(loc, rewriter,
                                 {function.getResult(0), launchOp.gridSizeX(),
                                  launchOp.gridSizeY(), launchOp.gridSizeZ(),
                                  launchOp.blockSizeX(), launchOp.blockSizeY(),
                                  launchOp.blockSizeZ(),
                                  /*sharedMemBytes=*/zero, stream, kernelParams,
                                  /*extra=*/nullpointer});

  if (launchOp.asyncToken()) {
    // Async launch: make dependent ops use the same stream.
    rewriter.replaceOp(launchOp, {stream});
  } else {
    // Synchronize with host and destroy stream. This must be the stream
    // created above (with no other uses) because we check that the synchronous
    // version does not have any async dependencies.
    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
    streamDestroyCallBuilder.create(loc, rewriter, stream);
    rewriter.eraseOp(launchOp);
  }
  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));

  return success();
}

LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memcpyOp.src().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
    return failure();

  auto loc = memcpyOp.getLoc();
  auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());

  MemRefDescriptor srcDesc(adaptor.src());
  Value numElements = getNumElements(rewriter, loc, memRefType, srcDesc);

  Type elementPtrType = getElementPtrType(memRefType);
  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
  Value gepPtr = rewriter.create<LLVM::GEPOp>(
      loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
  auto sizeBytes =
      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

  auto src = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType,
      MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

  rewriter.replaceOp(memcpyOp, {stream});

  return success();
}

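// As a rough illustration of the gpu.memcpy lowering above (types only
// indicative), an op such as
//
//   %t = gpu.memcpy async [%dep] %dst, %src : memref<?xf32>, memref<?xf32>
//
// turns into a byte-size computation from the source descriptor followed by
//
//   llvm.call @mgpuMemcpy(%dstPtr, %srcPtr, %sizeBytes, %stream)
//       : (!llvm.ptr<i8>, !llvm.ptr<i8>, i64, !llvm.ptr<i8>) -> ()
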
LogicalResult ConvertMemsetOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemsetOp memsetOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memsetOp.dst().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memsetOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memsetOp)))
    return failure();

  auto loc = memsetOp.getLoc();
  auto adaptor = gpu::MemsetOpAdaptor(operands, memsetOp->getAttrDictionary());

  Type valueType = adaptor.value().getType();
  if (!valueType.isIntOrFloat() || valueType.getIntOrFloatBitWidth() != 32) {
    return rewriter.notifyMatchFailure(memsetOp,
                                       "value must be a 32 bit scalar");
  }

  MemRefDescriptor dstDesc(adaptor.dst());
  Value numElements = getNumElements(rewriter, loc, memRefType, dstDesc);

  auto value =
      rewriter.create<LLVM::BitcastOp>(loc, llvmInt32Type, adaptor.value());
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, dstDesc.alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memsetCallBuilder.create(loc, rewriter, {dst, value, numElements, stream});

  rewriter.replaceOp(memsetOp, {stream});
  return success();
}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass() {
  return std::make_unique<GpuToLLVMConversionPass>();
}

void mlir::populateGpuToLLVMConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
    StringRef gpuBinaryAnnotation) {
  converter.addConversion(
      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
      });
  patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
               ConvertDeallocOpToGpuRuntimeCallPattern,
               ConvertHostRegisterOpToGpuRuntimeCallPattern,
               ConvertMemcpyOpToGpuRuntimeCallPattern,
               ConvertMemsetOpToGpuRuntimeCallPattern,
               ConvertWaitAsyncOpToGpuRuntimeCallPattern,
               ConvertWaitOpToGpuRuntimeCallPattern,
               ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(converter,
                                                           gpuBinaryAnnotation);
  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
}
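
// For reference only (not exercised in this file): a host compilation pipeline
// would typically either schedule the pass defined above, e.g.
//
//   pm.addPass(createGpuToLLVMConversionPass());
//
// or call populateGpuToLLVMConversionPatterns() directly to mix these patterns
// into a custom dialect conversion.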