1 //===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a pass to convert gpu.launch_func op into a sequence of 10 // GPU runtime calls. As most of GPU runtimes does not have a stable published 11 // ABI, this pass uses a slim runtime layer that builds on top of the public 12 // API from GPU runtime headers. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" 17 18 #include "../PassDetail.h" 19 #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" 20 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" 21 #include "mlir/Conversion/LLVMCommon/Pattern.h" 22 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" 23 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" 24 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" 25 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" 26 #include "mlir/Dialect/Async/IR/Async.h" 27 #include "mlir/Dialect/GPU/GPUDialect.h" 28 #include "mlir/Dialect/GPU/Passes.h" 29 #include "mlir/Dialect/LLVMIR/LLVMDialect.h" 30 #include "mlir/IR/Attributes.h" 31 #include "mlir/IR/Builders.h" 32 #include "mlir/IR/BuiltinOps.h" 33 #include "mlir/IR/BuiltinTypes.h" 34 35 #include "llvm/ADT/STLExtras.h" 36 #include "llvm/Support/Error.h" 37 #include "llvm/Support/FormatVariadic.h" 38 39 using namespace mlir; 40 41 static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; 42 43 namespace { 44 45 class GpuToLLVMConversionPass 46 : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> { 47 public: 48 GpuToLLVMConversionPass() = default; 49 50 GpuToLLVMConversionPass(const GpuToLLVMConversionPass &other) 51 : GpuToLLVMConversionPassBase(other) {} 52 53 // Run the dialect converter on the module. 54 void runOnOperation() override; 55 56 private: 57 Option<std::string> gpuBinaryAnnotation{ 58 *this, "gpu-binary-annotation", 59 llvm::cl::desc("Annotation attribute string for GPU binary"), 60 llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())}; 61 }; 62 63 struct FunctionCallBuilder { 64 FunctionCallBuilder(StringRef functionName, Type returnType, 65 ArrayRef<Type> argumentTypes) 66 : functionName(functionName), 67 functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {} 68 LLVM::CallOp create(Location loc, OpBuilder &builder, 69 ArrayRef<Value> arguments) const; 70 71 StringRef functionName; 72 LLVM::LLVMFunctionType functionType; 73 }; 74 75 template <typename OpTy> 76 class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { 77 public: 78 explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 79 : ConvertOpToLLVMPattern<OpTy>(typeConverter) {} 80 81 protected: 82 Value getNumElements(ConversionPatternRewriter &rewriter, Location loc, 83 MemRefType type, MemRefDescriptor desc) const { 84 return type.hasStaticShape() 85 ? ConvertToLLVMPattern::createIndexConstant( 86 rewriter, loc, type.getNumElements()) 87 // For identity maps (verified by caller), the number of 88 // elements is stride[0] * size[0]. 89 : rewriter.create<LLVM::MulOp>(loc, 90 desc.stride(rewriter, loc, 0), 91 desc.size(rewriter, loc, 0)); 92 } 93 94 MLIRContext *context = &this->getTypeConverter()->getContext(); 95 96 Type llvmVoidType = LLVM::LLVMVoidType::get(context); 97 Type llvmPointerType = 98 LLVM::LLVMPointerType::get(IntegerType::get(context, 8)); 99 Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType); 100 Type llvmInt8Type = IntegerType::get(context, 8); 101 Type llvmInt32Type = IntegerType::get(context, 32); 102 Type llvmInt64Type = IntegerType::get(context, 64); 103 Type llvmIntPtrType = IntegerType::get( 104 context, this->getTypeConverter()->getPointerBitwidth(0)); 105 106 FunctionCallBuilder moduleLoadCallBuilder = { 107 "mgpuModuleLoad", 108 llvmPointerType /* void *module */, 109 {llvmPointerType /* void *cubin */}}; 110 FunctionCallBuilder moduleUnloadCallBuilder = { 111 "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}}; 112 FunctionCallBuilder moduleGetFunctionCallBuilder = { 113 "mgpuModuleGetFunction", 114 llvmPointerType /* void *function */, 115 { 116 llvmPointerType, /* void *module */ 117 llvmPointerType /* char *name */ 118 }}; 119 FunctionCallBuilder launchKernelCallBuilder = { 120 "mgpuLaunchKernel", 121 llvmVoidType, 122 { 123 llvmPointerType, /* void* f */ 124 llvmIntPtrType, /* intptr_t gridXDim */ 125 llvmIntPtrType, /* intptr_t gridyDim */ 126 llvmIntPtrType, /* intptr_t gridZDim */ 127 llvmIntPtrType, /* intptr_t blockXDim */ 128 llvmIntPtrType, /* intptr_t blockYDim */ 129 llvmIntPtrType, /* intptr_t blockZDim */ 130 llvmInt32Type, /* unsigned int sharedMemBytes */ 131 llvmPointerType, /* void *hstream */ 132 llvmPointerPointerType, /* void **kernelParams */ 133 llvmPointerPointerType /* void **extra */ 134 }}; 135 FunctionCallBuilder streamCreateCallBuilder = { 136 "mgpuStreamCreate", llvmPointerType /* void *stream */, {}}; 137 FunctionCallBuilder streamDestroyCallBuilder = { 138 "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}}; 139 FunctionCallBuilder streamSynchronizeCallBuilder = { 140 "mgpuStreamSynchronize", 141 llvmVoidType, 142 {llvmPointerType /* void *stream */}}; 143 FunctionCallBuilder streamWaitEventCallBuilder = { 144 "mgpuStreamWaitEvent", 145 llvmVoidType, 146 {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}}; 147 FunctionCallBuilder eventCreateCallBuilder = { 148 "mgpuEventCreate", llvmPointerType /* void *event */, {}}; 149 FunctionCallBuilder eventDestroyCallBuilder = { 150 "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}}; 151 FunctionCallBuilder eventSynchronizeCallBuilder = { 152 "mgpuEventSynchronize", 153 llvmVoidType, 154 {llvmPointerType /* void *event */}}; 155 FunctionCallBuilder eventRecordCallBuilder = { 156 "mgpuEventRecord", 157 llvmVoidType, 158 {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}}; 159 FunctionCallBuilder hostRegisterCallBuilder = { 160 "mgpuMemHostRegisterMemRef", 161 llvmVoidType, 162 {llvmIntPtrType /* intptr_t rank */, 163 llvmPointerType /* void *memrefDesc */, 164 llvmIntPtrType /* intptr_t elementSizeBytes */}}; 165 FunctionCallBuilder allocCallBuilder = { 166 "mgpuMemAlloc", 167 llvmPointerType /* void * */, 168 {llvmIntPtrType /* intptr_t sizeBytes */, 169 llvmPointerType /* void *stream */}}; 170 FunctionCallBuilder deallocCallBuilder = { 171 "mgpuMemFree", 172 llvmVoidType, 173 {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}}; 174 FunctionCallBuilder memcpyCallBuilder = { 175 "mgpuMemcpy", 176 llvmVoidType, 177 {llvmPointerType /* void *dst */, llvmPointerType /* void *src */, 178 llvmIntPtrType /* intptr_t sizeBytes */, 179 llvmPointerType /* void *stream */}}; 180 FunctionCallBuilder memsetCallBuilder = { 181 "mgpuMemset32", 182 llvmVoidType, 183 {llvmPointerType /* void *dst */, llvmInt32Type /* unsigned int value */, 184 llvmIntPtrType /* intptr_t sizeBytes */, 185 llvmPointerType /* void *stream */}}; 186 }; 187 188 /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime 189 /// call. Currently it supports CUDA and ROCm (HIP). 190 class ConvertHostRegisterOpToGpuRuntimeCallPattern 191 : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> { 192 public: 193 ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 194 : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {} 195 196 private: 197 LogicalResult 198 matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands, 199 ConversionPatternRewriter &rewriter) const override; 200 }; 201 202 /// A rewrite pattern to convert gpu.alloc operations into a GPU runtime 203 /// call. Currently it supports CUDA and ROCm (HIP). 204 class ConvertAllocOpToGpuRuntimeCallPattern 205 : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> { 206 public: 207 ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 208 : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {} 209 210 private: 211 LogicalResult 212 matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands, 213 ConversionPatternRewriter &rewriter) const override; 214 }; 215 216 /// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime 217 /// call. Currently it supports CUDA and ROCm (HIP). 218 class ConvertDeallocOpToGpuRuntimeCallPattern 219 : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> { 220 public: 221 ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 222 : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {} 223 224 private: 225 LogicalResult 226 matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands, 227 ConversionPatternRewriter &rewriter) const override; 228 }; 229 230 class ConvertAsyncYieldToGpuRuntimeCallPattern 231 : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> { 232 public: 233 ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 234 : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {} 235 236 private: 237 LogicalResult 238 matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands, 239 ConversionPatternRewriter &rewriter) const override; 240 }; 241 242 /// A rewrite pattern to convert gpu.wait operations into a GPU runtime 243 /// call. Currently it supports CUDA and ROCm (HIP). 244 class ConvertWaitOpToGpuRuntimeCallPattern 245 : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> { 246 public: 247 ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 248 : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {} 249 250 private: 251 LogicalResult 252 matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands, 253 ConversionPatternRewriter &rewriter) const override; 254 }; 255 256 /// A rewrite pattern to convert gpu.wait async operations into a GPU runtime 257 /// call. Currently it supports CUDA and ROCm (HIP). 258 class ConvertWaitAsyncOpToGpuRuntimeCallPattern 259 : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> { 260 public: 261 ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 262 : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {} 263 264 private: 265 LogicalResult 266 matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands, 267 ConversionPatternRewriter &rewriter) const override; 268 }; 269 270 /// A rewrite patter to convert gpu.launch_func operations into a sequence of 271 /// GPU runtime calls. Currently it supports CUDA and ROCm (HIP). 272 /// 273 /// In essence, a gpu.launch_func operations gets compiled into the following 274 /// sequence of runtime calls: 275 /// 276 /// * moduleLoad -- loads the module given the cubin / hsaco data 277 /// * moduleGetFunction -- gets a handle to the actual kernel function 278 /// * getStreamHelper -- initializes a new compute stream on GPU 279 /// * launchKernel -- launches the kernel on a stream 280 /// * streamSynchronize -- waits for operations on the stream to finish 281 /// 282 /// Intermediate data structures are allocated on the stack. 283 class ConvertLaunchFuncOpToGpuRuntimeCallPattern 284 : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> { 285 public: 286 ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter, 287 StringRef gpuBinaryAnnotation) 288 : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter), 289 gpuBinaryAnnotation(gpuBinaryAnnotation) {} 290 291 private: 292 Value generateParamsArray(gpu::LaunchFuncOp launchOp, 293 ArrayRef<Value> operands, OpBuilder &builder) const; 294 Value generateKernelNameConstant(StringRef moduleName, StringRef name, 295 Location loc, OpBuilder &builder) const; 296 297 LogicalResult 298 matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands, 299 ConversionPatternRewriter &rewriter) const override; 300 301 llvm::SmallString<32> gpuBinaryAnnotation; 302 }; 303 304 class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> { 305 using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern; 306 307 LogicalResult matchAndRewrite(gpu::GPUModuleOp op, 308 PatternRewriter &rewriter) const override { 309 // GPU kernel modules are no longer necessary since we have a global 310 // constant with the CUBIN, or HSACO data. 311 rewriter.eraseOp(op); 312 return success(); 313 } 314 }; 315 316 /// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime 317 /// call. Currently it supports CUDA and ROCm (HIP). 318 class ConvertMemcpyOpToGpuRuntimeCallPattern 319 : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> { 320 public: 321 ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 322 : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {} 323 324 private: 325 LogicalResult 326 matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands, 327 ConversionPatternRewriter &rewriter) const override; 328 }; 329 330 /// A rewrite pattern to convert gpu.memset operations into a GPU runtime 331 /// call. Currently it supports CUDA and ROCm (HIP). 332 class ConvertMemsetOpToGpuRuntimeCallPattern 333 : public ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp> { 334 public: 335 ConvertMemsetOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) 336 : ConvertOpToGpuRuntimeCallPattern<gpu::MemsetOp>(typeConverter) {} 337 338 private: 339 LogicalResult 340 matchAndRewrite(gpu::MemsetOp memsetOp, ArrayRef<Value> operands, 341 ConversionPatternRewriter &rewriter) const override; 342 }; 343 } // namespace 344 345 void GpuToLLVMConversionPass::runOnOperation() { 346 LLVMTypeConverter converter(&getContext()); 347 RewritePatternSet patterns(&getContext()); 348 LLVMConversionTarget target(getContext()); 349 350 target.addIllegalDialect<gpu::GPUDialect>(); 351 352 populateVectorToLLVMConversionPatterns(converter, patterns); 353 populateMemRefToLLVMConversionPatterns(converter, patterns); 354 populateStdToLLVMConversionPatterns(converter, patterns); 355 populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, 356 target); 357 populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation); 358 359 if (failed( 360 applyPartialConversion(getOperation(), target, std::move(patterns)))) 361 signalPassFailure(); 362 } 363 364 LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder, 365 ArrayRef<Value> arguments) const { 366 auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>(); 367 auto function = [&] { 368 if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName)) 369 return function; 370 return OpBuilder::atBlockEnd(module.getBody()) 371 .create<LLVM::LLVMFuncOp>(loc, functionName, functionType); 372 }(); 373 return builder.create<LLVM::CallOp>(loc, function, arguments); 374 } 375 376 // Returns whether all operands are of LLVM type. 377 static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands, 378 ConversionPatternRewriter &rewriter) { 379 if (!llvm::all_of(operands, [](Value value) { 380 return LLVM::isCompatibleType(value.getType()); 381 })) 382 return rewriter.notifyMatchFailure( 383 op, "Cannot convert if operands aren't of LLVM type."); 384 return success(); 385 } 386 387 static LogicalResult 388 isAsyncWithOneDependency(ConversionPatternRewriter &rewriter, 389 gpu::AsyncOpInterface op) { 390 if (op.getAsyncDependencies().size() != 1) 391 return rewriter.notifyMatchFailure( 392 op, "Can only convert with exactly one async dependency."); 393 394 if (!op.getAsyncToken()) 395 return rewriter.notifyMatchFailure(op, "Can convert only async version."); 396 397 return success(); 398 } 399 400 LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite( 401 gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands, 402 ConversionPatternRewriter &rewriter) const { 403 auto *op = hostRegisterOp.getOperation(); 404 if (failed(areAllLLVMTypes(op, operands, rewriter))) 405 return failure(); 406 407 Location loc = op->getLoc(); 408 409 auto memRefType = hostRegisterOp.value().getType(); 410 auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType(); 411 auto elementSize = getSizeInBytes(loc, elementType, rewriter); 412 413 auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(), 414 operands, rewriter); 415 arguments.push_back(elementSize); 416 hostRegisterCallBuilder.create(loc, rewriter, arguments); 417 418 rewriter.eraseOp(op); 419 return success(); 420 } 421 422 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( 423 gpu::AllocOp allocOp, ArrayRef<Value> operands, 424 ConversionPatternRewriter &rewriter) const { 425 MemRefType memRefType = allocOp.getType(); 426 427 if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) || 428 !isConvertibleAndHasIdentityMaps(memRefType) || 429 failed(isAsyncWithOneDependency(rewriter, allocOp))) 430 return failure(); 431 432 auto loc = allocOp.getLoc(); 433 auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary()); 434 435 // Get shape of the memref as values: static sizes are constant 436 // values and dynamic sizes are passed to 'alloc' as operands. 437 SmallVector<Value, 4> shape; 438 SmallVector<Value, 4> strides; 439 Value sizeBytes; 440 getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter, 441 shape, strides, sizeBytes); 442 443 // Allocate the underlying buffer and store a pointer to it in the MemRef 444 // descriptor. 445 Type elementPtrType = this->getElementPtrType(memRefType); 446 auto stream = adaptor.asyncDependencies().front(); 447 Value allocatedPtr = 448 allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0); 449 allocatedPtr = 450 rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr); 451 452 // No alignment. 453 Value alignedPtr = allocatedPtr; 454 455 // Create the MemRef descriptor. 456 auto memRefDescriptor = this->createMemRefDescriptor( 457 loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter); 458 459 rewriter.replaceOp(allocOp, {memRefDescriptor, stream}); 460 461 return success(); 462 } 463 464 LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite( 465 gpu::DeallocOp deallocOp, ArrayRef<Value> operands, 466 ConversionPatternRewriter &rewriter) const { 467 if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) || 468 failed(isAsyncWithOneDependency(rewriter, deallocOp))) 469 return failure(); 470 471 Location loc = deallocOp.getLoc(); 472 473 auto adaptor = 474 gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary()); 475 Value pointer = 476 MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc); 477 auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer); 478 Value stream = adaptor.asyncDependencies().front(); 479 deallocCallBuilder.create(loc, rewriter, {casted, stream}); 480 481 rewriter.replaceOp(deallocOp, {stream}); 482 return success(); 483 } 484 485 static bool isGpuAsyncTokenType(Value value) { 486 return value.getType().isa<gpu::AsyncTokenType>(); 487 } 488 489 // Converts !gpu.async.token operands of `async.yield` to runtime calls. The 490 // !gpu.async.token are lowered to stream within the async.execute region, but 491 // are passed as events between them. For each !gpu.async.token operand, we 492 // create an event and record it on the stream. 493 LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite( 494 async::YieldOp yieldOp, ArrayRef<Value> operands, 495 ConversionPatternRewriter &rewriter) const { 496 if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType)) 497 return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand"); 498 499 Location loc = yieldOp.getLoc(); 500 SmallVector<Value, 4> newOperands(operands.begin(), operands.end()); 501 llvm::SmallDenseSet<Value> streams; 502 for (auto &operand : yieldOp->getOpOperands()) { 503 if (!isGpuAsyncTokenType(operand.get())) 504 continue; 505 auto idx = operand.getOperandNumber(); 506 auto stream = operands[idx]; 507 auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0); 508 eventRecordCallBuilder.create(loc, rewriter, {event, stream}); 509 newOperands[idx] = event; 510 streams.insert(stream); 511 } 512 for (auto stream : streams) 513 streamDestroyCallBuilder.create(loc, rewriter, {stream}); 514 515 rewriter.updateRootInPlace(yieldOp, 516 [&] { yieldOp->setOperands(newOperands); }); 517 return success(); 518 } 519 520 // Returns whether `value` is the result of an LLVM::CallOp to `functionName`. 521 static bool isDefinedByCallTo(Value value, StringRef functionName) { 522 assert(value.getType().isa<LLVM::LLVMPointerType>()); 523 if (auto defOp = value.getDefiningOp<LLVM::CallOp>()) 524 return defOp.callee()->equals(functionName); 525 return false; 526 } 527 528 // Converts `gpu.wait` to runtime calls. The converted op synchronizes the host 529 // with the stream/event operands. The operands are destroyed. That is, it 530 // assumes that it is not used afterwards or elsewhere. Otherwise we will get a 531 // runtime error. Eventually, we should guarantee this property. 532 LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite( 533 gpu::WaitOp waitOp, ArrayRef<Value> operands, 534 ConversionPatternRewriter &rewriter) const { 535 if (waitOp.asyncToken()) 536 return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op."); 537 538 Location loc = waitOp.getLoc(); 539 540 for (auto operand : operands) { 541 if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) { 542 // The converted operand's definition created a stream. 543 streamSynchronizeCallBuilder.create(loc, rewriter, {operand}); 544 streamDestroyCallBuilder.create(loc, rewriter, {operand}); 545 } else { 546 // Otherwise the converted operand is an event. This assumes that we use 547 // events in control flow code as well. 548 eventSynchronizeCallBuilder.create(loc, rewriter, {operand}); 549 eventDestroyCallBuilder.create(loc, rewriter, {operand}); 550 } 551 } 552 553 rewriter.eraseOp(waitOp); 554 return success(); 555 } 556 557 // Converts `gpu.wait async` to runtime calls. The converted op creates a new 558 // stream that is synchronized with stream/event operands. The operands are 559 // destroyed. That is, it assumes that it is not used afterwards or elsewhere. 560 // Otherwise we will get a runtime error. Eventually, we should guarantee this 561 // property. 562 LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite( 563 gpu::WaitOp waitOp, ArrayRef<Value> operands, 564 ConversionPatternRewriter &rewriter) const { 565 if (!waitOp.asyncToken()) 566 return rewriter.notifyMatchFailure(waitOp, "Can only convert async op."); 567 568 Location loc = waitOp.getLoc(); 569 570 auto insertionPoint = rewriter.saveInsertionPoint(); 571 SmallVector<Value, 1> events; 572 for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) { 573 auto operand = std::get<1>(pair); 574 if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) { 575 // The converted operand's definition created a stream. Insert an event 576 // into the stream just after the last use of the original token operand. 577 auto *defOp = std::get<0>(pair).getDefiningOp(); 578 rewriter.setInsertionPointAfter(defOp); 579 auto event = 580 eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0); 581 eventRecordCallBuilder.create(loc, rewriter, {event, operand}); 582 events.push_back(event); 583 } else { 584 // Otherwise the converted operand is an event. This assumes that we use 585 // events in control flow code as well. 586 events.push_back(operand); 587 } 588 } 589 rewriter.restoreInsertionPoint(insertionPoint); 590 auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0); 591 for (auto event : events) 592 streamWaitEventCallBuilder.create(loc, rewriter, {stream, event}); 593 for (auto event : events) 594 eventDestroyCallBuilder.create(loc, rewriter, {event}); 595 rewriter.replaceOp(waitOp, {stream}); 596 597 return success(); 598 } 599 600 // Creates a struct containing all kernel parameters on the stack and returns 601 // an array of type-erased pointers to the fields of the struct. The array can 602 // then be passed to the CUDA / ROCm (HIP) kernel launch calls. 603 // The generated code is essentially as follows: 604 // 605 // %struct = alloca(sizeof(struct { Parameters... })) 606 // %array = alloca(NumParameters * sizeof(void *)) 607 // for (i : [0, NumParameters)) 608 // %fieldPtr = llvm.getelementptr %struct[0, i] 609 // llvm.store parameters[i], %fieldPtr 610 // %elementPtr = llvm.getelementptr %array[i] 611 // llvm.store %fieldPtr, %elementPtr 612 // return %array 613 Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray( 614 gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands, 615 OpBuilder &builder) const { 616 auto loc = launchOp.getLoc(); 617 auto numKernelOperands = launchOp.getNumKernelOperands(); 618 auto arguments = getTypeConverter()->promoteOperands( 619 loc, launchOp.getOperands().take_back(numKernelOperands), 620 operands.take_back(numKernelOperands), builder); 621 auto numArguments = arguments.size(); 622 SmallVector<Type, 4> argumentTypes; 623 argumentTypes.reserve(numArguments); 624 for (auto argument : arguments) 625 argumentTypes.push_back(argument.getType()); 626 auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(), 627 argumentTypes); 628 auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, 629 builder.getI32IntegerAttr(1)); 630 auto structPtr = builder.create<LLVM::AllocaOp>( 631 loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0); 632 auto arraySize = builder.create<LLVM::ConstantOp>( 633 loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments)); 634 auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType, 635 arraySize, /*alignment=*/0); 636 auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, 637 builder.getI32IntegerAttr(0)); 638 for (auto en : llvm::enumerate(arguments)) { 639 auto index = builder.create<LLVM::ConstantOp>( 640 loc, llvmInt32Type, builder.getI32IntegerAttr(en.index())); 641 auto fieldPtr = builder.create<LLVM::GEPOp>( 642 loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr, 643 ArrayRef<Value>{zero, index.getResult()}); 644 builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr); 645 auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType, 646 arrayPtr, index.getResult()); 647 auto casted = 648 builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr); 649 builder.create<LLVM::StoreOp>(loc, casted, elementPtr); 650 } 651 return arrayPtr; 652 } 653 654 // Generates an LLVM IR dialect global that contains the name of the given 655 // kernel function as a C string, and returns a pointer to its beginning. 656 // The code is essentially: 657 // 658 // llvm.global constant @kernel_name("function_name\00") 659 // func(...) { 660 // %0 = llvm.addressof @kernel_name 661 // %1 = llvm.constant (0 : index) 662 // %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*"> 663 // } 664 Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant( 665 StringRef moduleName, StringRef name, Location loc, 666 OpBuilder &builder) const { 667 // Make sure the trailing zero is included in the constant. 668 std::vector<char> kernelName(name.begin(), name.end()); 669 kernelName.push_back('\0'); 670 671 std::string globalName = 672 std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name)); 673 return LLVM::createGlobalString( 674 loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()), 675 LLVM::Linkage::Internal); 676 } 677 678 // Emits LLVM IR to launch a kernel function. Expects the module that contains 679 // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a 680 // hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. 681 // 682 // %0 = call %binarygetter 683 // %1 = call %moduleLoad(%0) 684 // %2 = <see generateKernelNameConstant> 685 // %3 = call %moduleGetFunction(%1, %2) 686 // %4 = call %streamCreate() 687 // %5 = <see generateParamsArray> 688 // call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr) 689 // call %streamSynchronize(%4) 690 // call %streamDestroy(%4) 691 // call %moduleUnload(%1) 692 // 693 // If the op is async, the stream corresponds to the (single) async dependency 694 // as well as the async token the op produces. 695 LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( 696 gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands, 697 ConversionPatternRewriter &rewriter) const { 698 if (failed(areAllLLVMTypes(launchOp, operands, rewriter))) 699 return failure(); 700 701 if (launchOp.asyncDependencies().size() > 1) 702 return rewriter.notifyMatchFailure( 703 launchOp, "Cannot convert with more than one async dependency."); 704 705 // Fail when the synchronous version of the op has async dependencies. The 706 // lowering destroys the stream, and we do not want to check that there is no 707 // use of the stream after this op. 708 if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty()) 709 return rewriter.notifyMatchFailure( 710 launchOp, "Cannot convert non-async op with async dependencies."); 711 712 Location loc = launchOp.getLoc(); 713 714 // Create an LLVM global with CUBIN extracted from the kernel annotation and 715 // obtain a pointer to the first byte in it. 716 auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>( 717 launchOp, launchOp.getKernelModuleName()); 718 assert(kernelModule && "expected a kernel module"); 719 720 auto binaryAttr = 721 kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation); 722 if (!binaryAttr) { 723 kernelModule.emitOpError() 724 << "missing " << gpuBinaryAnnotation << " attribute"; 725 return failure(); 726 } 727 728 SmallString<128> nameBuffer(kernelModule.getName()); 729 nameBuffer.append(kGpuBinaryStorageSuffix); 730 Value data = 731 LLVM::createGlobalString(loc, rewriter, nameBuffer.str(), 732 binaryAttr.getValue(), LLVM::Linkage::Internal); 733 734 auto module = moduleLoadCallBuilder.create(loc, rewriter, data); 735 // Get the function from the module. The name corresponds to the name of 736 // the kernel function. 737 auto kernelName = generateKernelNameConstant( 738 launchOp.getKernelModuleName().getValue(), 739 launchOp.getKernelName().getValue(), loc, rewriter); 740 auto function = moduleGetFunctionCallBuilder.create( 741 loc, rewriter, {module.getResult(0), kernelName}); 742 auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type, 743 rewriter.getI32IntegerAttr(0)); 744 auto adaptor = 745 gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary()); 746 Value stream = 747 adaptor.asyncDependencies().empty() 748 ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0) 749 : adaptor.asyncDependencies().front(); 750 // Create array of pointers to kernel arguments. 751 auto kernelParams = generateParamsArray(launchOp, operands, rewriter); 752 auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType); 753 launchKernelCallBuilder.create(loc, rewriter, 754 {function.getResult(0), adaptor.gridSizeX(), 755 adaptor.gridSizeY(), adaptor.gridSizeZ(), 756 adaptor.blockSizeX(), adaptor.blockSizeY(), 757 adaptor.blockSizeZ(), 758 /*sharedMemBytes=*/zero, stream, kernelParams, 759 /*extra=*/nullpointer}); 760 761 if (launchOp.asyncToken()) { 762 // Async launch: make dependent ops use the same stream. 763 rewriter.replaceOp(launchOp, {stream}); 764 } else { 765 // Synchronize with host and destroy stream. This must be the stream created 766 // above (with no other uses) because we check that the synchronous version 767 // does not have any async dependencies. 768 streamSynchronizeCallBuilder.create(loc, rewriter, stream); 769 streamDestroyCallBuilder.create(loc, rewriter, stream); 770 rewriter.eraseOp(launchOp); 771 } 772 moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0)); 773 774 return success(); 775 } 776 777 LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite( 778 gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands, 779 ConversionPatternRewriter &rewriter) const { 780 auto memRefType = memcpyOp.src().getType().cast<MemRefType>(); 781 782 if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) || 783 !isConvertibleAndHasIdentityMaps(memRefType) || 784 failed(isAsyncWithOneDependency(rewriter, memcpyOp))) 785 return failure(); 786 787 auto loc = memcpyOp.getLoc(); 788 auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary()); 789 790 MemRefDescriptor srcDesc(adaptor.src()); 791 Value numElements = getNumElements(rewriter, loc, memRefType, srcDesc); 792 793 Type elementPtrType = getElementPtrType(memRefType); 794 Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType); 795 Value gepPtr = rewriter.create<LLVM::GEPOp>( 796 loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements}); 797 auto sizeBytes = 798 rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr); 799 800 auto src = rewriter.create<LLVM::BitcastOp>( 801 loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc)); 802 auto dst = rewriter.create<LLVM::BitcastOp>( 803 loc, llvmPointerType, 804 MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc)); 805 806 auto stream = adaptor.asyncDependencies().front(); 807 memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream}); 808 809 rewriter.replaceOp(memcpyOp, {stream}); 810 811 return success(); 812 } 813 814 LogicalResult ConvertMemsetOpToGpuRuntimeCallPattern::matchAndRewrite( 815 gpu::MemsetOp memsetOp, ArrayRef<Value> operands, 816 ConversionPatternRewriter &rewriter) const { 817 auto memRefType = memsetOp.dst().getType().cast<MemRefType>(); 818 819 if (failed(areAllLLVMTypes(memsetOp, operands, rewriter)) || 820 !isConvertibleAndHasIdentityMaps(memRefType) || 821 failed(isAsyncWithOneDependency(rewriter, memsetOp))) 822 return failure(); 823 824 auto loc = memsetOp.getLoc(); 825 auto adaptor = gpu::MemsetOpAdaptor(operands, memsetOp->getAttrDictionary()); 826 827 Type valueType = adaptor.value().getType(); 828 if (!valueType.isIntOrFloat() || valueType.getIntOrFloatBitWidth() != 32) { 829 return rewriter.notifyMatchFailure(memsetOp, 830 "value must be a 32 bit scalar"); 831 } 832 833 MemRefDescriptor dstDesc(adaptor.dst()); 834 Value numElements = getNumElements(rewriter, loc, memRefType, dstDesc); 835 836 auto value = 837 rewriter.create<LLVM::BitcastOp>(loc, llvmInt32Type, adaptor.value()); 838 auto dst = rewriter.create<LLVM::BitcastOp>( 839 loc, llvmPointerType, dstDesc.alignedPtr(rewriter, loc)); 840 841 auto stream = adaptor.asyncDependencies().front(); 842 memsetCallBuilder.create(loc, rewriter, {dst, value, numElements, stream}); 843 844 rewriter.replaceOp(memsetOp, {stream}); 845 return success(); 846 } 847 848 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>> 849 mlir::createGpuToLLVMConversionPass() { 850 return std::make_unique<GpuToLLVMConversionPass>(); 851 } 852 853 void mlir::populateGpuToLLVMConversionPatterns( 854 LLVMTypeConverter &converter, OwningRewritePatternList &patterns, 855 StringRef gpuBinaryAnnotation) { 856 converter.addConversion( 857 [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type { 858 return LLVM::LLVMPointerType::get(IntegerType::get(context, 8)); 859 }); 860 patterns.add<ConvertAllocOpToGpuRuntimeCallPattern, 861 ConvertDeallocOpToGpuRuntimeCallPattern, 862 ConvertHostRegisterOpToGpuRuntimeCallPattern, 863 ConvertMemcpyOpToGpuRuntimeCallPattern, 864 ConvertMemsetOpToGpuRuntimeCallPattern, 865 ConvertWaitAsyncOpToGpuRuntimeCallPattern, 866 ConvertWaitOpToGpuRuntimeCallPattern, 867 ConvertAsyncYieldToGpuRuntimeCallPattern>(converter); 868 patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(converter, 869 gpuBinaryAnnotation); 870 patterns.add<EraseGpuModuleOpPattern>(&converter.getContext()); 871 } 872