//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert gpu.launch_func op into a sequence of
// GPU runtime calls. As most GPU runtimes do not have a stable published ABI,
// this pass uses a slim runtime layer that builds on top of the public API
// from GPU runtime headers.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"
#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

namespace {

class GpuToLLVMConversionPass
    : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
  GpuToLLVMConversionPass() = default;

  GpuToLLVMConversionPass(const GpuToLLVMConversionPass &other)
      : GpuToLLVMConversionPassBase(other) {}

  // Run the dialect converter on the module.
  void runOnOperation() override;

private:
  Option<std::string> gpuBinaryAnnotation{
      *this, "gpu-binary-annotation",
      llvm::cl::desc("Annotation attribute string for GPU binary"),
      llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
};

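// Helper that lazily declares a function of the slim runtime layer and emits
// calls to it. As an illustrative sketch (the runtime function `mgpuFoo` is
// hypothetical):
//
//   FunctionCallBuilder fooBuilder = {
//       "mgpuFoo", llvmVoidType, {llvmPointerType}};
//   fooBuilder.create(loc, builder, {ptr});
//
// emits `llvm.call @mgpuFoo(%ptr)` and, on first use, also inserts the
// matching `llvm.func @mgpuFoo` declaration at the end of the module.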
struct FunctionCallBuilder {
  FunctionCallBuilder(StringRef functionName, Type returnType,
                      ArrayRef<Type> argumentTypes)
      : functionName(functionName),
        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
  LLVM::CallOp create(Location loc, OpBuilder &builder,
                      ArrayRef<Value> arguments) const;

  StringRef functionName;
  LLVM::LLVMFunctionType functionType;
};

template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

protected:
  MLIRContext *context = &this->getTypeConverter()->getContext();

  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
  Type llvmPointerType =
      LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
  Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
  Type llvmInt8Type = IntegerType::get(context, 8);
  Type llvmInt32Type = IntegerType::get(context, 32);
  Type llvmInt64Type = IntegerType::get(context, 64);
  Type llvmIntPtrType = IntegerType::get(
      context, this->getTypeConverter()->getPointerBitwidth(0));

  FunctionCallBuilder moduleLoadCallBuilder = {
      "mgpuModuleLoad",
      llvmPointerType /* void *module */,
      {llvmPointerType /* void *cubin */}};
  FunctionCallBuilder moduleUnloadCallBuilder = {
      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
  FunctionCallBuilder moduleGetFunctionCallBuilder = {
      "mgpuModuleGetFunction",
      llvmPointerType /* void *function */,
      {
          llvmPointerType, /* void *module */
          llvmPointerType  /* char *name */
      }};
  FunctionCallBuilder launchKernelCallBuilder = {
      "mgpuLaunchKernel",
      llvmVoidType,
      {
          llvmPointerType,        /* void* f */
          llvmIntPtrType,         /* intptr_t gridXDim */
          llvmIntPtrType,         /* intptr_t gridYDim */
          llvmIntPtrType,         /* intptr_t gridZDim */
          llvmIntPtrType,         /* intptr_t blockXDim */
          llvmIntPtrType,         /* intptr_t blockYDim */
          llvmIntPtrType,         /* intptr_t blockZDim */
          llvmInt32Type,          /* unsigned int sharedMemBytes */
          llvmPointerType,        /* void *hstream */
          llvmPointerPointerType, /* void **kernelParams */
          llvmPointerPointerType  /* void **extra */
      }};
  FunctionCallBuilder streamCreateCallBuilder = {
      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
  FunctionCallBuilder streamDestroyCallBuilder = {
      "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamSynchronizeCallBuilder = {
      "mgpuStreamSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamWaitEventCallBuilder = {
      "mgpuStreamWaitEvent",
      llvmVoidType,
      {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
  FunctionCallBuilder eventCreateCallBuilder = {
      "mgpuEventCreate", llvmPointerType /* void *event */, {}};
  FunctionCallBuilder eventDestroyCallBuilder = {
      "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventSynchronizeCallBuilder = {
      "mgpuEventSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventRecordCallBuilder = {
      "mgpuEventRecord",
      llvmVoidType,
      {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder hostRegisterCallBuilder = {
      "mgpuMemHostRegisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder allocCallBuilder = {
      "mgpuMemAlloc",
      llvmPointerType /* void * */,
      {llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder deallocCallBuilder = {
      "mgpuMemFree",
      llvmVoidType,
      {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder memcpyCallBuilder = {
      "mgpuMemcpy",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
       llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
};

/// A rewrite pattern to convert gpu.host_register operations into a GPU
/// runtime call. Currently it supports CUDA and ROCm (HIP).
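///
/// For example (an illustrative sketch):
///
///   gpu.host_register %memref : memref<*xf32>
///
/// lowers to a call to `mgpuMemHostRegisterMemRef` taking the memref rank, a
/// pointer to the memref descriptor, and the element size in bytes.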
class ConvertHostRegisterOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:
  ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
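///
/// For example (an illustrative sketch):
///
///   %memref, %t1 = gpu.alloc async [%t0] (%size) : memref<?xf32>
///
/// lowers to an `mgpuMemAlloc` call on the stream that `%t0` lowered to,
/// followed by the construction of a memref descriptor around the returned
/// pointer.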
class ConvertAllocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
public:
  ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
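///
/// For example (an illustrative sketch):
///
///   %t1 = gpu.dealloc async [%t0] %memref : memref<?xf32>
///
/// lowers to an `mgpuMemFree` call on the allocated pointer of the memref
/// descriptor and the stream that `%t0` lowered to.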
class ConvertDeallocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
public:
  ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertAsyncYieldToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
public:
  ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad        -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * streamCreate      -- creates a new compute stream on the GPU
/// * launchKernel      -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp,
                            ArrayRef<Value> operands, OpBuilder &builder) const;
  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                   Location loc, OpBuilder &builder) const;

  LogicalResult
  matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;

  llvm::SmallString<32> gpuBinaryAnnotation;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
                                PatternRewriter &rewriter) const override {
    // GPU kernel modules are no longer necessary since we have a global
    // constant with the CUBIN, or HSACO data.
    rewriter.eraseOp(op);
    return success();
  }
};

/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
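///
/// For example (an illustrative sketch):
///
///   %t1 = gpu.memcpy async [%t0] %dst, %src : memref<?xf32>, memref<?xf32>
///
/// lowers to an `mgpuMemcpy` call with the aligned destination and source
/// pointers, the copy size in bytes, and the stream that `%t0` lowered to.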
class ConvertMemcpyOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
public:
  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};
} // namespace

void GpuToLLVMConversionPass::runOnOperation() {
  LLVMTypeConverter converter(&getContext());
  RewritePatternSet patterns(&getContext());
  LLVMConversionTarget target(getContext());

  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<UnrealizedConversionCastOp>();

  populateVectorToLLVMConversionPatterns(converter, patterns);
  populateMemRefToLLVMConversionPatterns(converter, patterns);
  populateStdToLLVMConversionPatterns(converter, patterns);
  populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                    target);
  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    signalPassFailure();
}

LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
                                         ArrayRef<Value> arguments) const {
  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
  auto function = [&] {
    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
      return function;
    return OpBuilder::atBlockEnd(module.getBody())
        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
  }();
  return builder.create<LLVM::CallOp>(loc, function, arguments);
}

// Returns success if all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                     ConversionPatternRewriter &rewriter) {
  if (!llvm::all_of(operands, [](Value value) {
        return LLVM::isCompatibleType(value.getType());
      }))
    return rewriter.notifyMatchFailure(
        op, "Cannot convert if operands aren't of LLVM type.");
  return success();
}

static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
                         gpu::AsyncOpInterface op) {
  if (op.getAsyncDependencies().size() != 1)
    return rewriter.notifyMatchFailure(
        op, "Can only convert with exactly one async dependency.");

  if (!op.getAsyncToken())
    return rewriter.notifyMatchFailure(op,
                                       "Can only convert the async version.");

  return success();
}

LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto *op = hostRegisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, operands, rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostRegisterOp.value().getType();
  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

  auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(),
                                                       operands, rewriter);
  arguments.push_back(elementSize);
  hostRegisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::AllocOp allocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  MemRefType memRefType = allocOp.getType();

  if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, allocOp)))
    return failure();

  auto loc = allocOp.getLoc();
  auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());

  // Get shape of the memref as values: static sizes are constant
  // values and dynamic sizes are passed to 'alloc' as operands.
  SmallVector<Value, 4> shape;
  SmallVector<Value, 4> strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
                           shape, strides, sizeBytes);

  // Allocate the underlying buffer and store a pointer to it in the MemRef
  // descriptor.
  Type elementPtrType = this->getElementPtrType(memRefType);
  auto stream = adaptor.asyncDependencies().front();
  Value allocatedPtr =
      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
  allocatedPtr =
      rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

  // No alignment.
  Value alignedPtr = allocatedPtr;

  // Create the MemRef descriptor.
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});

  return success();
}

LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
    return failure();

  Location loc = deallocOp.getLoc();

  auto adaptor =
      gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary());
  Value pointer =
      MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
  auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
  Value stream = adaptor.asyncDependencies().front();
  deallocCallBuilder.create(loc, rewriter, {casted, stream});

  rewriter.replaceOp(deallocOp, {stream});
  return success();
}

static bool isGpuAsyncTokenType(Value value) {
  return value.getType().isa<gpu::AsyncTokenType>();
}

// Converts !gpu.async.token operands of `async.yield` to runtime calls. The
// !gpu.async.token values are lowered to streams within the async.execute
// region, but are passed as events between regions. For each !gpu.async.token
// operand, we create an event and record it on the stream.
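//
// For example (an illustrative sketch):
//
//   async.yield %token : !gpu.async.token
//
// becomes an `mgpuEventCreate` plus `mgpuEventRecord` on the stream that
// `%token` lowered to; the event is yielded instead and the stream is
// destroyed.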
LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
    async::YieldOp yieldOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");

  Location loc = yieldOp.getLoc();
  SmallVector<Value, 4> newOperands(operands.begin(), operands.end());
  llvm::SmallDenseSet<Value> streams;
  for (auto &operand : yieldOp->getOpOperands()) {
    if (!isGpuAsyncTokenType(operand.get()))
      continue;
    auto idx = operand.getOperandNumber();
    auto stream = operands[idx];
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    newOperands[idx] = event;
    streams.insert(stream);
  }
  for (auto stream : streams)
    streamDestroyCallBuilder.create(loc, rewriter, {stream});

  rewriter.updateRootInPlace(yieldOp,
                             [&] { yieldOp->setOperands(newOperands); });
  return success();
}

// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
static bool isDefinedByCallTo(Value value, StringRef functionName) {
  assert(value.getType().isa<LLVM::LLVMPointerType>());
  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
    return defOp.callee()->equals(functionName);
  return false;
}

// Converts `gpu.wait` to runtime calls. The converted op synchronizes the host
// with the stream/event operands. The operands are destroyed, i.e. the
// lowering assumes they are not used afterwards or elsewhere; otherwise we
// will get a runtime error. Eventually, we should guarantee this property.
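//
// For example (an illustrative sketch), `gpu.wait [%t0]`, where `%t0` lowered
// to a stream, becomes:
//
//   llvm.call @mgpuStreamSynchronize(%stream)
//   llvm.call @mgpuStreamDestroy(%stream)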
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");

  Location loc = waitOp.getLoc();

  for (auto operand : operands) {
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream.
      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
      streamDestroyCallBuilder.create(loc, rewriter, {operand});
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
      eventDestroyCallBuilder.create(loc, rewriter, {operand});
    }
  }

  rewriter.eraseOp(waitOp);
  return success();
}

// Converts `gpu.wait async` to runtime calls. The converted op creates a new
// stream that is synchronized with the stream/event operands. The operands
// are destroyed, i.e. the lowering assumes they are not used afterwards or
// elsewhere; otherwise we will get a runtime error. Eventually, we should
// guarantee this property.
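//
// For example (an illustrative sketch), `%t1 = gpu.wait async [%t0]`, where
// `%t0` lowered to an event, becomes:
//
//   %stream = llvm.call @mgpuStreamCreate()
//   llvm.call @mgpuStreamWaitEvent(%stream, %event)
//   llvm.call @mgpuEventDestroy(%event)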
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (!waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");

  Location loc = waitOp.getLoc();

  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value, 1> events;
  for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
    auto operand = std::get<1>(pair);
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream. Insert an event
      // into the stream just after the last use of the original token operand.
      auto *defOp = std::get<0>(pair).getDefiningOp();
      rewriter.setInsertionPointAfter(defOp);
      auto event =
          eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
      events.push_back(event);
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      events.push_back(operand);
    }
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(waitOp, {stream});

  return success();
}

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
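//
// For example (illustrative): a kernel taking a single `memref<?xf32>`
// argument receives, after descriptor promotion, five scalar fields
// (allocated pointer, aligned pointer, offset, one size, one stride), so
// NumParameters is 5 and the returned array holds five type-erased pointers.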
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    OpBuilder &builder) const {
  auto loc = launchOp.getLoc();
  auto numKernelOperands = launchOp.getNumKernelOperands();
  auto arguments = getTypeConverter()->promoteOperands(
      loc, launchOp.getOperands().take_back(numKernelOperands),
      operands.take_back(numKernelOperands), builder);
  auto numArguments = arguments.size();
  SmallVector<Type, 4> argumentTypes;
  argumentTypes.reserve(numArguments);
  for (auto argument : arguments)
    argumentTypes.push_back(argument.getType());
  auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
                                                           argumentTypes);
  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              builder.getI32IntegerAttr(1));
  auto structPtr = builder.create<LLVM::AllocaOp>(
      loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
  auto arraySize = builder.create<LLVM::ConstantOp>(
      loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
  auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
                                                 arraySize, /*alignment=*/0);
  auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               builder.getI32IntegerAttr(0));
  for (auto en : llvm::enumerate(arguments)) {
    auto index = builder.create<LLVM::ConstantOp>(
        loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
    auto fieldPtr = builder.create<LLVM::GEPOp>(
        loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
        ArrayRef<Value>{zero, index.getResult()});
    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
    auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
                                                  arrayPtr, index.getResult());
    auto casted =
        builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
    builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
  }
  return arrayPtr;
}

// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
//   %0 = llvm.addressof @kernel_name
//   %1 = llvm.constant (0 : index)
//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
    StringRef moduleName, StringRef name, Location loc,
    OpBuilder &builder) const {
  // Make sure the trailing zero is included in the constant.
  std::vector<char> kernelName(name.begin(), name.end());
  kernelName.push_back('\0');

  std::string globalName =
      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
  return LLVM::createGlobalString(
      loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
      LLVM::Linkage::Internal);
}

// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
// hsaco in the 'rocdl.hsaco' attribute, of the kernel module in the IR.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
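//
// For example (schematic; operand types and the exact op syntax are
// abbreviated), the input might look like:
//
//   gpu.launch_func @kernel_module::@kernel
//       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
//       args(%arg0 : f32)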
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(launchOp, operands, rewriter)))
    return failure();

  if (launchOp.asyncDependencies().size() > 1)
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert with more than one async dependency.");

  // Fail when the synchronous version of the op has async dependencies. The
  // lowering destroys the stream, and we do not want to check that there is no
  // use of the stream after this op.
  if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert non-async op with async dependencies.");

  Location loc = launchOp.getLoc();

  // Create an LLVM global with the CUBIN extracted from the kernel annotation
  // and obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
      launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
    kernelModule.emitOpError()
        << "missing " << gpuBinaryAnnotation << " attribute";
    return failure();
  }

  SmallString<128> nameBuffer(kernelModule.getName());
  nameBuffer.append(kGpuBinaryStorageSuffix);
  Value data =
      LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
                               binaryAttr.getValue(), LLVM::Linkage::Internal);

  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
  // Get the function from the module. The name corresponds to the name of
  // the kernel function.
  auto kernelName = generateKernelNameConstant(
      launchOp.getKernelModuleName().getValue(),
      launchOp.getKernelName().getValue(), loc, rewriter);
  auto function = moduleGetFunctionCallBuilder.create(
      loc, rewriter, {module.getResult(0), kernelName});
  auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                                rewriter.getI32IntegerAttr(0));
  auto adaptor =
      gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary());
  Value stream =
      adaptor.asyncDependencies().empty()
          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
          : adaptor.asyncDependencies().front();
  // Create array of pointers to kernel arguments.
  auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
  auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
  launchKernelCallBuilder.create(loc, rewriter,
                                 {function.getResult(0), launchOp.gridSizeX(),
                                  launchOp.gridSizeY(), launchOp.gridSizeZ(),
                                  launchOp.blockSizeX(), launchOp.blockSizeY(),
                                  launchOp.blockSizeZ(),
                                  /*sharedMemBytes=*/zero, stream, kernelParams,
                                  /*extra=*/nullpointer});

  if (launchOp.asyncToken()) {
    // Async launch: make dependent ops use the same stream.
    rewriter.replaceOp(launchOp, {stream});
  } else {
    // Synchronize with host and destroy stream. This must be the stream
    // created above (with no other uses) because we check that the synchronous
    // version does not have any async dependencies.
    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
    streamDestroyCallBuilder.create(loc, rewriter, stream);
    rewriter.eraseOp(launchOp);
  }
  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));

  return success();
}

LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memcpyOp.src().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
    return failure();

  auto loc = memcpyOp.getLoc();
  auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());

  MemRefDescriptor srcDesc(adaptor.src());

  Value numElements =
      memRefType.hasStaticShape()
          ? createIndexConstant(rewriter, loc, memRefType.getNumElements())
          // For identity layouts (verified above), the number of elements is
          // stride[0] * size[0].
          : rewriter.create<LLVM::MulOp>(loc, srcDesc.stride(rewriter, loc, 0),
                                         srcDesc.size(rewriter, loc, 0));
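  // Compute the size in bytes using the classic LLVM "sizeof" idiom: GEP one
  // element past a null pointer of the element type and ptrtoint the result.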
  Type elementPtrType = getElementPtrType(memRefType);
  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
  Value gepPtr = rewriter.create<LLVM::GEPOp>(
      loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
  auto sizeBytes =
      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

  auto src = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType,
      MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

  rewriter.replaceOp(memcpyOp, {stream});

  return success();
}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass() {
  return std::make_unique<GpuToLLVMConversionPass>();
}

void mlir::populateGpuToLLVMConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
    StringRef gpuBinaryAnnotation) {
  converter.addConversion(
      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
      });
  patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
               ConvertDeallocOpToGpuRuntimeCallPattern,
               ConvertHostRegisterOpToGpuRuntimeCallPattern,
               ConvertMemcpyOpToGpuRuntimeCallPattern,
               ConvertWaitAsyncOpToGpuRuntimeCallPattern,
               ConvertWaitOpToGpuRuntimeCallPattern,
               ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(converter,
                                                           gpuBinaryAnnotation);
  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
}