//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert the gpu.launch_func op into a
// sequence of GPU runtime calls. As most GPU runtimes do not have a stable
// published ABI, this pass uses a slim runtime layer that builds on top of
// the public API from GPU runtime headers.
//
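// For illustration, the generated code declares and calls C wrapper functions
// such as the following (this pass models every runtime handle as an opaque
// void *; the concrete wrappers, e.g. MLIR's CUDA and ROCm runtime wrapper
// libraries, use the corresponding driver handle types):
//
//   void *mgpuModuleLoad(void *data);
//   void *mgpuStreamCreate();
//   void mgpuStreamSynchronize(void *stream);
//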
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"
#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

namespace {

class GpuToLLVMConversionPass
    : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
  GpuToLLVMConversionPass() = default;

  GpuToLLVMConversionPass(const GpuToLLVMConversionPass &other)
      : GpuToLLVMConversionPassBase(other) {}

  // Run the dialect converter on the module.
  void runOnOperation() override;

private:
  Option<std::string> gpuBinaryAnnotation{
      *this, "gpu-binary-annotation",
      llvm::cl::desc("Annotation attribute string for GPU binary"),
      llvm::cl::init(gpu::getDefaultGpuBinaryAnnotation())};
};

struct FunctionCallBuilder {
  FunctionCallBuilder(StringRef functionName, Type returnType,
                      ArrayRef<Type> argumentTypes)
      : functionName(functionName),
        functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
  LLVM::CallOp create(Location loc, OpBuilder &builder,
                      ArrayRef<Value> arguments) const;

  StringRef functionName;
  LLVM::LLVMFunctionType functionType;
};

template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
  explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

protected:
  MLIRContext *context = &this->getTypeConverter()->getContext();

  Type llvmVoidType = LLVM::LLVMVoidType::get(context);
  Type llvmPointerType =
      LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
  Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
  Type llvmInt8Type = IntegerType::get(context, 8);
  Type llvmInt32Type = IntegerType::get(context, 32);
  Type llvmInt64Type = IntegerType::get(context, 64);
  Type llvmIntPtrType = IntegerType::get(
      context, this->getTypeConverter()->getPointerBitwidth(0));

  FunctionCallBuilder moduleLoadCallBuilder = {
      "mgpuModuleLoad",
      llvmPointerType /* void *module */,
      {llvmPointerType /* void *cubin */}};
  FunctionCallBuilder moduleUnloadCallBuilder = {
      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
  FunctionCallBuilder moduleGetFunctionCallBuilder = {
      "mgpuModuleGetFunction",
      llvmPointerType /* void *function */,
      {
          llvmPointerType, /* void *module */
          llvmPointerType  /* char *name   */
      }};
  FunctionCallBuilder launchKernelCallBuilder = {
      "mgpuLaunchKernel",
      llvmVoidType,
      {
          llvmPointerType,        /* void* f */
          llvmIntPtrType,         /* intptr_t gridXDim */
          llvmIntPtrType,         /* intptr_t gridYDim */
          llvmIntPtrType,         /* intptr_t gridZDim */
          llvmIntPtrType,         /* intptr_t blockXDim */
          llvmIntPtrType,         /* intptr_t blockYDim */
          llvmIntPtrType,         /* intptr_t blockZDim */
          llvmInt32Type,          /* unsigned int sharedMemBytes */
          llvmPointerType,        /* void *hstream */
          llvmPointerPointerType, /* void **kernelParams */
          llvmPointerPointerType  /* void **extra */
      }};
  FunctionCallBuilder streamCreateCallBuilder = {
      "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
  FunctionCallBuilder streamDestroyCallBuilder = {
      "mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamSynchronizeCallBuilder = {
      "mgpuStreamSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *stream */}};
  FunctionCallBuilder streamWaitEventCallBuilder = {
      "mgpuStreamWaitEvent",
      llvmVoidType,
      {llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
  FunctionCallBuilder eventCreateCallBuilder = {
      "mgpuEventCreate", llvmPointerType /* void *event */, {}};
  FunctionCallBuilder eventDestroyCallBuilder = {
      "mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventSynchronizeCallBuilder = {
      "mgpuEventSynchronize",
      llvmVoidType,
      {llvmPointerType /* void *event */}};
  FunctionCallBuilder eventRecordCallBuilder = {
      "mgpuEventRecord",
      llvmVoidType,
      {llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder hostRegisterCallBuilder = {
      "mgpuMemHostRegisterMemRef",
      llvmVoidType,
      {llvmIntPtrType /* intptr_t rank */,
       llvmPointerType /* void *memrefDesc */,
       llvmIntPtrType /* intptr_t elementSizeBytes */}};
  FunctionCallBuilder allocCallBuilder = {
      "mgpuMemAlloc",
      llvmPointerType /* void * */,
      {llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
  FunctionCallBuilder deallocCallBuilder = {
      "mgpuMemFree",
      llvmVoidType,
      {llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
  FunctionCallBuilder memcpyCallBuilder = {
      "mgpuMemcpy",
      llvmVoidType,
      {llvmPointerType /* void *dst */, llvmPointerType /* void *src */,
       llvmIntPtrType /* intptr_t sizeBytes */,
       llvmPointerType /* void *stream */}};
};

/// A rewrite pattern to convert gpu.host_register operations into a GPU
/// runtime call. Currently it supports CUDA and ROCm (HIP).
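///
/// For example (illustrative), an op such as
///
///   %cast = memref.cast %buf : memref<16xf32> to memref<*xf32>
///   gpu.host_register %cast : memref<*xf32>
///
/// lowers to a call to mgpuMemHostRegisterMemRef, passing the memref rank, a
/// pointer to the memref descriptor, and the element size in bytes.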
class ConvertHostRegisterOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:
  ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
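///
/// For example (illustrative), an op such as
///
///   %memref, %token = gpu.alloc async [%dep] (%size) : memref<?xf32>
///
/// lowers to a call to mgpuMemAlloc(sizeBytes, stream), where the stream is
/// the single async dependency; the returned pointer populates a fresh memref
/// descriptor.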
class ConvertAllocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
public:
  ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
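///
/// For example (illustrative), an op such as
///
///   %token = gpu.dealloc async [%dep] %memref : memref<?xf32>
///
/// lowers to a call to mgpuMemFree(ptr, stream), with the pointer taken from
/// the memref descriptor and the stream from the single async dependency.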
class ConvertDeallocOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
public:
  ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

class ConvertAsyncYieldToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
public:
  ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
  ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};

/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad        -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * streamCreate      -- creates a new compute stream on the GPU
/// * launchKernel      -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
///
/// Intermediate data structures are allocated on the stack.
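///
/// For example (illustrative), the pattern applies to ops of the form
///
///   gpu.launch_func @kernel_module::@kernel
///       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
///       args(%arg0 : f32, %arg1 : memref<?xf32>)
///
/// where @kernel_module is expected to carry the compiled binary in the
/// attribute named by `gpuBinaryAnnotation`.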
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp,
                            ArrayRef<Value> operands, OpBuilder &builder) const;
  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
                                   Location loc, OpBuilder &builder) const;

  LogicalResult
  matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;

  llvm::SmallString<32> gpuBinaryAnnotation;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
                                PatternRewriter &rewriter) const override {
    // GPU kernel modules are no longer necessary since we have a global
    // constant with the CUBIN or HSACO data.
    rewriter.eraseOp(op);
    return success();
  }
};

/// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
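///
/// For example (illustrative), an op such as
///
///   %token = gpu.memcpy async [%dep] %dst, %src
///       : memref<?xf32>, memref<?xf32>
///
/// lowers to a call to mgpuMemcpy(dst, src, sizeBytes, stream), with the size
/// computed from the source memref descriptor.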
class ConvertMemcpyOpToGpuRuntimeCallPattern
    : public ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp> {
public:
  ConvertMemcpyOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
      : ConvertOpToGpuRuntimeCallPattern<gpu::MemcpyOp>(typeConverter) {}

private:
  LogicalResult
  matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override;
};
} // namespace

void GpuToLLVMConversionPass::runOnOperation() {
  LLVMTypeConverter converter(&getContext());
  RewritePatternSet patterns(&getContext());
  LLVMConversionTarget target(getContext());

  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<UnrealizedConversionCastOp>();

  populateVectorToLLVMConversionPatterns(converter, patterns);
  populateMemRefToLLVMConversionPatterns(converter, patterns);
  populateStdToLLVMConversionPatterns(converter, patterns);
  populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                    target);
  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    signalPassFailure();
}

LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
                                         ArrayRef<Value> arguments) const {
  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
  auto function = [&] {
    if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
      return function;
    return OpBuilder::atBlockEnd(module.getBody())
        .create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
  }();
  return builder.create<LLVM::CallOp>(loc, function, arguments);
}

// Returns success if all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                     ConversionPatternRewriter &rewriter) {
  if (!llvm::all_of(operands, [](Value value) {
        return LLVM::isCompatibleType(value.getType());
      }))
    return rewriter.notifyMatchFailure(
        op, "Cannot convert if operands aren't of LLVM type.");
  return success();
}

static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
                         gpu::AsyncOpInterface op) {
  if (op.getAsyncDependencies().size() != 1)
    return rewriter.notifyMatchFailure(
        op, "Can only convert with exactly one async dependency.");

  if (!op.getAsyncToken())
    return rewriter.notifyMatchFailure(op, "Can convert only async version.");

  return success();
}

LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto *op = hostRegisterOp.getOperation();
  if (failed(areAllLLVMTypes(op, operands, rewriter)))
    return failure();

  Location loc = op->getLoc();

  auto memRefType = hostRegisterOp.value().getType();
  auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
  auto elementSize = getSizeInBytes(loc, elementType, rewriter);

  auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(),
                                                       operands, rewriter);
  arguments.push_back(elementSize);
  hostRegisterCallBuilder.create(loc, rewriter, arguments);

  rewriter.eraseOp(op);
  return success();
}

LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::AllocOp allocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  MemRefType memRefType = allocOp.getType();

  if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, allocOp)))
    return failure();

  auto loc = allocOp.getLoc();
  auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());

  // Get shape of the memref as values: static sizes are constant
  // values and dynamic sizes are passed to 'alloc' as operands.
  SmallVector<Value, 4> shape;
  SmallVector<Value, 4> strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memRefType, adaptor.dynamicSizes(), rewriter,
                           shape, strides, sizeBytes);

  // Allocate the underlying buffer and store a pointer to it in the MemRef
  // descriptor.
  Type elementPtrType = this->getElementPtrType(memRefType);
  auto stream = adaptor.asyncDependencies().front();
  Value allocatedPtr =
      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
  allocatedPtr =
      rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

  // No alignment.
  Value alignedPtr = allocatedPtr;

  // Create the MemRef descriptor.
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);

  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});

  return success();
}

LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) ||
      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
    return failure();

  Location loc = deallocOp.getLoc();

  auto adaptor =
      gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary());
  Value pointer =
      MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
  auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
  Value stream = adaptor.asyncDependencies().front();
  deallocCallBuilder.create(loc, rewriter, {casted, stream});

  rewriter.replaceOp(deallocOp, {stream});
  return success();
}

static bool isGpuAsyncTokenType(Value value) {
  return value.getType().isa<gpu::AsyncTokenType>();
}

// Converts !gpu.async.token operands of `async.yield` to runtime calls.
// !gpu.async.token values are lowered to streams within the async.execute
// region, but are passed between regions as events. For each !gpu.async.token
// operand, we create an event and record it on the stream.
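//
// For example (illustrative), a yield of a token inside an async.execute
// region,
//
//   async.yield %token : !gpu.async.token
//
// becomes roughly:
//
//   %event = llvm.call @mgpuEventCreate()
//   llvm.call @mgpuEventRecord(%event, %stream)
//   llvm.call @mgpuStreamDestroy(%stream)
//   async.yield %event : !llvm.ptr<i8>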
LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
    async::YieldOp yieldOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
    return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");

  Location loc = yieldOp.getLoc();
  SmallVector<Value, 4> newOperands(operands.begin(), operands.end());
  llvm::SmallDenseSet<Value> streams;
  for (auto &operand : yieldOp->getOpOperands()) {
    if (!isGpuAsyncTokenType(operand.get()))
      continue;
    auto idx = operand.getOperandNumber();
    auto stream = operands[idx];
    auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
    eventRecordCallBuilder.create(loc, rewriter, {event, stream});
    newOperands[idx] = event;
    streams.insert(stream);
  }
  for (auto stream : streams)
    streamDestroyCallBuilder.create(loc, rewriter, {stream});

  rewriter.updateRootInPlace(yieldOp,
                             [&] { yieldOp->setOperands(newOperands); });
  return success();
}

// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
static bool isDefinedByCallTo(Value value, StringRef functionName) {
  assert(value.getType().isa<LLVM::LLVMPointerType>());
  if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
    return defOp.callee()->equals(functionName);
  return false;
}

// Converts `gpu.wait` to runtime calls. The converted op synchronizes the host
// with the stream/event operands, which are then destroyed. That is, the
// lowering assumes the operands are not used afterwards or elsewhere;
// otherwise we will get a runtime error. Eventually, we should guarantee this
// property.
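//
// For example (illustrative), `gpu.wait [%t]` where %t was lowered to a
// stream becomes:
//
//   llvm.call @mgpuStreamSynchronize(%stream)
//   llvm.call @mgpuStreamDestroy(%stream)
//
// and the corresponding event calls are emitted when the operand was lowered
// to an event instead.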
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");

  Location loc = waitOp.getLoc();

  for (auto operand : operands) {
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream.
      streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
      streamDestroyCallBuilder.create(loc, rewriter, {operand});
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
      eventDestroyCallBuilder.create(loc, rewriter, {operand});
    }
  }

  rewriter.eraseOp(waitOp);
  return success();
}

// Converts `gpu.wait async` to runtime calls. The converted op creates a new
// stream that is synchronized with the stream/event operands, which are then
// destroyed. That is, the lowering assumes the operands are not used
// afterwards or elsewhere; otherwise we will get a runtime error. Eventually,
// we should guarantee this property.
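//
// For example (illustrative), `%t = gpu.wait async [%dep]` where %dep was
// lowered to an event becomes roughly:
//
//   %stream = llvm.call @mgpuStreamCreate()
//   llvm.call @mgpuStreamWaitEvent(%stream, %event)
//   llvm.call @mgpuEventDestroy(%event)
//
// with %stream replacing the result token.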
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::WaitOp waitOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (!waitOp.asyncToken())
    return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");

  Location loc = waitOp.getLoc();

  auto insertionPoint = rewriter.saveInsertionPoint();
  SmallVector<Value, 1> events;
  for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
    auto operand = std::get<1>(pair);
    if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
      // The converted operand's definition created a stream. Insert an event
      // into the stream just after the last use of the original token operand.
      auto *defOp = std::get<0>(pair).getDefiningOp();
      rewriter.setInsertionPointAfter(defOp);
      auto event =
          eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
      eventRecordCallBuilder.create(loc, rewriter, {event, operand});
      events.push_back(event);
    } else {
      // Otherwise the converted operand is an event. This assumes that we use
      // events in control flow code as well.
      events.push_back(operand);
    }
  }
  rewriter.restoreInsertionPoint(insertionPoint);
  auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
  for (auto event : events)
    streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
  for (auto event : events)
    eventDestroyCallBuilder.create(loc, rewriter, {event});
  rewriter.replaceOp(waitOp, {stream});

  return success();
}

// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
//   %fieldPtr = llvm.getelementptr %struct[0, i]
//   llvm.store parameters[i], %fieldPtr
//   %elementPtr = llvm.getelementptr %array[i]
//   llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    OpBuilder &builder) const {
  auto loc = launchOp.getLoc();
  auto numKernelOperands = launchOp.getNumKernelOperands();
  auto arguments = getTypeConverter()->promoteOperands(
      loc, launchOp.getOperands().take_back(numKernelOperands),
      operands.take_back(numKernelOperands), builder);
  auto numArguments = arguments.size();
  SmallVector<Type, 4> argumentTypes;
  argumentTypes.reserve(numArguments);
  for (auto argument : arguments)
    argumentTypes.push_back(argument.getType());
  auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
                                                           argumentTypes);
  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                              builder.getI32IntegerAttr(1));
  auto structPtr = builder.create<LLVM::AllocaOp>(
      loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
  auto arraySize = builder.create<LLVM::ConstantOp>(
      loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
  auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
                                                 arraySize, /*alignment=*/0);
  auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               builder.getI32IntegerAttr(0));
  for (auto en : llvm::enumerate(arguments)) {
    auto index = builder.create<LLVM::ConstantOp>(
        loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
    auto fieldPtr = builder.create<LLVM::GEPOp>(
        loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
        ArrayRef<Value>{zero, index.getResult()});
    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
    auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
                                                  arrayPtr, index.getResult());
    auto casted =
        builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
    builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
  }
  return arrayPtr;
}

// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
//   %0 = llvm.addressof @kernel_name
//   %1 = llvm.constant (0 : index)
//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
    StringRef moduleName, StringRef name, Location loc,
    OpBuilder &builder) const {
  // Make sure the trailing zero is included in the constant.
  std::vector<char> kernelName(name.begin(), name.end());
  kernelName.push_back('\0');

  std::string globalName =
      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
  return LLVM::createGlobalString(
      loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
      LLVM::Linkage::Internal);
}

// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  if (failed(areAllLLVMTypes(launchOp, operands, rewriter)))
    return failure();

  if (launchOp.asyncDependencies().size() > 1)
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert with more than one async dependency.");

  // Fail when the synchronous version of the op has async dependencies. The
  // lowering destroys the stream, and we do not want to check that there is no
  // use of the stream after this op.
  if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
    return rewriter.notifyMatchFailure(
        launchOp, "Cannot convert non-async op with async dependencies.");

  Location loc = launchOp.getLoc();

  // Create an LLVM global with CUBIN extracted from the kernel annotation and
  // obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
      launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
    kernelModule.emitOpError()
        << "missing " << gpuBinaryAnnotation << " attribute";
    return failure();
  }

  SmallString<128> nameBuffer(kernelModule.getName());
  nameBuffer.append(kGpuBinaryStorageSuffix);
  Value data =
      LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
                               binaryAttr.getValue(), LLVM::Linkage::Internal);

  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
  // Get the function from the module. The name corresponds to the name of
  // the kernel function.
  auto kernelName = generateKernelNameConstant(
      launchOp.getKernelModuleName().getValue(),
      launchOp.getKernelName().getValue(), loc, rewriter);
  auto function = moduleGetFunctionCallBuilder.create(
      loc, rewriter, {module.getResult(0), kernelName});
  auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                                rewriter.getI32IntegerAttr(0));
  auto adaptor =
      gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary());
  Value stream =
      adaptor.asyncDependencies().empty()
          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
          : adaptor.asyncDependencies().front();
  // Create array of pointers to kernel arguments.
  auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
  auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
  launchKernelCallBuilder.create(loc, rewriter,
                                 {function.getResult(0), launchOp.gridSizeX(),
                                  launchOp.gridSizeY(), launchOp.gridSizeZ(),
                                  launchOp.blockSizeX(), launchOp.blockSizeY(),
                                  launchOp.blockSizeZ(),
                                  /*sharedMemBytes=*/zero, stream, kernelParams,
                                  /*extra=*/nullpointer});

  if (launchOp.asyncToken()) {
    // Async launch: make dependent ops use the same stream.
    rewriter.replaceOp(launchOp, {stream});
  } else {
    // Synchronize with host and destroy stream. This must be the stream
    // created above (with no other uses) because we check that the
    // synchronous version does not have any async dependencies.
    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
    streamDestroyCallBuilder.create(loc, rewriter, stream);
    rewriter.eraseOp(launchOp);
  }
  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));

  return success();
}

LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
    ConversionPatternRewriter &rewriter) const {
  auto memRefType = memcpyOp.src().getType().cast<MemRefType>();

  if (failed(areAllLLVMTypes(memcpyOp, operands, rewriter)) ||
      !isConvertibleAndHasIdentityMaps(memRefType) ||
      failed(isAsyncWithOneDependency(rewriter, memcpyOp)))
    return failure();

  auto loc = memcpyOp.getLoc();
  auto adaptor = gpu::MemcpyOpAdaptor(operands, memcpyOp->getAttrDictionary());

  MemRefDescriptor srcDesc(adaptor.src());

  Value numElements =
      memRefType.hasStaticShape()
          ? createIndexConstant(rewriter, loc, memRefType.getNumElements())
          // For identity layouts (verified above), the number of elements is
          // stride[0] * size[0].
          : rewriter.create<LLVM::MulOp>(loc, srcDesc.stride(rewriter, loc, 0),
                                         srcDesc.size(rewriter, loc, 0));

  Type elementPtrType = getElementPtrType(memRefType);
  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
  Value gepPtr = rewriter.create<LLVM::GEPOp>(
      loc, elementPtrType, ArrayRef<Value>{nullPtr, numElements});
  auto sizeBytes =
      rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);

  auto src = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType, srcDesc.alignedPtr(rewriter, loc));
  auto dst = rewriter.create<LLVM::BitcastOp>(
      loc, llvmPointerType,
      MemRefDescriptor(adaptor.dst()).alignedPtr(rewriter, loc));

  auto stream = adaptor.asyncDependencies().front();
  memcpyCallBuilder.create(loc, rewriter, {dst, src, sizeBytes, stream});

  rewriter.replaceOp(memcpyOp, {stream});

  return success();
}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass() {
  return std::make_unique<GpuToLLVMConversionPass>();
}

void mlir::populateGpuToLLVMConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
    StringRef gpuBinaryAnnotation) {
  converter.addConversion(
      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
      });
  patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
               ConvertDeallocOpToGpuRuntimeCallPattern,
               ConvertHostRegisterOpToGpuRuntimeCallPattern,
               ConvertMemcpyOpToGpuRuntimeCallPattern,
               ConvertWaitAsyncOpToGpuRuntimeCallPattern,
               ConvertWaitOpToGpuRuntimeCallPattern,
               ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(converter,
                                                           gpuBinaryAnnotation);
  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
}