1 //===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "GPUOpsLowering.h"
10 #include "mlir/Dialect/StandardOps/IR/Ops.h"
11 #include "mlir/IR/Builders.h"
12 #include "llvm/Support/FormatVariadic.h"
13 
14 using namespace mlir;
15 
/// Lowers a `gpu.func` to an `llvm.func`.
///
/// Workgroup memory attributions become module-level `llvm.mlir.global`s in
/// the workgroup address space; private memory attributions become
/// function-local `llvm.alloca`s (in `allocaAddrSpace`, a pattern member set
/// by the constructor elsewhere in this file). Each attribution's block
/// argument is remapped to a memref descriptor built over the new buffer, so
/// existing memref-based code in the body keeps working. If the function is a
/// kernel, a dialect-specific kernel attribute (`kernelAttributeName`, also a
/// pattern member) is attached. Returns failure only if region type
/// conversion fails.
LogicalResult
GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                   ArrayRef<Value> operands,
                                   ConversionPatternRewriter &rewriter) const {
  assert(operands.empty() && "func op is not expected to have operands");
  Location loc = gpuFuncOp.getLoc();

  // Create one LLVM global per workgroup attribution, in the same order as
  // the attributions; the loop further below relies on this ordering to pair
  // globals back up with their attributions by index.
  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
  for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
    Value attribution = en.value();

    auto type = attribution.getType().dyn_cast<MemRefType>();
    assert(type && type.hasStaticShape() && "unexpected type in attribution");

    uint64_t numElements = type.getNumElements();

    auto elementType =
        typeConverter->convertType(type.getElementType()).template cast<Type>();
    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
    // Name is derived from the function name and attribution index, e.g.
    // "__wg_foo_0", so globals from different kernels cannot collide.
    std::string name = std::string(
        llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
    // Zero-initialized (no value attr), internal linkage, default alignment,
    // placed in the GPU workgroup (shared) address space.
    auto globalOp = rewriter.create<LLVM::GlobalOp>(
        gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
        LLVM::Linkage::Internal, name, /*value=*/Attribute(),
        /*alignment=*/0, gpu::GPUDialect::getWorkgroupAddressSpace());
    workgroupBuffers.push_back(globalOp);
  }

  // Rewrite the original GPU function to an LLVM function.
  // NOTE(review): the converter appears to produce a pointer-to-function type
  // here, hence the unwrap through LLVMPointerType to get the function type.
  auto funcType = typeConverter->convertType(gpuFuncOp.getType())
                      .template cast<LLVM::LLVMPointerType>()
                      .getElementType();

  // Remap proper input types.
  TypeConverter::SignatureConversion signatureConversion(
      gpuFuncOp.front().getNumArguments());
  getTypeConverter()->convertFunctionSignature(
      gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

  // Create the new function operation. Only copy those attributes that are
  // not specific to function modeling.
  SmallVector<NamedAttribute, 4> attributes;
  for (const auto &attr : gpuFuncOp->getAttrs()) {
    // Symbol name, function type, and the attribution count are recomputed or
    // become meaningless on the LLVM func, so they are dropped here.
    if (attr.first == SymbolTable::getSymbolAttrName() ||
        attr.first == function_like_impl::getTypeAttrName() ||
        attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
      continue;
    attributes.push_back(attr);
  }
  // Add a dialect specific kernel attribute in addition to GPU kernel
  // attribute. The former is necessary for further translation while the
  // latter is expected by gpu.launch_func.
  if (gpuFuncOp.isKernel())
    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
      gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
      LLVM::Linkage::External, /*dsoLocal*/ false, attributes);

  {
    // Insert operations that correspond to converted workgroup and private
    // memory attributions to the body of the function. This must operate on
    // the original function, before the body region is inlined in the new
    // function to maintain the relation between block arguments and the
    // parent operation that assigns their semantics.
    OpBuilder::InsertionGuard guard(rewriter);

    // Rewrite workgroup memory attributions to addresses of global buffers.
    rewriter.setInsertionPointToStart(&gpuFuncOp.front());
    unsigned numProperArguments = gpuFuncOp.getNumArguments();
    auto i32Type = IntegerType::get(rewriter.getContext(), 32);

    // Materialize the i32 zero GEP index lazily so kernels without workgroup
    // attributions do not get a dead constant.
    Value zero = nullptr;
    if (!workgroupBuffers.empty())
      zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                               rewriter.getI32IntegerAttr(0));
    for (auto en : llvm::enumerate(workgroupBuffers)) {
      LLVM::GlobalOp global = en.value();
      Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
      auto elementType =
          global.getType().cast<LLVM::LLVMArrayType>().getElementType();
      // GEP {0, 0}: step through the pointer, then index into the array,
      // yielding a pointer to the first element (in the global's address
      // space) rather than a pointer to the whole array.
      Value memory = rewriter.create<LLVM::GEPOp>(
          loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
          address, ArrayRef<Value>{zero, zero});

      // Build a memref descriptor pointing to the buffer to plug with the
      // existing memref infrastructure. This may use more registers than
      // otherwise necessary given that memref sizes are fixed, but we can try
      // and canonicalize that away later.
      Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
      auto type = attribution.getType().cast<MemRefType>();
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, memory);
      // Workgroup attribution block args follow the proper arguments.
      signatureConversion.remapInput(numProperArguments + en.index(), descr);
    }

    // Rewrite private memory attributions to alloca'ed buffers.
    unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
    for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
      Value attribution = en.value();
      auto type = attribution.getType().cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      // Explicitly drop memory space when lowering private memory
      // attributions since NVVM models it as `alloca`s in the default
      // memory space and does not support `alloca`s with addrspace(5).
      auto ptrType = LLVM::LLVMPointerType::get(
          typeConverter->convertType(type.getElementType())
              .template cast<Type>(),
          allocaAddrSpace);
      Value numElements = rewriter.create<LLVM::ConstantOp>(
          gpuFuncOp.getLoc(), int64Ty,
          rewriter.getI64IntegerAttr(type.getNumElements()));
      Value allocated = rewriter.create<LLVM::AllocaOp>(
          gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, allocated);
      // Private attribution block args follow the proper arguments and the
      // workgroup attributions, in that order.
      signatureConversion.remapInput(
          numProperArguments + numWorkgroupAttributions + en.index(), descr);
    }
  }

  // Move the region to the new function, update the entry block signature.
  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                              llvmFuncOp.end());
  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
                                         &signatureConversion)))
    return failure();

  rewriter.eraseOp(gpuFuncOp);
  return success();
}
149