1 //===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "GPUOpsLowering.h" 10 #include "mlir/Dialect/StandardOps/IR/Ops.h" 11 #include "mlir/IR/Builders.h" 12 #include "llvm/Support/FormatVariadic.h" 13 14 using namespace mlir; 15 16 LogicalResult 17 GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, 18 ArrayRef<Value> operands, 19 ConversionPatternRewriter &rewriter) const { 20 assert(operands.empty() && "func op is not expected to have operands"); 21 Location loc = gpuFuncOp.getLoc(); 22 23 SmallVector<LLVM::GlobalOp, 3> workgroupBuffers; 24 workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); 25 for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { 26 Value attribution = en.value(); 27 28 auto type = attribution.getType().dyn_cast<MemRefType>(); 29 assert(type && type.hasStaticShape() && "unexpected type in attribution"); 30 31 uint64_t numElements = type.getNumElements(); 32 33 auto elementType = 34 typeConverter->convertType(type.getElementType()).template cast<Type>(); 35 auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements); 36 std::string name = std::string( 37 llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index())); 38 auto globalOp = rewriter.create<LLVM::GlobalOp>( 39 gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, 40 LLVM::Linkage::Internal, name, /*value=*/Attribute(), 41 /*alignment=*/0, gpu::GPUDialect::getWorkgroupAddressSpace()); 42 workgroupBuffers.push_back(globalOp); 43 } 44 45 // Rewrite the original GPU function to an LLVM function. 46 auto funcType = typeConverter->convertType(gpuFuncOp.getType()) 47 .template cast<LLVM::LLVMPointerType>() 48 .getElementType(); 49 50 // Remap proper input types. 51 TypeConverter::SignatureConversion signatureConversion( 52 gpuFuncOp.front().getNumArguments()); 53 getTypeConverter()->convertFunctionSignature( 54 gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion); 55 56 // Create the new function operation. Only copy those attributes that are 57 // not specific to function modeling. 58 SmallVector<NamedAttribute, 4> attributes; 59 for (const auto &attr : gpuFuncOp->getAttrs()) { 60 if (attr.first == SymbolTable::getSymbolAttrName() || 61 attr.first == function_like_impl::getTypeAttrName() || 62 attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName()) 63 continue; 64 attributes.push_back(attr); 65 } 66 // Add a dialect specific kernel attribute in addition to GPU kernel 67 // attribute. The former is necessary for further translation while the 68 // latter is expected by gpu.launch_func. 69 if (gpuFuncOp.isKernel()) 70 attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr()); 71 auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>( 72 gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, 73 LLVM::Linkage::External, /*dsoLocal*/ false, attributes); 74 75 { 76 // Insert operations that correspond to converted workgroup and private 77 // memory attributions to the body of the function. This must operate on 78 // the original function, before the body region is inlined in the new 79 // function to maintain the relation between block arguments and the 80 // parent operation that assigns their semantics. 81 OpBuilder::InsertionGuard guard(rewriter); 82 83 // Rewrite workgroup memory attributions to addresses of global buffers. 84 rewriter.setInsertionPointToStart(&gpuFuncOp.front()); 85 unsigned numProperArguments = gpuFuncOp.getNumArguments(); 86 auto i32Type = IntegerType::get(rewriter.getContext(), 32); 87 88 Value zero = nullptr; 89 if (!workgroupBuffers.empty()) 90 zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type, 91 rewriter.getI32IntegerAttr(0)); 92 for (auto en : llvm::enumerate(workgroupBuffers)) { 93 LLVM::GlobalOp global = en.value(); 94 Value address = rewriter.create<LLVM::AddressOfOp>(loc, global); 95 auto elementType = 96 global.getType().cast<LLVM::LLVMArrayType>().getElementType(); 97 Value memory = rewriter.create<LLVM::GEPOp>( 98 loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()), 99 address, ArrayRef<Value>{zero, zero}); 100 101 // Build a memref descriptor pointing to the buffer to plug with the 102 // existing memref infrastructure. This may use more registers than 103 // otherwise necessary given that memref sizes are fixed, but we can try 104 // and canonicalize that away later. 105 Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; 106 auto type = attribution.getType().cast<MemRefType>(); 107 auto descr = MemRefDescriptor::fromStaticShape( 108 rewriter, loc, *getTypeConverter(), type, memory); 109 signatureConversion.remapInput(numProperArguments + en.index(), descr); 110 } 111 112 // Rewrite private memory attributions to alloca'ed buffers. 113 unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions(); 114 auto int64Ty = IntegerType::get(rewriter.getContext(), 64); 115 for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { 116 Value attribution = en.value(); 117 auto type = attribution.getType().cast<MemRefType>(); 118 assert(type && type.hasStaticShape() && "unexpected type in attribution"); 119 120 // Explicitly drop memory space when lowering private memory 121 // attributions since NVVM models it as `alloca`s in the default 122 // memory space and does not support `alloca`s with addrspace(5). 123 auto ptrType = LLVM::LLVMPointerType::get( 124 typeConverter->convertType(type.getElementType()) 125 .template cast<Type>(), 126 allocaAddrSpace); 127 Value numElements = rewriter.create<LLVM::ConstantOp>( 128 gpuFuncOp.getLoc(), int64Ty, 129 rewriter.getI64IntegerAttr(type.getNumElements())); 130 Value allocated = rewriter.create<LLVM::AllocaOp>( 131 gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); 132 auto descr = MemRefDescriptor::fromStaticShape( 133 rewriter, loc, *getTypeConverter(), type, allocated); 134 signatureConversion.remapInput( 135 numProperArguments + numWorkgroupAttributions + en.index(), descr); 136 } 137 } 138 139 // Move the region to the new function, update the entry block signature. 140 rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(), 141 llvmFuncOp.end()); 142 if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter, 143 &signatureConversion))) 144 return failure(); 145 146 rewriter.eraseOp(gpuFuncOp); 147 return success(); 148 } 149