1 //===- LowerGPUToHSACO.cpp - Convert GPU kernel to HSACO blob -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This file implements a pass that serializes a GPU module into an HSACO blob
// and adds that blob as a string attribute of the module.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "mlir/Dialect/GPU/Transforms/Passes.h"
15 #include "mlir/IR/Location.h"
16 #include "mlir/IR/MLIRContext.h"
17
18 #if MLIR_GPU_TO_HSACO_PASS_ENABLE
19 #include "mlir/ExecutionEngine/OptUtils.h"
20 #include "mlir/Pass/Pass.h"
21 #include "mlir/Support/FileUtilities.h"
22 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
23 #include "mlir/Target/LLVMIR/Export.h"
24
25 #include "llvm/IR/Constants.h"
26 #include "llvm/IR/GlobalVariable.h"
27 #include "llvm/IR/Module.h"
28 #include "llvm/IRReader/IRReader.h"
29 #include "llvm/Linker/Linker.h"
30
31 #include "llvm/MC/MCAsmBackend.h"
32 #include "llvm/MC/MCAsmInfo.h"
33 #include "llvm/MC/MCCodeEmitter.h"
34 #include "llvm/MC/MCContext.h"
35 #include "llvm/MC/MCInstrInfo.h"
36 #include "llvm/MC/MCObjectFileInfo.h"
37 #include "llvm/MC/MCObjectWriter.h"
38 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
39 #include "llvm/MC/MCRegisterInfo.h"
40 #include "llvm/MC/MCStreamer.h"
41 #include "llvm/MC/MCSubtargetInfo.h"
42 #include "llvm/MC/TargetRegistry.h"
43
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/FileUtilities.h"
46 #include "llvm/Support/Path.h"
47 #include "llvm/Support/Program.h"
48 #include "llvm/Support/SourceMgr.h"
49 #include "llvm/Support/TargetSelect.h"
50 #include "llvm/Support/WithColor.h"
51
52 #include "llvm/Target/TargetMachine.h"
53 #include "llvm/Target/TargetOptions.h"
54
55 #include "llvm/Transforms/IPO/Internalize.h"
56
57 #include <mutex>
58
59 using namespace mlir;
60
61 namespace {
62 class SerializeToHsacoPass
63 : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
64 public:
65 MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToHsacoPass)
66
67 SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
68 int optLevel);
69 SerializeToHsacoPass(const SerializeToHsacoPass &other);
getArgument() const70 StringRef getArgument() const override { return "gpu-to-hsaco"; }
getDescription() const71 StringRef getDescription() const override {
72 return "Lower GPU kernel function to HSACO binary annotations";
73 }
74
75 protected:
76 Option<int> optLevel{
77 *this, "opt-level",
78 llvm::cl::desc("Optimization level for HSACO compilation"),
79 llvm::cl::init(2)};
80
81 Option<std::string> rocmPath{*this, "rocm-path",
82 llvm::cl::desc("Path to ROCm install")};
83
84 // Overload to allow linking in device libs
85 std::unique_ptr<llvm::Module>
86 translateToLLVMIR(llvm::LLVMContext &llvmContext) override;
87
88 /// Adds LLVM optimization passes
89 LogicalResult optimizeLlvm(llvm::Module &llvmModule,
90 llvm::TargetMachine &targetMachine) override;
91
92 private:
93 void getDependentDialects(DialectRegistry ®istry) const override;
94
95 // Loads LLVM bitcode libraries
96 Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>>
97 loadLibraries(SmallVectorImpl<char> &path,
98 SmallVectorImpl<StringRef> &libraries,
99 llvm::LLVMContext &context);
100
101 // Serializes ROCDL to HSACO.
102 std::unique_ptr<std::vector<char>>
103 serializeISA(const std::string &isa) override;
104
105 std::unique_ptr<SmallVectorImpl<char>> assembleIsa(const std::string &isa);
106 std::unique_ptr<std::vector<char>>
107 createHsaco(const SmallVectorImpl<char> &isaBinary);
108
109 std::string getRocmPath();
110 };
111 } // namespace
112
/// Copy constructor: simply forwards `other` to the `PassWrapper` base.
SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other)
    : PassWrapper(other) {}
115
116 /// Get a user-specified path to ROCm
117 // Tries, in order, the --rocm-path option, the ROCM_PATH environment variable
118 // and a compile-time default
getRocmPath()119 std::string SerializeToHsacoPass::getRocmPath() {
120 if (rocmPath.getNumOccurrences() > 0)
121 return rocmPath.getValue();
122
123 return __DEFAULT_ROCM_PATH__;
124 }
125
126 // Sets the 'option' to 'value' unless it already has a value.
maybeSetOption(Pass::Option<std::string> & option,function_ref<std::string ()> getValue)127 static void maybeSetOption(Pass::Option<std::string> &option,
128 function_ref<std::string()> getValue) {
129 if (!option.hasValue())
130 option = getValue();
131 }
132
SerializeToHsacoPass(StringRef triple,StringRef arch,StringRef features,int optLevel)133 SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
134 StringRef features, int optLevel) {
135 maybeSetOption(this->triple, [&triple] { return triple.str(); });
136 maybeSetOption(this->chip, [&arch] { return arch.str(); });
137 maybeSetOption(this->features, [&features] { return features.str(); });
138 if (this->optLevel.getNumOccurrences() == 0)
139 this->optLevel.setValue(optLevel);
140 }
141
getDependentDialects(DialectRegistry & registry) const142 void SerializeToHsacoPass::getDependentDialects(
143 DialectRegistry ®istry) const {
144 registerROCDLDialectTranslation(registry);
145 gpu::SerializeToBlobPass::getDependentDialects(registry);
146 }
147
148 Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>>
loadLibraries(SmallVectorImpl<char> & path,SmallVectorImpl<StringRef> & libraries,llvm::LLVMContext & context)149 SerializeToHsacoPass::loadLibraries(SmallVectorImpl<char> &path,
150 SmallVectorImpl<StringRef> &libraries,
151 llvm::LLVMContext &context) {
152 SmallVector<std::unique_ptr<llvm::Module>, 3> ret;
153 size_t dirLength = path.size();
154
155 if (!llvm::sys::fs::is_directory(path)) {
156 getOperation().emitRemark() << "Bitcode path: " << path
157 << " does not exist or is not a directory\n";
158 return llvm::None;
159 }
160
161 for (const StringRef file : libraries) {
162 llvm::SMDiagnostic error;
163 llvm::sys::path::append(path, file);
164 llvm::StringRef pathRef(path.data(), path.size());
165 std::unique_ptr<llvm::Module> library =
166 llvm::getLazyIRFileModule(pathRef, error, context);
167 path.truncate(dirLength);
168 if (!library) {
169 getOperation().emitError() << "Failed to load library " << file
170 << " from " << path << error.getMessage();
171 return llvm::None;
172 }
173 // Some ROCM builds don't strip this like they should
174 if (auto *openclVersion = library->getNamedMetadata("opencl.ocl.version"))
175 library->eraseNamedMetadata(openclVersion);
176 // Stop spamming us with clang version numbers
177 if (auto *ident = library->getNamedMetadata("llvm.ident"))
178 library->eraseNamedMetadata(ident);
179 ret.push_back(std::move(library));
180 }
181
182 return ret;
183 }
184
185 std::unique_ptr<llvm::Module>
translateToLLVMIR(llvm::LLVMContext & llvmContext)186 SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) {
187 // MLIR -> LLVM translation
188 std::unique_ptr<llvm::Module> ret =
189 gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext);
190
191 if (!ret) {
192 getOperation().emitOpError("Module lowering failed");
193 return ret;
194 }
195 // Walk the LLVM module in order to determine if we need to link in device
196 // libs
197 bool needOpenCl = false;
198 bool needOckl = false;
199 bool needOcml = false;
200 for (llvm::Function &f : ret->functions()) {
201 if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
202 StringRef funcName = f.getName();
203 if ("printf" == funcName)
204 needOpenCl = true;
205 if (funcName.startswith("__ockl_"))
206 needOckl = true;
207 if (funcName.startswith("__ocml_"))
208 needOcml = true;
209 }
210 }
211
212 if (needOpenCl)
213 needOcml = needOckl = true;
214
215 // No libraries needed (the typical case)
216 if (!(needOpenCl || needOcml || needOckl))
217 return ret;
218
219 // Define one of the control constants the ROCm device libraries expect to be
220 // present These constants can either be defined in the module or can be
221 // imported by linking in bitcode that defines the constant. To simplify our
222 // logic, we define the constants into the module we are compiling
223 auto addControlConstant = [&module = *ret](StringRef name, uint32_t value,
224 uint32_t bitwidth) {
225 using llvm::GlobalVariable;
226 if (module.getNamedGlobal(name)) {
227 return;
228 }
229 llvm::IntegerType *type =
230 llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
231 auto *initializer = llvm::ConstantInt::get(type, value, /*isSigned=*/false);
232 auto *constant = new GlobalVariable(
233 module, type,
234 /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage,
235 initializer, name,
236 /*before=*/nullptr,
237 /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal,
238 /*addressSpace=*/4);
239 constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local);
240 constant->setVisibility(
241 GlobalVariable::VisibilityTypes::ProtectedVisibility);
242 constant->setAlignment(llvm::MaybeAlign(bitwidth / 8));
243 };
244
245 // Set up control variables in the module instead of linking in tiny bitcode
246 if (needOcml) {
247 // TODO(kdrewnia): Enable math optimizations once we have support for
248 // `-ffast-math`-like options
249 addControlConstant("__oclc_finite_only_opt", 0, 8);
250 addControlConstant("__oclc_daz_opt", 0, 8);
251 addControlConstant("__oclc_correctly_rounded_sqrt32", 1, 8);
252 addControlConstant("__oclc_unsafe_math_opt", 0, 8);
253 }
254 if (needOcml || needOckl) {
255 addControlConstant("__oclc_wavefrontsize64", 1, 8);
256 StringRef chipSet = this->chip.getValue();
257 if (chipSet.startswith("gfx"))
258 chipSet = chipSet.substr(3);
259 uint32_t minor =
260 llvm::APInt(32, chipSet.substr(chipSet.size() - 2), 16).getZExtValue();
261 uint32_t major = llvm::APInt(32, chipSet.substr(0, chipSet.size() - 2), 10)
262 .getZExtValue();
263 uint32_t isaNumber = minor + 1000 * major;
264 addControlConstant("__oclc_ISA_version", isaNumber, 32);
265
266 // This constant must always match the default code object ABI version
267 // of the AMDGPU backend.
268 addControlConstant("__oclc_ABI_version", 400, 32);
269 }
270
271 // Determine libraries we need to link - order matters due to dependencies
272 llvm::SmallVector<StringRef, 4> libraries;
273 if (needOpenCl)
274 libraries.push_back("opencl.bc");
275 if (needOcml)
276 libraries.push_back("ocml.bc");
277 if (needOckl)
278 libraries.push_back("ockl.bc");
279
280 Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> mbModules;
281 std::string theRocmPath = getRocmPath();
282 llvm::SmallString<32> bitcodePath(theRocmPath);
283 llvm::sys::path::append(bitcodePath, "amdgcn", "bitcode");
284 mbModules = loadLibraries(bitcodePath, libraries, llvmContext);
285
286 if (!mbModules) {
287 getOperation()
288 .emitWarning("Could not load required device libraries")
289 .attachNote()
290 << "This will probably cause link-time or run-time failures";
291 return ret; // We can still abort here
292 }
293
294 llvm::Linker linker(*ret);
295 for (std::unique_ptr<llvm::Module> &libModule : *mbModules) {
296 // This bitcode linking code is substantially similar to what is used in
297 // hip-clang It imports the library functions into the module, allowing LLVM
298 // optimization passes (which must run after linking) to optimize across the
299 // libraries and the module's code. We also only import symbols if they are
300 // referenced by the module or a previous library since there will be no
301 // other source of references to those symbols in this compilation and since
302 // we don't want to bloat the resulting code object.
303 bool err = linker.linkInModule(
304 std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded,
305 [](llvm::Module &m, const StringSet<> &gvs) {
306 llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) {
307 return !gv.hasName() || (gvs.count(gv.getName()) == 0);
308 });
309 });
310 // True is linker failure
311 if (err) {
312 getOperation().emitError(
313 "Unrecoverable failure during device library linking.");
314 // We have no guaranties about the state of `ret`, so bail
315 return nullptr;
316 }
317 }
318
319 return ret;
320 }
321
322 LogicalResult
optimizeLlvm(llvm::Module & llvmModule,llvm::TargetMachine & targetMachine)323 SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule,
324 llvm::TargetMachine &targetMachine) {
325 int optLevel = this->optLevel.getValue();
326 if (optLevel < 0 || optLevel > 3)
327 return getOperation().emitError()
328 << "Invalid HSA optimization level" << optLevel << "\n";
329
330 targetMachine.setOptLevel(static_cast<llvm::CodeGenOpt::Level>(optLevel));
331
332 auto transformer =
333 makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine);
334 auto error = transformer(&llvmModule);
335 if (error) {
336 InFlightDiagnostic mlirError = getOperation()->emitError();
337 llvm::handleAllErrors(
338 std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) {
339 mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n";
340 });
341 return mlirError;
342 }
343 return success();
344 }
345
346 std::unique_ptr<SmallVectorImpl<char>>
assembleIsa(const std::string & isa)347 SerializeToHsacoPass::assembleIsa(const std::string &isa) {
348 auto loc = getOperation().getLoc();
349
350 SmallVector<char, 0> result;
351 llvm::raw_svector_ostream os(result);
352
353 llvm::Triple triple(llvm::Triple::normalize(this->triple));
354 std::string error;
355 const llvm::Target *target =
356 llvm::TargetRegistry::lookupTarget(triple.normalize(), error);
357 if (!target) {
358 emitError(loc, Twine("failed to lookup target: ") + error);
359 return {};
360 }
361
362 llvm::SourceMgr srcMgr;
363 srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc());
364
365 const llvm::MCTargetOptions mcOptions;
366 std::unique_ptr<llvm::MCRegisterInfo> mri(
367 target->createMCRegInfo(this->triple));
368 std::unique_ptr<llvm::MCAsmInfo> mai(
369 target->createMCAsmInfo(*mri, this->triple, mcOptions));
370 mai->setRelaxELFRelocations(true);
371 std::unique_ptr<llvm::MCSubtargetInfo> sti(
372 target->createMCSubtargetInfo(this->triple, this->chip, this->features));
373
374 llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
375 &mcOptions);
376 std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
377 ctx, /*PIC=*/false, /*LargeCodeModel=*/false));
378 ctx.setObjectFileInfo(mofi.get());
379
380 SmallString<128> cwd;
381 if (!llvm::sys::fs::current_path(cwd))
382 ctx.setCompilationDir(cwd);
383
384 std::unique_ptr<llvm::MCStreamer> mcStreamer;
385 std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());
386
387 llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx);
388 llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions);
389 mcStreamer.reset(target->createMCObjectStreamer(
390 triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab),
391 mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce),
392 *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible,
393 /*DWARFMustBeAtTheEnd*/ false));
394 mcStreamer->setUseAssemblerInfoForParsing(true);
395
396 std::unique_ptr<llvm::MCAsmParser> parser(
397 createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
398 std::unique_ptr<llvm::MCTargetAsmParser> tap(
399 target->createMCAsmParser(*sti, *parser, *mcii, mcOptions));
400
401 if (!tap) {
402 emitError(loc, "assembler initialization error");
403 return {};
404 }
405
406 parser->setTargetParser(*tap);
407 parser->Run(false);
408
409 return std::make_unique<SmallVector<char, 0>>(std::move(result));
410 }
411
412 std::unique_ptr<std::vector<char>>
createHsaco(const SmallVectorImpl<char> & isaBinary)413 SerializeToHsacoPass::createHsaco(const SmallVectorImpl<char> &isaBinary) {
414 auto loc = getOperation().getLoc();
415
416 // Save the ISA binary to a temp file.
417 int tempIsaBinaryFd = -1;
418 SmallString<128> tempIsaBinaryFilename;
419 if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd,
420 tempIsaBinaryFilename)) {
421 emitError(loc, "temporary file for ISA binary creation error");
422 return {};
423 }
424 llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
425 llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
426 tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size());
427 tempIsaBinaryOs.close();
428
429 // Create a temp file for HSA code object.
430 int tempHsacoFD = -1;
431 SmallString<128> tempHsacoFilename;
432 if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
433 tempHsacoFilename)) {
434 emitError(loc, "temporary file for HSA code object creation error");
435 return {};
436 }
437 llvm::FileRemover cleanupHsaco(tempHsacoFilename);
438
439 std::string theRocmPath = getRocmPath();
440 llvm::SmallString<32> lldPath(theRocmPath);
441 llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld");
442 int lldResult = llvm::sys::ExecuteAndWait(
443 lldPath,
444 {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
445 if (lldResult != 0) {
446 emitError(loc, "lld invocation error");
447 return {};
448 }
449
450 // Load the HSA code object.
451 auto hsacoFile = openInputFile(tempHsacoFilename);
452 if (!hsacoFile) {
453 emitError(loc, "read HSA code object from temp file error");
454 return {};
455 }
456
457 StringRef buffer = hsacoFile->getBuffer();
458 return std::make_unique<std::vector<char>>(buffer.begin(), buffer.end());
459 }
460
461 std::unique_ptr<std::vector<char>>
serializeISA(const std::string & isa)462 SerializeToHsacoPass::serializeISA(const std::string &isa) {
463 auto isaBinary = assembleIsa(isa);
464 if (!isaBinary)
465 return {};
466 return createHsaco(*isaBinary);
467 }
468
469 // Register pass to serialize GPU kernel functions to a HSACO binary annotation.
registerGpuSerializeToHsacoPass()470 void mlir::registerGpuSerializeToHsacoPass() {
471 PassRegistration<SerializeToHsacoPass> registerSerializeToHSACO([] {
472 // Initialize LLVM AMDGPU backend.
473 LLVMInitializeAMDGPUAsmParser();
474 LLVMInitializeAMDGPUAsmPrinter();
475 LLVMInitializeAMDGPUTarget();
476 LLVMInitializeAMDGPUTargetInfo();
477 LLVMInitializeAMDGPUTargetMC();
478
479 return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "", "",
480 2);
481 });
482 }
483
484 /// Create an instance of the GPU kernel function to HSAco binary serialization
485 /// pass.
createGpuSerializeToHsacoPass(StringRef triple,StringRef arch,StringRef features,int optLevel)486 std::unique_ptr<Pass> mlir::createGpuSerializeToHsacoPass(StringRef triple,
487 StringRef arch,
488 StringRef features,
489 int optLevel) {
490 return std::make_unique<SerializeToHsacoPass>(triple, arch, features,
491 optLevel);
492 }
493
494 #else // MLIR_GPU_TO_HSACO_PASS_ENABLE
registerGpuSerializeToHsacoPass()495 void mlir::registerGpuSerializeToHsacoPass() {}
496 #endif // MLIR_GPU_TO_HSACO_PASS_ENABLE
497