1 //===- LowerGPUToHSACO.cpp - Convert GPU kernel to HSACO blob -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a pass that serializes a gpu module into HSAco blob and 10 // adds that blob as a string attribute of the module. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "mlir/Dialect/GPU/Transforms/Passes.h" 15 #include "mlir/IR/Location.h" 16 #include "mlir/IR/MLIRContext.h" 17 18 #if MLIR_GPU_TO_HSACO_PASS_ENABLE 19 #include "mlir/ExecutionEngine/OptUtils.h" 20 #include "mlir/Pass/Pass.h" 21 #include "mlir/Support/FileUtilities.h" 22 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" 23 #include "mlir/Target/LLVMIR/Export.h" 24 25 #include "llvm/IR/Constants.h" 26 #include "llvm/IR/GlobalVariable.h" 27 #include "llvm/IR/Module.h" 28 #include "llvm/IRReader/IRReader.h" 29 #include "llvm/Linker/Linker.h" 30 31 #include "llvm/MC/MCAsmBackend.h" 32 #include "llvm/MC/MCAsmInfo.h" 33 #include "llvm/MC/MCCodeEmitter.h" 34 #include "llvm/MC/MCContext.h" 35 #include "llvm/MC/MCInstrInfo.h" 36 #include "llvm/MC/MCObjectFileInfo.h" 37 #include "llvm/MC/MCObjectWriter.h" 38 #include "llvm/MC/MCParser/MCTargetAsmParser.h" 39 #include "llvm/MC/MCRegisterInfo.h" 40 #include "llvm/MC/MCStreamer.h" 41 #include "llvm/MC/MCSubtargetInfo.h" 42 #include "llvm/MC/TargetRegistry.h" 43 44 #include "llvm/Support/CommandLine.h" 45 #include "llvm/Support/FileUtilities.h" 46 #include "llvm/Support/Path.h" 47 #include "llvm/Support/Program.h" 48 #include "llvm/Support/SourceMgr.h" 49 #include "llvm/Support/TargetSelect.h" 50 #include "llvm/Support/WithColor.h" 51 52 #include "llvm/Target/TargetMachine.h" 53 #include "llvm/Target/TargetOptions.h" 54 55 #include "llvm/Transforms/IPO/Internalize.h" 56 57 #include <mutex> 58 59 using namespace mlir; 60 61 namespace { 62 class SerializeToHsacoPass 63 : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> { 64 public: 65 MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToHsacoPass) 66 67 SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features, 68 int optLevel); 69 SerializeToHsacoPass(const SerializeToHsacoPass &other); 70 StringRef getArgument() const override { return "gpu-to-hsaco"; } 71 StringRef getDescription() const override { 72 return "Lower GPU kernel function to HSACO binary annotations"; 73 } 74 75 protected: 76 Option<int> optLevel{ 77 *this, "opt-level", 78 llvm::cl::desc("Optimization level for HSACO compilation"), 79 llvm::cl::init(2)}; 80 81 Option<std::string> rocmPath{*this, "rocm-path", 82 llvm::cl::desc("Path to ROCm install")}; 83 84 // Overload to allow linking in device libs 85 std::unique_ptr<llvm::Module> 86 translateToLLVMIR(llvm::LLVMContext &llvmContext) override; 87 88 /// Adds LLVM optimization passes 89 LogicalResult optimizeLlvm(llvm::Module &llvmModule, 90 llvm::TargetMachine &targetMachine) override; 91 92 private: 93 void getDependentDialects(DialectRegistry ®istry) const override; 94 95 // Loads LLVM bitcode libraries 96 Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> 97 loadLibraries(SmallVectorImpl<char> &path, 98 SmallVectorImpl<StringRef> &libraries, 99 llvm::LLVMContext &context); 100 101 // Serializes ROCDL to HSACO. 102 std::unique_ptr<std::vector<char>> 103 serializeISA(const std::string &isa) override; 104 105 std::unique_ptr<SmallVectorImpl<char>> assembleIsa(const std::string &isa); 106 std::unique_ptr<std::vector<char>> 107 createHsaco(const SmallVectorImpl<char> &isaBinary); 108 109 std::string getRocmPath(); 110 }; 111 } // namespace 112 113 SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other) 114 : PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass>(other) {} 115 116 /// Get a user-specified path to ROCm 117 // Tries, in order, the --rocm-path option, the ROCM_PATH environment variable 118 // and a compile-time default 119 std::string SerializeToHsacoPass::getRocmPath() { 120 if (rocmPath.getNumOccurrences() > 0) 121 return rocmPath.getValue(); 122 123 return __DEFAULT_ROCM_PATH__; 124 } 125 126 // Sets the 'option' to 'value' unless it already has a value. 127 static void maybeSetOption(Pass::Option<std::string> &option, 128 function_ref<std::string()> getValue) { 129 if (!option.hasValue()) 130 option = getValue(); 131 } 132 133 SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch, 134 StringRef features, int optLevel) { 135 maybeSetOption(this->triple, [&triple] { return triple.str(); }); 136 maybeSetOption(this->chip, [&arch] { return arch.str(); }); 137 maybeSetOption(this->features, [&features] { return features.str(); }); 138 if (this->optLevel.getNumOccurrences() == 0) 139 this->optLevel.setValue(optLevel); 140 } 141 142 void SerializeToHsacoPass::getDependentDialects( 143 DialectRegistry ®istry) const { 144 registerROCDLDialectTranslation(registry); 145 gpu::SerializeToBlobPass::getDependentDialects(registry); 146 } 147 148 Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> 149 SerializeToHsacoPass::loadLibraries(SmallVectorImpl<char> &path, 150 SmallVectorImpl<StringRef> &libraries, 151 llvm::LLVMContext &context) { 152 SmallVector<std::unique_ptr<llvm::Module>, 3> ret; 153 size_t dirLength = path.size(); 154 155 if (!llvm::sys::fs::is_directory(path)) { 156 getOperation().emitRemark() << "Bitcode path: " << path 157 << " does not exist or is not a directory\n"; 158 return llvm::None; 159 } 160 161 for (const StringRef file : libraries) { 162 llvm::SMDiagnostic error; 163 llvm::sys::path::append(path, file); 164 llvm::StringRef pathRef(path.data(), path.size()); 165 std::unique_ptr<llvm::Module> library = 166 llvm::getLazyIRFileModule(pathRef, error, context); 167 path.truncate(dirLength); 168 if (!library) { 169 getOperation().emitError() << "Failed to load library " << file 170 << " from " << path << error.getMessage(); 171 return llvm::None; 172 } 173 // Some ROCM builds don't strip this like they should 174 if (auto *openclVersion = library->getNamedMetadata("opencl.ocl.version")) 175 library->eraseNamedMetadata(openclVersion); 176 // Stop spamming us with clang version numbers 177 if (auto *ident = library->getNamedMetadata("llvm.ident")) 178 library->eraseNamedMetadata(ident); 179 ret.push_back(std::move(library)); 180 } 181 182 return ret; 183 } 184 185 std::unique_ptr<llvm::Module> 186 SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) { 187 // MLIR -> LLVM translation 188 std::unique_ptr<llvm::Module> ret = 189 gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext); 190 191 if (!ret) { 192 getOperation().emitOpError("Module lowering failed"); 193 return ret; 194 } 195 // Walk the LLVM module in order to determine if we need to link in device 196 // libs 197 bool needOpenCl = false; 198 bool needOckl = false; 199 bool needOcml = false; 200 for (llvm::Function &f : ret->functions()) { 201 if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) { 202 StringRef funcName = f.getName(); 203 if ("printf" == funcName) 204 needOpenCl = true; 205 if (funcName.startswith("__ockl_")) 206 needOckl = true; 207 if (funcName.startswith("__ocml_")) 208 needOcml = true; 209 } 210 } 211 212 if (needOpenCl) 213 needOcml = needOckl = true; 214 215 // No libraries needed (the typical case) 216 if (!(needOpenCl || needOcml || needOckl)) 217 return ret; 218 219 // Define one of the control constants the ROCm device libraries expect to be 220 // present These constants can either be defined in the module or can be 221 // imported by linking in bitcode that defines the constant. To simplify our 222 // logic, we define the constants into the module we are compiling 223 auto addControlConstant = [&module = *ret](StringRef name, uint32_t value, 224 uint32_t bitwidth) { 225 using llvm::GlobalVariable; 226 if (module.getNamedGlobal(name)) { 227 return; 228 } 229 llvm::IntegerType *type = 230 llvm::IntegerType::getIntNTy(module.getContext(), bitwidth); 231 auto *initializer = llvm::ConstantInt::get(type, value, /*isSigned=*/false); 232 auto *constant = new GlobalVariable( 233 module, type, 234 /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage, 235 initializer, name, 236 /*before=*/nullptr, 237 /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal, 238 /*addressSpace=*/4); 239 constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local); 240 constant->setVisibility( 241 GlobalVariable::VisibilityTypes::ProtectedVisibility); 242 constant->setAlignment(llvm::MaybeAlign(bitwidth / 8)); 243 }; 244 245 // Set up control variables in the module instead of linking in tiny bitcode 246 if (needOcml) { 247 // TODO(kdrewnia): Enable math optimizations once we have support for 248 // `-ffast-math`-like options 249 addControlConstant("__oclc_finite_only_opt", 0, 8); 250 addControlConstant("__oclc_daz_opt", 0, 8); 251 addControlConstant("__oclc_correctly_rounded_sqrt32", 1, 8); 252 addControlConstant("__oclc_unsafe_math_opt", 0, 8); 253 } 254 if (needOcml || needOckl) { 255 addControlConstant("__oclc_wavefrontsize64", 1, 8); 256 StringRef chipSet = this->chip.getValue(); 257 if (chipSet.startswith("gfx")) 258 chipSet = chipSet.substr(3); 259 uint32_t minor = 260 llvm::APInt(32, chipSet.substr(chipSet.size() - 2), 16).getZExtValue(); 261 uint32_t major = llvm::APInt(32, chipSet.substr(0, chipSet.size() - 2), 10) 262 .getZExtValue(); 263 uint32_t isaNumber = minor + 1000 * major; 264 addControlConstant("__oclc_ISA_version", isaNumber, 32); 265 266 // This constant must always match the default code object ABI version 267 // of the AMDGPU backend. 268 addControlConstant("__oclc_ABI_version", 400, 32); 269 } 270 271 // Determine libraries we need to link - order matters due to dependencies 272 llvm::SmallVector<StringRef, 4> libraries; 273 if (needOpenCl) 274 libraries.push_back("opencl.bc"); 275 if (needOcml) 276 libraries.push_back("ocml.bc"); 277 if (needOckl) 278 libraries.push_back("ockl.bc"); 279 280 Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> mbModules; 281 std::string theRocmPath = getRocmPath(); 282 llvm::SmallString<32> bitcodePath(theRocmPath); 283 llvm::sys::path::append(bitcodePath, "amdgcn", "bitcode"); 284 mbModules = loadLibraries(bitcodePath, libraries, llvmContext); 285 286 if (!mbModules) { 287 getOperation() 288 .emitWarning("Could not load required device libraries") 289 .attachNote() 290 << "This will probably cause link-time or run-time failures"; 291 return ret; // We can still abort here 292 } 293 294 llvm::Linker linker(*ret); 295 for (std::unique_ptr<llvm::Module> &libModule : *mbModules) { 296 // This bitcode linking code is substantially similar to what is used in 297 // hip-clang It imports the library functions into the module, allowing LLVM 298 // optimization passes (which must run after linking) to optimize across the 299 // libraries and the module's code. We also only import symbols if they are 300 // referenced by the module or a previous library since there will be no 301 // other source of references to those symbols in this compilation and since 302 // we don't want to bloat the resulting code object. 303 bool err = linker.linkInModule( 304 std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded, 305 [](llvm::Module &m, const StringSet<> &gvs) { 306 llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) { 307 return !gv.hasName() || (gvs.count(gv.getName()) == 0); 308 }); 309 }); 310 // True is linker failure 311 if (err) { 312 getOperation().emitError( 313 "Unrecoverable failure during device library linking."); 314 // We have no guaranties about the state of `ret`, so bail 315 return nullptr; 316 } 317 } 318 319 return ret; 320 } 321 322 LogicalResult 323 SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule, 324 llvm::TargetMachine &targetMachine) { 325 int optLevel = this->optLevel.getValue(); 326 if (optLevel < 0 || optLevel > 3) 327 return getOperation().emitError() 328 << "Invalid HSA optimization level" << optLevel << "\n"; 329 330 targetMachine.setOptLevel(static_cast<llvm::CodeGenOpt::Level>(optLevel)); 331 332 auto transformer = 333 makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine); 334 auto error = transformer(&llvmModule); 335 if (error) { 336 InFlightDiagnostic mlirError = getOperation()->emitError(); 337 llvm::handleAllErrors( 338 std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) { 339 mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n"; 340 }); 341 return mlirError; 342 } 343 return success(); 344 } 345 346 std::unique_ptr<SmallVectorImpl<char>> 347 SerializeToHsacoPass::assembleIsa(const std::string &isa) { 348 auto loc = getOperation().getLoc(); 349 350 SmallVector<char, 0> result; 351 llvm::raw_svector_ostream os(result); 352 353 llvm::Triple triple(llvm::Triple::normalize(this->triple)); 354 std::string error; 355 const llvm::Target *target = 356 llvm::TargetRegistry::lookupTarget(triple.normalize(), error); 357 if (!target) { 358 emitError(loc, Twine("failed to lookup target: ") + error); 359 return {}; 360 } 361 362 llvm::SourceMgr srcMgr; 363 srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), 364 SMLoc()); 365 366 const llvm::MCTargetOptions mcOptions; 367 std::unique_ptr<llvm::MCRegisterInfo> mri( 368 target->createMCRegInfo(this->triple)); 369 std::unique_ptr<llvm::MCAsmInfo> mai( 370 target->createMCAsmInfo(*mri, this->triple, mcOptions)); 371 mai->setRelaxELFRelocations(true); 372 std::unique_ptr<llvm::MCSubtargetInfo> sti( 373 target->createMCSubtargetInfo(this->triple, this->chip, this->features)); 374 375 llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr, 376 &mcOptions); 377 std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo( 378 ctx, /*PIC=*/false, /*LargeCodeModel=*/false)); 379 ctx.setObjectFileInfo(mofi.get()); 380 381 SmallString<128> cwd; 382 if (!llvm::sys::fs::current_path(cwd)) 383 ctx.setCompilationDir(cwd); 384 385 std::unique_ptr<llvm::MCStreamer> mcStreamer; 386 std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo()); 387 388 llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx); 389 llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions); 390 mcStreamer.reset(target->createMCObjectStreamer( 391 triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab), 392 mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce), 393 *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, 394 /*DWARFMustBeAtTheEnd*/ false)); 395 mcStreamer->setUseAssemblerInfoForParsing(true); 396 397 std::unique_ptr<llvm::MCAsmParser> parser( 398 createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); 399 std::unique_ptr<llvm::MCTargetAsmParser> tap( 400 target->createMCAsmParser(*sti, *parser, *mcii, mcOptions)); 401 402 if (!tap) { 403 emitError(loc, "assembler initialization error"); 404 return {}; 405 } 406 407 parser->setTargetParser(*tap); 408 parser->Run(false); 409 410 return std::make_unique<SmallVector<char, 0>>(std::move(result)); 411 } 412 413 std::unique_ptr<std::vector<char>> 414 SerializeToHsacoPass::createHsaco(const SmallVectorImpl<char> &isaBinary) { 415 auto loc = getOperation().getLoc(); 416 417 // Save the ISA binary to a temp file. 418 int tempIsaBinaryFd = -1; 419 SmallString<128> tempIsaBinaryFilename; 420 if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd, 421 tempIsaBinaryFilename)) { 422 emitError(loc, "temporary file for ISA binary creation error"); 423 return {}; 424 } 425 llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename); 426 llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true); 427 tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size()); 428 tempIsaBinaryOs.close(); 429 430 // Create a temp file for HSA code object. 431 int tempHsacoFD = -1; 432 SmallString<128> tempHsacoFilename; 433 if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD, 434 tempHsacoFilename)) { 435 emitError(loc, "temporary file for HSA code object creation error"); 436 return {}; 437 } 438 llvm::FileRemover cleanupHsaco(tempHsacoFilename); 439 440 std::string theRocmPath = getRocmPath(); 441 llvm::SmallString<32> lldPath(theRocmPath); 442 llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld"); 443 int lldResult = llvm::sys::ExecuteAndWait( 444 lldPath, 445 {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename}); 446 if (lldResult != 0) { 447 emitError(loc, "lld invocation error"); 448 return {}; 449 } 450 451 // Load the HSA code object. 452 auto hsacoFile = openInputFile(tempHsacoFilename); 453 if (!hsacoFile) { 454 emitError(loc, "read HSA code object from temp file error"); 455 return {}; 456 } 457 458 StringRef buffer = hsacoFile->getBuffer(); 459 return std::make_unique<std::vector<char>>(buffer.begin(), buffer.end()); 460 } 461 462 std::unique_ptr<std::vector<char>> 463 SerializeToHsacoPass::serializeISA(const std::string &isa) { 464 auto isaBinary = assembleIsa(isa); 465 if (!isaBinary) 466 return {}; 467 return createHsaco(*isaBinary); 468 } 469 470 // Register pass to serialize GPU kernel functions to a HSACO binary annotation. 471 void mlir::registerGpuSerializeToHsacoPass() { 472 PassRegistration<SerializeToHsacoPass> registerSerializeToHSACO( 473 [] { 474 // Initialize LLVM AMDGPU backend. 475 LLVMInitializeAMDGPUAsmParser(); 476 LLVMInitializeAMDGPUAsmPrinter(); 477 LLVMInitializeAMDGPUTarget(); 478 LLVMInitializeAMDGPUTargetInfo(); 479 LLVMInitializeAMDGPUTargetMC(); 480 481 return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "", 482 "", 2); 483 }); 484 } 485 486 /// Create an instance of the GPU kernel function to HSAco binary serialization 487 /// pass. 488 std::unique_ptr<Pass> mlir::createGpuSerializeToHsacoPass(StringRef triple, 489 StringRef arch, 490 StringRef features, 491 int optLevel) { 492 return std::make_unique<SerializeToHsacoPass>(triple, arch, features, 493 optLevel); 494 } 495 496 #else // MLIR_GPU_TO_HSACO_PASS_ENABLE 497 void mlir::registerGpuSerializeToHsacoPass() {} 498 #endif // MLIR_GPU_TO_HSACO_PASS_ENABLE 499