//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// \brief This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
// In LLVM CodeGen the "runtime-handle" attribute will be translated to
// RuntimeHandle metadata in code object. Runtime allocates a global buffer
// for each kernel with RuntimeHandle metadata and saves the kernel address
// required for the AQL packet into the buffer. __enqueue_kernel function
// in device library knows that the invoke function pointer in the block
// literal is actually runtime handle and loads the kernel address from it
// and puts it into AQL packet for dispatching.
//
// This cannot be done in FE since FE cannot create a unique global variable
// with external linkage across LLVM modules. The global variable with internal
// linkage does not work since optimization passes will try to replace loads
// of the global variable with its initialization value.
//
// It also identifies the kernels that directly or indirectly enqueue kernels
// and adds "calls-enqueue-kernel" function attribute to them, which will
// be used to determine whether to emit runtime metadata for the kernel
// enqueue related hidden kernel arguments.
32 // 33 //===----------------------------------------------------------------------===// 34 35 #include "AMDGPU.h" 36 #include "llvm/ADT/DenseSet.h" 37 #include "llvm/ADT/StringRef.h" 38 #include "llvm/IR/Constants.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/Mangler.h" 41 #include "llvm/IR/Module.h" 42 #include "llvm/IR/User.h" 43 #include "llvm/Pass.h" 44 #include "llvm/Support/Debug.h" 45 #include "llvm/Support/raw_ostream.h" 46 47 #define DEBUG_TYPE "amdgpu-lower-enqueued-block" 48 49 using namespace llvm; 50 51 namespace { 52 53 /// \brief Lower enqueued blocks. 54 class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { 55 public: 56 static char ID; 57 58 explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {} 59 60 private: 61 bool runOnModule(Module &M) override; 62 }; 63 64 } // end anonymous namespace 65 66 char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; 67 68 char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = 69 AMDGPUOpenCLEnqueuedBlockLowering::ID; 70 71 INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, 72 "Lower OpenCL enqueued blocks", false, false) 73 74 ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { 75 return new AMDGPUOpenCLEnqueuedBlockLowering(); 76 } 77 78 /// Collect direct or indrect callers of \p F and save them 79 /// to \p Callers. 
80 static void collectCallers(Function *F, DenseSet<Function *> &Callers) { 81 for (auto U : F->users()) { 82 if (auto *CI = dyn_cast<CallInst>(&*U)) { 83 auto *Caller = CI->getParent()->getParent(); 84 if (Callers.count(Caller)) 85 continue; 86 Callers.insert(Caller); 87 collectCallers(Caller, Callers); 88 } 89 } 90 } 91 92 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { 93 DenseSet<Function *> Callers; 94 auto &C = M.getContext(); 95 bool Changed = false; 96 for (auto &F : M.functions()) { 97 if (F.hasFnAttribute("enqueued-block")) { 98 if (!F.hasName()) { 99 SmallString<64> Name; 100 Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel", 101 M.getDataLayout()); 102 F.setName(Name); 103 } 104 auto RuntimeHandle = (F.getName() + ".runtime_handle").str(); 105 auto *GV = new GlobalVariable( 106 M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), 107 /*IsConstant=*/true, GlobalValue::ExternalLinkage, 108 /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, 109 GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, 110 /*IsExternallyInitialized=*/true); 111 DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); 112 113 for (auto U : F.users()) { 114 if (!isa<ConstantExpr>(&*U)) 115 continue; 116 auto *BitCast = cast<ConstantExpr>(&*U); 117 auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType()); 118 BitCast->replaceAllUsesWith(NewPtr); 119 F.addFnAttr("runtime-handle", RuntimeHandle); 120 F.setLinkage(GlobalValue::ExternalLinkage); 121 122 // Collect direct or indirect callers of enqueue_kernel. 123 for (auto U : NewPtr->users()) { 124 if (auto *I = dyn_cast<Instruction>(&*U)) { 125 auto *F = I->getParent()->getParent(); 126 Callers.insert(F); 127 collectCallers(F, Callers); 128 } 129 } 130 Changed = true; 131 } 132 } 133 } 134 135 for (auto F : Callers) { 136 if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) 137 continue; 138 F->addFnAttr("calls-enqueue-kernel"); 139 } 140 return Changed; 141 } 142