//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// \brief This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
// In LLVM CodeGen the runtime-handle metadata will be translated to
// RuntimeHandle metadata in code object. Runtime allocates a global buffer
// for each kernel with RuntimeHandle metadata and saves the kernel address
// required for the AQL packet into the buffer. __enqueue_kernel function
// in device library knows that the invoke function pointer in the block
// literal is actually runtime handle and loads the kernel address from it
// and puts it into the AQL packet for dispatching.
//
// This cannot be done in FE since FE cannot create a unique global variable
// with external linkage across LLVM modules. The global variable with internal
// linkage does not work since optimization passes will try to replace loads
// of the global variable with its initialization value.
//
// It also identifies the kernels that directly or indirectly enqueue kernels
// and adds "calls-enqueue-kernel" function attribute to them, which will
// be used to determine whether to emit runtime metadata for the kernel
// enqueue related hidden kernel arguments.
32 // 33 //===----------------------------------------------------------------------===// 34 35 #include "AMDGPU.h" 36 #include "llvm/ADT/DenseSet.h" 37 #include "llvm/ADT/StringRef.h" 38 #include "llvm/IR/Constants.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/Module.h" 41 #include "llvm/IR/User.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/Debug.h" 44 #include "llvm/Support/raw_ostream.h" 45 46 #define DEBUG_TYPE "amdgpu-lower-enqueued-block" 47 48 using namespace llvm; 49 50 namespace { 51 52 /// \brief Lower enqueued blocks. 53 class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { 54 public: 55 static char ID; 56 57 explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {} 58 59 private: 60 bool runOnModule(Module &M) override; 61 }; 62 63 } // end anonymous namespace 64 65 char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; 66 67 char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = 68 AMDGPUOpenCLEnqueuedBlockLowering::ID; 69 70 INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, 71 "Lower OpenCL enqueued blocks", false, false) 72 73 ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { 74 return new AMDGPUOpenCLEnqueuedBlockLowering(); 75 } 76 77 /// Collect direct or indrect callers of \p F and save them 78 /// to \p Callers. 
79 static void collectCallers(Function *F, DenseSet<Function *> &Callers) { 80 for (auto U : F->users()) { 81 if (auto *CI = dyn_cast<CallInst>(&*U)) { 82 auto *Caller = CI->getParent()->getParent(); 83 if (Callers.count(Caller)) 84 continue; 85 Callers.insert(Caller); 86 collectCallers(Caller, Callers); 87 } 88 } 89 } 90 91 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { 92 DenseSet<Function *> Callers; 93 auto &C = M.getContext(); 94 bool Changed = false; 95 for (auto &F : M.functions()) { 96 if (F.hasFnAttribute("enqueued-block")) { 97 if (!F.hasOneUse() || !F.user_begin()->hasOneUse() || 98 !isa<ConstantExpr>(*F.user_begin()) || 99 !isa<ConstantExpr>(*F.user_begin()->user_begin())) { 100 continue; 101 } 102 auto *BitCast = cast<ConstantExpr>(*F.user_begin()); 103 auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin()); 104 auto RuntimeHandle = (F.getName() + "_runtime_handle").str(); 105 auto *GV = new GlobalVariable( 106 M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), 107 /*IsConstant=*/true, GlobalValue::ExternalLinkage, 108 /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, 109 GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, 110 /*IsExternallyInitialized=*/true); 111 DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); 112 auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType()); 113 AddrCast->replaceAllUsesWith(NewPtr); 114 F.addFnAttr("runtime-handle", RuntimeHandle); 115 F.setLinkage(GlobalValue::ExternalLinkage); 116 117 // Collect direct or indirect callers of enqueue_kernel. 118 for (auto U : NewPtr->users()) { 119 if (auto *I = dyn_cast<Instruction>(&*U)) { 120 auto *F = I->getParent()->getParent(); 121 Callers.insert(F); 122 collectCallers(F, Callers); 123 } 124 } 125 Changed = true; 126 } 127 } 128 129 for (auto F : Callers) { 130 if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) 131 continue; 132 F->addFnAttr("calls-enqueue-kernel"); 133 } 134 return Changed; 135 } 136