//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// \brief This post-linking pass replaces the function pointer of enqueued
// block kernel with a global variable (runtime handle) and adds
// "runtime-handle" attribute to the enqueued block kernel.
//
// In LLVM CodeGen the runtime-handle metadata will be translated to
// RuntimeHandle metadata in code object. Runtime allocates a global buffer
// for each kernel with RuntimeHandle metadata and saves the kernel address
// required for the AQL packet into the buffer. __enqueue_kernel function
// in device library knows that the invoke function pointer in the block
// literal is actually runtime handle and loads the kernel address from it
// and puts it into AQL packet for dispatching.
//
// This cannot be done in FE since FE cannot create a unique global variable
// with external linkage across LLVM modules. The global variable with internal
// linkage does not work since optimization passes will try to replace loads
// of the global variable with its initialization value.
//
// It also identifies the kernels that directly or indirectly enqueue kernels
// and adds "calls-enqueue-kernel" function attribute to them, which will
// be used to determine whether to emit runtime metadata for the kernel
// enqueue related hidden kernel arguments.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

// Also reused below as the pass name argument of INITIALIZE_PASS.
#define DEBUG_TYPE "amdgpu-lower-enqueued-block"

using namespace llvm;

namespace {

/// \brief Module pass that lowers OpenCL enqueued blocks.
///
/// The class is local to this file (anonymous namespace); it is reached from
/// outside through createAMDGPUOpenCLEnqueuedBlockLoweringPass() and
/// AMDGPUOpenCLEnqueuedBlockLoweringID, both defined further down.
class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
public:
  /// Storage handed to the ModulePass constructor to identify this pass.
  static char ID;

  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}

private:
  /// Runs the lowering over \p M; returns true if the module was modified.
  bool runOnModule(Module &M) override;
};

} // end anonymous namespace

char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;

// Exposes the file-local pass ID under the llvm namespace by reference.
char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
    AMDGPUOpenCLEnqueuedBlockLowering::ID;

// NOTE(review): the two trailing `false` arguments are presumably the
// "CFG only" and "is analysis" flags of INITIALIZE_PASS - confirm against
// PassSupport.h.
INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
                "Lower OpenCL enqueued blocks", false, false)

/// Creates a new heap-allocated instance of the pass. The returned pointer is
/// not released here; the caller is responsible for it.
ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
  return new AMDGPUOpenCLEnqueuedBlockLowering();
}

/// Collect direct or indirect callers of \p F and save them
/// to \p Callers.
79 static void collectCallers(Function *F, DenseSet<Function *> &Callers) { 80 for (auto U : F->users()) { 81 if (auto *CI = dyn_cast<CallInst>(&*U)) { 82 auto *Caller = CI->getParent()->getParent(); 83 if (Callers.count(Caller)) 84 continue; 85 Callers.insert(Caller); 86 collectCallers(Caller, Callers); 87 } 88 } 89 } 90 91 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { 92 DenseSet<Function *> Callers; 93 auto &C = M.getContext(); 94 auto AS = AMDGPU::getAMDGPUAS(M); 95 bool Changed = false; 96 for (auto &F : M.functions()) { 97 if (F.hasFnAttribute("enqueued-block")) { 98 if (!F.hasOneUse() || !F.user_begin()->hasOneUse() || 99 !isa<ConstantExpr>(*F.user_begin()) || 100 !isa<ConstantExpr>(*F.user_begin()->user_begin())) { 101 continue; 102 } 103 auto *BitCast = cast<ConstantExpr>(*F.user_begin()); 104 auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin()); 105 auto RuntimeHandle = (F.getName() + "_runtime_handle").str(); 106 auto *GV = new GlobalVariable( 107 M, Type::getInt8Ty(C)->getPointerTo(AS.GLOBAL_ADDRESS), 108 /*IsConstant=*/true, GlobalValue::ExternalLinkage, 109 /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, 110 GlobalValue::NotThreadLocal, AS.GLOBAL_ADDRESS, 111 /*IsExternallyInitialized=*/true); 112 DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); 113 auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType()); 114 AddrCast->replaceAllUsesWith(NewPtr); 115 F.addFnAttr("runtime-handle", RuntimeHandle); 116 F.setLinkage(GlobalValue::ExternalLinkage); 117 118 // Collect direct or indirect callers of enqueue_kernel. 
119 for (auto U : NewPtr->users()) { 120 if (auto *I = dyn_cast<Instruction>(&*U)) { 121 auto *F = I->getParent()->getParent(); 122 Callers.insert(F); 123 collectCallers(F, Callers); 124 } 125 } 126 Changed = true; 127 } 128 } 129 130 for (auto F : Callers) { 131 if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) 132 continue; 133 F->addFnAttr("calls-enqueue-kernel"); 134 } 135 return Changed; 136 } 137