12cab237bSDimitry Andric //===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
22cab237bSDimitry Andric //
32cab237bSDimitry Andric // The LLVM Compiler Infrastructure
42cab237bSDimitry Andric //
52cab237bSDimitry Andric // This file is distributed under the University of Illinois Open Source
62cab237bSDimitry Andric // License. See LICENSE.TXT for details.
72cab237bSDimitry Andric //
82cab237bSDimitry Andric //===----------------------------------------------------------------------===//
92cab237bSDimitry Andric //
102cab237bSDimitry Andric // \file
11*4ba319b5SDimitry Andric // This post-linking pass replaces the function pointer of enqueued
122cab237bSDimitry Andric // block kernel with a global variable (runtime handle) and adds
132cab237bSDimitry Andric // "runtime-handle" attribute to the enqueued block kernel.
142cab237bSDimitry Andric //
// In LLVM CodeGen the runtime-handle metadata will be translated to
// RuntimeHandle metadata in code object. Runtime allocates a global buffer
// for each kernel with RuntimeHandle metadata and saves the kernel address
// required for the AQL packet into the buffer. __enqueue_kernel function
// in device library knows that the invoke function pointer in the block
// literal is actually runtime handle and loads the kernel address from it
// and puts it into AQL packet for dispatching.
222cab237bSDimitry Andric //
232cab237bSDimitry Andric // This cannot be done in FE since FE cannot create a unique global variable
242cab237bSDimitry Andric // with external linkage across LLVM modules. The global variable with internal
252cab237bSDimitry Andric // linkage does not work since optimization passes will try to replace loads
262cab237bSDimitry Andric // of the global variable with its initialization value.
272cab237bSDimitry Andric //
// It also identifies the kernels that directly or indirectly enqueue kernels
// and adds "calls-enqueue-kernel" function attribute to them, which will
// be used to determine whether to emit runtime metadata for the kernel
// enqueue related hidden kernel arguments.
322cab237bSDimitry Andric //
332cab237bSDimitry Andric //===----------------------------------------------------------------------===//
342cab237bSDimitry Andric
352cab237bSDimitry Andric #include "AMDGPU.h"
362cab237bSDimitry Andric #include "llvm/ADT/DenseSet.h"
372cab237bSDimitry Andric #include "llvm/ADT/StringRef.h"
382cab237bSDimitry Andric #include "llvm/IR/Constants.h"
39*4ba319b5SDimitry Andric #include "llvm/IR/DerivedTypes.h"
402cab237bSDimitry Andric #include "llvm/IR/Instructions.h"
41*4ba319b5SDimitry Andric #include "llvm/IR/Mangler.h"
422cab237bSDimitry Andric #include "llvm/IR/Module.h"
432cab237bSDimitry Andric #include "llvm/IR/User.h"
442cab237bSDimitry Andric #include "llvm/Pass.h"
452cab237bSDimitry Andric #include "llvm/Support/Debug.h"
462cab237bSDimitry Andric #include "llvm/Support/raw_ostream.h"
472cab237bSDimitry Andric
482cab237bSDimitry Andric #define DEBUG_TYPE "amdgpu-lower-enqueued-block"
492cab237bSDimitry Andric
502cab237bSDimitry Andric using namespace llvm;
512cab237bSDimitry Andric
522cab237bSDimitry Andric namespace {
532cab237bSDimitry Andric
54*4ba319b5SDimitry Andric /// Lower enqueued blocks.
552cab237bSDimitry Andric class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
562cab237bSDimitry Andric public:
572cab237bSDimitry Andric static char ID;
582cab237bSDimitry Andric
AMDGPUOpenCLEnqueuedBlockLowering()592cab237bSDimitry Andric explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
602cab237bSDimitry Andric
612cab237bSDimitry Andric private:
622cab237bSDimitry Andric bool runOnModule(Module &M) override;
632cab237bSDimitry Andric };
642cab237bSDimitry Andric
652cab237bSDimitry Andric } // end anonymous namespace
662cab237bSDimitry Andric
672cab237bSDimitry Andric char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
682cab237bSDimitry Andric
692cab237bSDimitry Andric char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
702cab237bSDimitry Andric AMDGPUOpenCLEnqueuedBlockLowering::ID;
712cab237bSDimitry Andric
722cab237bSDimitry Andric INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
732cab237bSDimitry Andric "Lower OpenCL enqueued blocks", false, false)
742cab237bSDimitry Andric
createAMDGPUOpenCLEnqueuedBlockLoweringPass()752cab237bSDimitry Andric ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
762cab237bSDimitry Andric return new AMDGPUOpenCLEnqueuedBlockLowering();
772cab237bSDimitry Andric }
782cab237bSDimitry Andric
792cab237bSDimitry Andric /// Collect direct or indrect callers of \p F and save them
802cab237bSDimitry Andric /// to \p Callers.
collectCallers(Function * F,DenseSet<Function * > & Callers)812cab237bSDimitry Andric static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
822cab237bSDimitry Andric for (auto U : F->users()) {
832cab237bSDimitry Andric if (auto *CI = dyn_cast<CallInst>(&*U)) {
842cab237bSDimitry Andric auto *Caller = CI->getParent()->getParent();
85*4ba319b5SDimitry Andric if (Callers.insert(Caller).second)
862cab237bSDimitry Andric collectCallers(Caller, Callers);
872cab237bSDimitry Andric }
882cab237bSDimitry Andric }
892cab237bSDimitry Andric }
902cab237bSDimitry Andric
91*4ba319b5SDimitry Andric /// If \p U is instruction or constant, collect functions which directly or
92*4ba319b5SDimitry Andric /// indirectly use it.
collectFunctionUsers(User * U,DenseSet<Function * > & Funcs)93*4ba319b5SDimitry Andric static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
94*4ba319b5SDimitry Andric if (auto *I = dyn_cast<Instruction>(U)) {
95*4ba319b5SDimitry Andric auto *F = I->getParent()->getParent();
96*4ba319b5SDimitry Andric if (Funcs.insert(F).second)
97*4ba319b5SDimitry Andric collectCallers(F, Funcs);
98*4ba319b5SDimitry Andric return;
99*4ba319b5SDimitry Andric }
100*4ba319b5SDimitry Andric if (!isa<Constant>(U))
101*4ba319b5SDimitry Andric return;
102*4ba319b5SDimitry Andric for (auto UU : U->users())
103*4ba319b5SDimitry Andric collectFunctionUsers(&*UU, Funcs);
104*4ba319b5SDimitry Andric }
105*4ba319b5SDimitry Andric
runOnModule(Module & M)1062cab237bSDimitry Andric bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
1072cab237bSDimitry Andric DenseSet<Function *> Callers;
1082cab237bSDimitry Andric auto &C = M.getContext();
1092cab237bSDimitry Andric bool Changed = false;
1102cab237bSDimitry Andric for (auto &F : M.functions()) {
1112cab237bSDimitry Andric if (F.hasFnAttribute("enqueued-block")) {
112*4ba319b5SDimitry Andric if (!F.hasName()) {
113*4ba319b5SDimitry Andric SmallString<64> Name;
114*4ba319b5SDimitry Andric Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
115*4ba319b5SDimitry Andric M.getDataLayout());
116*4ba319b5SDimitry Andric F.setName(Name);
1172cab237bSDimitry Andric }
118*4ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
119*4ba319b5SDimitry Andric auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
120*4ba319b5SDimitry Andric auto T = ArrayType::get(Type::getInt64Ty(C), 2);
1212cab237bSDimitry Andric auto *GV = new GlobalVariable(
122*4ba319b5SDimitry Andric M, T,
123*4ba319b5SDimitry Andric /*IsConstant=*/false, GlobalValue::ExternalLinkage,
124*4ba319b5SDimitry Andric /*Initializer=*/Constant::getNullValue(T), RuntimeHandle,
125*4ba319b5SDimitry Andric /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
126*4ba319b5SDimitry Andric AMDGPUAS::GLOBAL_ADDRESS,
127*4ba319b5SDimitry Andric /*IsExternallyInitialized=*/false);
128*4ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
129*4ba319b5SDimitry Andric
130*4ba319b5SDimitry Andric for (auto U : F.users()) {
131*4ba319b5SDimitry Andric auto *UU = &*U;
132*4ba319b5SDimitry Andric if (!isa<ConstantExpr>(UU))
133*4ba319b5SDimitry Andric continue;
134*4ba319b5SDimitry Andric collectFunctionUsers(UU, Callers);
135*4ba319b5SDimitry Andric auto *BitCast = cast<ConstantExpr>(UU);
136*4ba319b5SDimitry Andric auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
137*4ba319b5SDimitry Andric BitCast->replaceAllUsesWith(NewPtr);
1382cab237bSDimitry Andric F.addFnAttr("runtime-handle", RuntimeHandle);
1392cab237bSDimitry Andric F.setLinkage(GlobalValue::ExternalLinkage);
1402cab237bSDimitry Andric Changed = true;
1412cab237bSDimitry Andric }
1422cab237bSDimitry Andric }
143*4ba319b5SDimitry Andric }
1442cab237bSDimitry Andric
1452cab237bSDimitry Andric for (auto F : Callers) {
1462cab237bSDimitry Andric if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
1472cab237bSDimitry Andric continue;
1482cab237bSDimitry Andric F->addFnAttr("calls-enqueue-kernel");
149*4ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
1502cab237bSDimitry Andric }
1512cab237bSDimitry Andric return Changed;
1522cab237bSDimitry Andric }
153