//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates local data store, LDS, uses from non-kernel functions.
// LDS is contiguous memory allocated per kernel execution.
//
// Background.
//
// The programming model is global variables, or equivalently function local
// static variables, accessible from kernels or other functions. For uses from
// kernels this is straightforward - assign an integer to the kernel for the
// memory required by all the variables combined, allocate them within that.
// For uses from functions there are performance tradeoffs to choose between.
//
// This model means the GPU runtime can specify the amount of memory allocated.
// If this is more than the kernel assumed, the excess can be made available
// using a language specific feature, which IR represents as a variable with
// no initializer. This feature is referred to here as "Dynamic LDS" and is
// lowered slightly differently from the normal case.
//
// Consequences of this GPU feature:
// - memory is limited and exceeding it halts compilation
// - a global accessed by one kernel exists independent of other kernels
// - a global exists independent of simultaneous execution of the same kernel
// - the address of the global may be different from different kernels as they
//   do not alias, which permits only allocating variables they use
// - if the address is allowed to differ, functions need help to find it
//
// Uses from kernels are implemented here by grouping them in a per-kernel
// struct instance. This duplicates the variables, accurately modelling their
// aliasing properties relative to a single global representation. It also
// permits control over alignment via padding.
//
// Uses from functions are more complicated and are the primary purpose of this
// IR pass. Several different lowerings are chosen between to meet requirements
// to avoid allocating any LDS where it is not necessary, as that impacts
// occupancy and may fail the compilation, while not imposing overhead on a
// feature whose primary advantage over global memory is performance.
// The basic design goal is to avoid one kernel imposing overhead on another.
//
// Implementation.
//
// LDS variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
// Non-undef initializers are not yet implemented for LDS.
//
// LDS variables that are always allocated at the same address can be found
// by lookup at that address. Otherwise runtime information/cost is required.
//
// The simplest strategy possible is to group all LDS variables in a single
// struct and allocate that struct in every kernel such that the original
// variables are always at the same address. LDS is however a limited resource
// so this strategy is unusable in practice. It is not implemented here.
//
// Strategy | Precise allocation | Zero runtime cost | General purpose |
//  --------+--------------------+-------------------+-----------------+
//   Module | No                 | Yes               | Yes             |
//    Table | Yes                | No                | Yes             |
//   Kernel | Yes                | Yes               | No              |
//   Hybrid | Yes                | Partial           | Yes             |
//
// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
// for variables that are known reachable from a single kernel. "Hybrid" picks
// between all three. When forced to choose between LDS and cycles we minimise
// LDS use.

// The "module" lowering implemented here finds LDS variables which are used by
// non-kernel functions and creates a new struct with a field for each of those
// LDS variables. Variables that are only used from kernels are excluded.
//
// The "table" lowering implemented here has three components.
// First kernels are assigned a unique integer identifier which is available in
// functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer
// is passed through a specific SGPR, thus works with indirect calls.
// Second, each kernel allocates LDS variables independent of other kernels and
// writes the addresses it chose for each variable into an array in consistent
// order. If the kernel does not allocate a given variable, it writes undef to
// the corresponding array location. These arrays are written to a constant
// table in the order matching the kernel unique integer identifier.
// Third, uses from non-kernel functions are replaced with a table lookup using
// the intrinsic function to find the address of the variable.
//
// "Kernel" lowering is only applicable for variables that are unambiguously
// reachable from exactly one kernel. For those cases, accesses to the variable
// can be lowered to ConstantExpr address of a struct instance specific to that
// one kernel. This is zero cost in space and in compute. It will raise a fatal
// error on any variable that might be reachable from multiple kernels and is
// thus most easily used as part of the hybrid lowering strategy.
//
// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
// lowering where it can. It lowers the variable accessed by the greatest
// number of kernels using the module strategy as that is free for the first
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are. The remaining ones are lowered
// via table.
//
// Consequences
// - No heuristics or user controlled magic numbers, hybrid is the right choice
// - Kernels that don't use functions (or have had them all inlined) are not
//   affected by any lowering for kernels that do.
// - Kernels that don't make indirect function calls are not affected by those
//   that do.
// - Variables which are used by lots of kernels, e.g. those injected by a
//   language runtime in most kernels, are expected to have no overhead
// - Implementations that instantiate templates per-kernel where those templates
//   use LDS are expected to hit the "Kernel" lowering strategy
// - The runtime properties impose a cost in compiler implementation complexity
//
// Dynamic LDS implementation
// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
// same intrinsic to identify which kernel is at the root of the dynamic call
// graph. This relies on the specified behaviour that all dynamic LDS variables
// alias one another, i.e. are at the same address, with respect to a given
// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
// that allocates any dynamic LDS and builds a table of addresses out of those.
// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
// The corresponding optimisation for "kernel" lowering where the table lookup
// is elided is not implemented.
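//
// As an illustrative sketch of the table lowering (variable and value names
// here are assumed for illustration; the exact IR this pass emits may differ
// in detail), a use of an LDS variable from a non-kernel function is
// rewritten to roughly:
//
//   %id  = call i32 @llvm.amdgcn.lds.kernel.id()
//   %ptr = getelementptr inbounds [N x [M x i32]], ptr addrspace(4)
//            @llvm.amdgcn.lds.offset.table, i32 0, i32 %id, i32 <varindex>
//   %off = load i32, ptr addrspace(4) %ptr
//   %var = inttoptr i32 %off to ptr addrspace(3)
//
// where N is the number of kernels assigned an id and M the number of table
// lowered variables, matching replaceUseWithTableLookup below.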
//
//
// Implementation notes / limitations
// A single LDS global variable represents an instance per kernel that can
// reach said variable. This pass essentially specialises those variables per
// kernel. Handling ConstantExpr during the pass complicated this significantly
// so now all ConstantExpr uses of LDS variables are expanded to instructions.
// This may need amending when implementing non-undef initialisers.
//
// Lowering is split between this IR pass and the back end. This pass chooses
// where given variables should be allocated and marks them with metadata,
// MD_absolute_symbol. The backend places the variables in coincidentally the
// same location and raises a fatal error if something has gone awry. This works
// in practice because the only pass between this one and the backend that
// changes LDS is PromoteAlloca and the changes it makes do not conflict.
//
// Addresses are written to constant global arrays based on the same metadata.
//
// The backend lowers LDS variables in the order of traversal of the function.
// This is at odds with the deterministic layout required. The workaround is to
// allocate the fixed-address variables immediately upon starting the function
// where they can be placed as intended. This requires a means of mapping from
// the function to the variables that it allocates. For the module scope lds,
// this is via metadata indicating whether the variable is not required. If a
// pass deletes that metadata, a fatal error on disagreement with the absolute
// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
// correspondence between the function and the variable. It requires the
// kernel to have a name (which is only a limitation for tests in practice) and
// for nothing to rename the corresponding symbols. This is a hazard if the pass
// is run multiple times during debugging. Alternative schemes considered all
// involve bespoke metadata.
//
// If the name correspondence can be replaced, multiple distinct kernels that
// have the same memory layout can map to the same kernel id (as the address
// itself is handled by the absolute symbol metadata) and that will allow more
// uses of the "kernel" style faster lowering and reduce the size of the lookup
// tables.
//
// There is a test that checks this does not fire for a graphics shader. This
// lowering is expected to work for graphics if the isKernel test is changed.
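//
// For reference, the MD_absolute_symbol metadata written by
// recordLDSAbsoluteAddress below encodes a half open byte range, so a
// variable fixed at LDS address 8 is marked as (sketch):
//
//   @var = addrspace(3) global ... !absolute_symbol !0
//   !0 = !{i32 8, i32 9}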
//
// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
// before codegen. Replacing this with an equivalent intrinsic which lasts until
// shortly after the machine function lowering of LDS would help break the name
// mapping. The other part needed is probably to amend PromoteAlloca to embed
// the LDS variables it creates in the same struct created here. That avoids the
// current hazard where a PromoteAlloca LDS variable might be allocated before
// the kernel scope (and thus error on the address check). Given a new invariant
// that no LDS variables exist outside of the structs managed here, and an
// intrinsic that lasts until after the LDS frame lowering, it should be
// possible to drop the name mapping and fold equivalent memory layouts.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

#include <tuple>
#include <vector>

#include <cstdio>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

namespace {

cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

enum class LoweringKind { module, table, kernel, hybrid };
cl::opt<LoweringKind> LoweringKindLoc(
    "amdgpu-lower-module-lds-strategy",
    cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
    cl::init(LoweringKind::hybrid),
    cl::values(
        clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
        clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
        clEnumValN(
            LoweringKind::kernel, "kernel",
            "Lower variables reachable from one kernel, otherwise abort"),
        clEnumValN(LoweringKind::hybrid, "hybrid",
                   "Lower via mixture of above strategies")));

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv, it instead calls into
  // isModuleEntryFunction which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
  llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
  });
  return {std::move(V)};
}

class AMDGPULowerModuleLDS : public ModulePass {

  static void
  removeLocalVarsFromUsedLists(Module &M,
                               const DenseSet<GlobalVariable *> &LocalVars) {
    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    SmallPtrSet<Constant *, 8> LocalVarsSet;
    for (GlobalVariable *LocalVar : LocalVars)
      LocalVarsSet.insert(cast<Constant>(LocalVar->stripPointerCasts()));

    removeFromUsedLists(
        M, [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(C); });

    for (GlobalVariable *LocalVar : LocalVars)
      LocalVar->removeDeadConstantUsers();
  }

  static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is redefined as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.

    // This intrinsic is eliminated shortly before instruction selection. It
    // does not suffice to indicate to ISel that a given global which is not
    // immediately used by the kernel must still be allocated by it. An
    // equivalent target specific intrinsic which lasts until immediately after
    // codegen would suffice for that, but one would still need to ensure that
    // the variables are allocated in the anticipated order.
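    //
    // The emitted marker is, as a sketch (the operand is the GEP built below):
    //   call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) %gep) ]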
    IRBuilder<> Builder(Func->getEntryBlock().getFirstNonPHI());

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {
        Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};

    Builder.CreateCall(
        Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
  }

  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
    // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
    // global may have uses from multiple different functions as a result.
    // This pass specialises LDS variables with respect to the kernel that
    // allocates them.

    // This is semantically equivalent to (but not implemented as, being slow):
    // for (auto &F : M.functions())
    //   for (auto &BB : F)
    //     for (auto &I : BB)
    //       for (Use &Op : I.operands())
    //         if (constantExprUsesLDS(Op))
    //           replaceConstantExprInFunction(I, Op);

    SmallVector<Constant *> LDSGlobals;
    for (auto &GV : M.globals())
      if (AMDGPU::isLDSVariableToLower(GV))
        LDSGlobals.push_back(&GV);

    return convertUsersOfConstantsToInstructions(LDSGlobals);
  }

public:
  static char ID;

  AMDGPULowerModuleLDS() : ModulePass(ID) {
    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
  }

  using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;

  using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;

  static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
                                     FunctionVariableMap &kernels,
                                     FunctionVariableMap &functions) {

    // Get uses from the current function, excluding uses by called functions
    // Two output variables to avoid walking the globals list twice
    for (auto &GV : M.globals()) {
      if (!AMDGPU::isLDSVariableToLower(GV)) {
        continue;
      }

      if (GV.isAbsoluteSymbolRef()) {
        report_fatal_error(
            "LDS variables with absolute addresses are unimplemented.");
      }

      for (User *V : GV.users()) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          Function *F = I->getFunction();
          if (isKernelLDS(F)) {
            kernels[F].insert(&GV);
          } else {
            functions[F].insert(&GV);
          }
        }
      }
    }
  }

  struct LDSUsesInfoTy {
    FunctionVariableMap direct_access;
    FunctionVariableMap indirect_access;
  };

  static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {

    FunctionVariableMap direct_map_kernel;
    FunctionVariableMap direct_map_function;
    getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);

    // Collect variables that are used by functions whose address has escaped
    DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
    for (Function &F : M.functions()) {
      if (!isKernelLDS(&F))
        if (F.hasAddressTaken(nullptr,
                              /* IgnoreCallbackUses */ false,
                              /* IgnoreAssumeLikeCalls */ false,
                              /* IgnoreLLVMUsed */ true,
                              /* IgnoreArcAttachedCall */ false)) {
          set_union(VariablesReachableThroughFunctionPointer,
                    direct_map_function[&F]);
        }
    }

    auto functionMakesUnknownCall = [&](const Function *F) -> bool {
      assert(!F->isDeclaration());
      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        if (!R.second->getFunction()) {
          return true;
        }
      }
      return false;
    };

    // Work out which variables are reachable through function calls
    FunctionVariableMap transitive_map_function = direct_map_function;

    // If the function makes any unknown call, assume the worst case that it can
    // access all variables accessed by functions whose address escaped
    for (Function &F : M.functions()) {
      if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
        if (!isKernelLDS(&F)) {
          set_union(transitive_map_function[&F],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    // Direct implementation of collecting all variables reachable from each
    // function
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || isKernelLDS(&Func))
        continue;

      DenseSet<Function *> seen; // catches cycles
      SmallVector<Function *, 4> wip{&Func};

      while (!wip.empty()) {
        Function *F = wip.pop_back_val();

        // Can accelerate this by referring to transitive map for functions that
        // have already been computed, with more care than this
        set_union(transitive_map_function[&Func], direct_map_function[F]);

        for (const CallGraphNode::CallRecord &R : *CG[F]) {
          Function *ith = R.second->getFunction();
          if (ith) {
            if (!seen.contains(ith)) {
              seen.insert(ith);
              wip.push_back(ith);
            }
          }
        }
      }
    }

    // direct_map_kernel lists which variables are used by the kernel
    // find the variables which are used through a function call
    FunctionVariableMap indirect_map_kernel;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
        Function *ith = R.second->getFunction();
        if (ith) {
          set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
        } else {
          set_union(indirect_map_kernel[&Func],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
  }

  struct LDSVariableReplacement {
    GlobalVariable *SGV = nullptr;
    DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
  };

  // Remap from LDS global to a constantexpr gep of where it has been moved to
  // for each kernel: an array with an element for each kernel containing where
  // the corresponding variable was remapped to.

  static Constant *getAddressesOfVariablesInKernel(
      LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
      const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
    // Create a ConstantArray containing the address of each Variable within the
    // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
    // does not allocate it
    // TODO: Drop the ptrtoint conversion

    Type *I32 = Type::getInt32Ty(Ctx);

    ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size());

    SmallVector<Constant *> Elements;
    for (size_t i = 0; i < Variables.size(); i++) {
      GlobalVariable *GV = Variables[i];
      auto ConstantGepIt = LDSVarsToConstantGEP.find(GV);
      if (ConstantGepIt != LDSVarsToConstantGEP.end()) {
        auto elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32);
        Elements.push_back(elt);
      } else {
        Elements.push_back(PoisonValue::get(I32));
      }
    }
    return ConstantArray::get(KernelOffsetsType, Elements);
  }

  static GlobalVariable *buildLookupTable(
      Module &M, ArrayRef<GlobalVariable *> Variables,
      ArrayRef<Function *> kernels,
      DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
    if (Variables.empty()) {
      return nullptr;
    }
    LLVMContext &Ctx = M.getContext();

    const size_t NumberVariables = Variables.size();
    const size_t NumberKernels = kernels.size();

    ArrayType *KernelOffsetsType =
        ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);

    ArrayType *AllKernelsOffsetsType =
        ArrayType::get(KernelOffsetsType, NumberKernels);

    Constant *Missing = PoisonValue::get(KernelOffsetsType);
    std::vector<Constant *> overallConstantExprElts(NumberKernels);
    for (size_t i = 0; i < NumberKernels; i++) {
      auto Replacement = KernelToReplacement.find(kernels[i]);
      overallConstantExprElts[i] =
          (Replacement == KernelToReplacement.end())
              ? Missing
              : getAddressesOfVariablesInKernel(
                    Ctx, Variables, Replacement->second.LDSVarsToConstantGEP);
    }

    Constant *init =
        ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);

    return new GlobalVariable(
        M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
        "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
        AMDGPUAS::CONSTANT_ADDRESS);
  }

  void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
                                 GlobalVariable *LookupTable,
                                 GlobalVariable *GV, Use &U,
                                 Value *OptionalIndex) {
    // Table is a constant array of the same length as OrderedKernels
    LLVMContext &Ctx = M.getContext();
    Type *I32 = Type::getInt32Ty(Ctx);
    auto *I = cast<Instruction>(U.getUser());

    Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction());

    if (auto *Phi = dyn_cast<PHINode>(I)) {
      BasicBlock *BB = Phi->getIncomingBlock(U);
      Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
    } else {
      Builder.SetInsertPoint(I);
    }

    SmallVector<Value *, 3> GEPIdx = {
        ConstantInt::get(I32, 0),
        tableKernelIndex,
    };
    if (OptionalIndex)
      GEPIdx.push_back(OptionalIndex);

    Value *Address = Builder.CreateInBoundsGEP(
        LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());

    Value *loaded = Builder.CreateLoad(I32, Address);

    Value *replacement =
        Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());

    U.set(replacement);
  }

  void replaceUsesInInstructionsWithTableLookup(
      Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
      GlobalVariable *LookupTable) {

    LLVMContext &Ctx = M.getContext();
    IRBuilder<> Builder(Ctx);
    Type *I32 = Type::getInt32Ty(Ctx);

    for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
      auto *GV = ModuleScopeVariables[Index];

      for (Use &U : make_early_inc_range(GV->uses())) {
        auto *I = dyn_cast<Instruction>(U.getUser());
        if (!I)
          continue;

        replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
                                  ConstantInt::get(I32, Index));
      }
    }
  }

  static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<GlobalVariable *> const &VariableSet) {

    DenseSet<Function *> KernelSet;

    if (VariableSet.empty())
      return KernelSet;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;
      for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
        if (VariableSet.contains(GV)) {
          KernelSet.insert(&Func);
          break;
        }
      }
    }

    return KernelSet;
  }

  static GlobalVariable *
  chooseBestVariableForModuleStrategy(const DataLayout &DL,
                                      VariableFunctionMap &LDSVars) {
    // Find the global variable with the most indirect uses from kernels

    struct CandidateTy {
      GlobalVariable *GV = nullptr;
      size_t UserCount = 0;
      size_t Size = 0;

      CandidateTy() = default;

      CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
          : GV(GV), UserCount(UserCount), Size(AllocSize) {}

      bool operator<(const CandidateTy &Other) const {
        // Fewer users makes module scope variable less attractive
        if (UserCount < Other.UserCount) {
          return true;
        }
        if (UserCount > Other.UserCount) {
          return false;
        }

        // Bigger makes module scope variable less attractive
        if (Size < Other.Size) {
          return false;
        }

        if (Size > Other.Size) {
          return true;
        }

        // Arbitrary but consistent
        return GV->getName() < Other.GV->getName();
      }
    };

    CandidateTy MostUsed;

    for (auto &K : LDSVars) {
      GlobalVariable *GV = K.first;
      if (K.second.size() <= 1) {
        // A variable reachable by only one kernel is best lowered with kernel
        // strategy
        continue;
      }
      CandidateTy Candidate(
          GV, K.second.size(),
          DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
      if (MostUsed < Candidate)
        MostUsed = Candidate;
    }

    return MostUsed.GV;
  }

  static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
                                       uint32_t Address) {
    // Write the specified address into metadata where it can be retrieved by
    // the assembler. Format is a half open range, [Address Address+1)
    LLVMContext &Ctx = M->getContext();
    auto *IntTy =
        M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
    auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
    auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
    GV->setMetadata(LLVMContext::MD_absolute_symbol,
                    MDNode::get(Ctx, {MinC, MaxC}));
  }

  DenseMap<Function *, Value *> tableKernelIndexCache;
  Value *getTableLookupKernelIndex(Module &M, Function *F) {
    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
    // lowers to a read from a live in register. Emit it once in the entry
    // block to spare deduplicating it later.
    auto [It, Inserted] = tableKernelIndexCache.try_emplace(F);
    if (Inserted) {
      Function *Decl =
          Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});

      auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
      IRBuilder<> Builder(&*InsertAt);

      It->second = Builder.CreateCall(Decl, {});
    }

    return It->second;
  }

  static std::vector<Function *> assignLDSKernelIDToEachKernel(
      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
    // Associate kernels in the set with an arbitrary but reproducible order and
    // annotate them with that order in metadata. This metadata is recognised by
    // the backend and lowered to a SGPR which can be read from using
    // amdgcn_lds_kernel_id.
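    //
    // E.g. the kernel given id 2 is annotated as (sketch):
    //   define amdgpu_kernel void @k() !llvm.amdgcn.lds.kernel.id !0 { ... }
    //   !0 = !{i32 2}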

    std::vector<Function *> OrderedKernels;
    if (!KernelsThatAllocateTableLDS.empty() ||
        !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {

      for (Function &Func : M->functions()) {
        if (Func.isDeclaration())
          continue;
        if (!isKernelLDS(&Func))
          continue;

        if (KernelsThatAllocateTableLDS.contains(&Func) ||
            KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
          assert(Func.hasName()); // else fatal error earlier
          OrderedKernels.push_back(&Func);
        }
      }

      // Put them in an arbitrary but reproducible order
      OrderedKernels = sortByName(std::move(OrderedKernels));

      // Annotate the kernels with their order in this vector
      LLVMContext &Ctx = M->getContext();
      IRBuilder<> Builder(Ctx);

      if (OrderedKernels.size() > UINT32_MAX) {
        // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
      }

      for (size_t i = 0; i < OrderedKernels.size(); i++) {
        Metadata *AttrMDArgs[1] = {
            ConstantAsMetadata::get(Builder.getInt32(i)),
        };
        OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
                                       MDNode::get(Ctx, AttrMDArgs));
      }
    }
    return OrderedKernels;
  }

  static void partitionVariablesIntoIndirectStrategies(
      Module &M, LDSUsesInfoTy const &LDSUsesInfo,
      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
      DenseSet<GlobalVariable *> &ModuleScopeVariables,
      DenseSet<GlobalVariable *> &TableLookupVariables,
      DenseSet<GlobalVariable *> &KernelAccessVariables,
      DenseSet<GlobalVariable *> &DynamicVariables) {

    GlobalVariable *HybridModuleRoot =
        LoweringKindLoc != LoweringKind::hybrid
            ? nullptr
            : chooseBestVariableForModuleStrategy(
                  M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly);

    DenseSet<Function *> const EmptySet;
    DenseSet<Function *> const &HybridModuleRootKernels =
        HybridModuleRoot
            ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
            : EmptySet;

    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
      // Each iteration of this loop assigns exactly one global variable to
      // exactly one of the implementation strategies.

      GlobalVariable *GV = K.first;
      assert(AMDGPU::isLDSVariableToLower(*GV));
      assert(K.second.size() != 0);

      if (AMDGPU::isDynamicLDS(*GV)) {
        DynamicVariables.insert(GV);
        continue;
      }

      switch (LoweringKindLoc) {
      case LoweringKind::module:
        ModuleScopeVariables.insert(GV);
        break;

      case LoweringKind::table:
        TableLookupVariables.insert(GV);
        break;

      case LoweringKind::kernel:
        if (K.second.size() == 1) {
          KernelAccessVariables.insert(GV);
        } else {
          report_fatal_error(
              "cannot lower LDS '" + GV->getName() +
              "' to kernel access as it is reachable from multiple kernels");
        }
        break;

      case LoweringKind::hybrid: {
        if (GV == HybridModuleRoot) {
          assert(K.second.size() != 1);
          ModuleScopeVariables.insert(GV);
        } else if (K.second.size() == 1) {
          KernelAccessVariables.insert(GV);
        } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
          ModuleScopeVariables.insert(GV);
        } else {
          TableLookupVariables.insert(GV);
        }
        break;
      }
      }
    }

    // All LDS variables accessed indirectly have now been partitioned into
    // the distinct lowering strategies.
    assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
               KernelAccessVariables.size() + DynamicVariables.size() ==
           LDSToKernelsThatNeedToAccessItIndirectly.size());
  }

  static GlobalVariable *lowerModuleScopeStructVariables(
      Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
      DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
    // Create a struct to hold the ModuleScopeVariables.
    // Replace all uses of those variables from non-kernel functions with the
    // new struct instance. Replace only the uses from kernel functions that
    // will allocate this instance. That is a space optimisation - kernels that
    // use a subset of the module scope struct and do not need to allocate it
    // for indirect calls will only allocate the subset they use (they do so as
    // part of the per-kernel lowering).
    if (ModuleScopeVariables.empty()) {
      return nullptr;
    }

    LLVMContext &Ctx = M.getContext();

    LDSVariableReplacement ModuleScopeReplacement =
        createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
                                     ModuleScopeVariables);

    appendToCompilerUsed(M, {static_cast<GlobalValue *>(
                            ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                                cast<Constant>(ModuleScopeReplacement.SGV),
                                Type::getInt8PtrTy(Ctx)))});

    // module.lds will be allocated at zero in any kernel that allocates it
    recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);

    // historic
    removeLocalVarsFromUsedLists(M, ModuleScopeVariables);

    // Replace all uses of module scope variable from non-kernel functions
    replaceLDSVariablesWithStruct(
        M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
          Instruction *I = dyn_cast<Instruction>(U.getUser());
          if (!I) {
            return false;
          }
          Function *F = I->getFunction();
          return !isKernelLDS(F);
        });

    // Replace uses of module scope variable from kernel functions that
    // allocate the module scope variable, otherwise leave them unchanged
    // Record on each kernel whether the module scope global is used by it

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

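      // Kernels that allocate the module scope struct rewrite their uses of
      // the member variables to it and mark it as used; other kernels keep
      // their direct uses, which the per-kernel lowering handles instead.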
      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
        replaceLDSVariablesWithStruct(
            M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
              Instruction *I = dyn_cast<Instruction>(U.getUser());
              if (!I) {
                return false;
              }
              Function *F = I->getFunction();
              return F == &Func;
            });

        markUsedByKernel(&Func, ModuleScopeReplacement.SGV);
      }
    }

    return ModuleScopeReplacement.SGV;
  }

  static DenseMap<Function *, LDSVariableReplacement>
  lowerKernelScopeStructVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<GlobalVariable *> const &ModuleScopeVariables,
      DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
      GlobalVariable *MaybeModuleScopeStruct) {

    // Create a struct for each kernel for the non-module-scope variables.

    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      DenseSet<GlobalVariable *> KernelUsedVariables;
      // Allocating variables that are used directly in this struct to get
      // alignment aware allocation and predictable frame size.
      for (auto &v : LDSUsesInfo.direct_access[&Func]) {
        if (!AMDGPU::isDynamicLDS(*v)) {
          KernelUsedVariables.insert(v);
        }
      }

      // Allocating variables that are accessed indirectly so that a lookup of
      // this struct instance can find them from nested functions.
      for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
        if (!AMDGPU::isDynamicLDS(*v)) {
          KernelUsedVariables.insert(v);
        }
      }

      // Variables allocated in module lds must all resolve to that struct,
      // not to the per-kernel instance.

      // Variables allocated in module lds must all resolve to that struct,
      // not to the per-kernel instance.
      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
        for (GlobalVariable *v : ModuleScopeVariables) {
          KernelUsedVariables.erase(v);
        }
      }

      if (KernelUsedVariables.empty()) {
        // Either used no LDS, or the LDS it used was all in the module
        // struct or dynamically sized
        continue;
      }

      // The association between kernel function and LDS struct is done by
      // symbol name, which only works if the function in question has a
      // name. This is not expected to be a problem in practice as kernels
      // are called by name, making anonymous ones (which are named by the
      // backend) difficult to use. This does mean that llvm test cases need
      // to name the kernels.
      if (!Func.hasName()) {
        report_fatal_error("Anonymous kernels cannot use LDS variables");
      }

      std::string VarName =
          (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();

      auto Replacement =
          createLDSVariableReplacement(M, VarName, KernelUsedVariables);

      // If there are any indirect uses, create a direct use to ensure
      // allocation.
      // TODO: Simpler to unconditionally mark used but that regresses
      // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll
      auto Accesses = LDSUsesInfo.indirect_access.find(&Func);
      if ((Accesses != LDSUsesInfo.indirect_access.end()) &&
          !Accesses->second.empty())
        markUsedByKernel(&Func, Replacement.SGV);

      // remove preserves existing codegen
      removeLocalVarsFromUsedLists(M, KernelUsedVariables);
      KernelToReplacement[&Func] = Replacement;

      // Rewrite uses within the kernel to the new struct
      replaceLDSVariablesWithStruct(
          M, KernelUsedVariables, Replacement, [&Func](Use &U) {
            Instruction *I = dyn_cast<Instruction>(U.getUser());
            return I && I->getFunction() == &Func;
          });
    }
    return KernelToReplacement;
  }
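
  // Illustrative per-kernel lowering result (hypothetical names): a kernel
  // @k that uses a single i32 LDS variable not placed in the module struct
  // gets, as a sketch,
  //   %llvm.amdgcn.kernel.k.lds.t = type { i32 }
  //   @llvm.amdgcn.kernel.k.lds = internal addrspace(3) global
  //       %llvm.amdgcn.kernel.k.lds.t undef, align 4
  // and its uses of the original variable are rewritten to field zero.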

  static GlobalVariable *
  buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
                                        Function *func) {
    // Create a dynamic lds variable with a name associated with the passed
    // function and with the maximum alignment of any dynamic lds variable
    // reachable from this kernel. Dynamic LDS is allocated after the static
    // LDS allocation, possibly after alignment padding. All dynamic LDS
    // variables are allocated at the same address in each kernel in order
    // to provide the documented aliasing semantics. Setting the alignment
    // here allows this IR pass to accurately predict the exact constant at
    // which it will be allocated.

    assert(isKernelLDS(func));

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    Align MaxDynamicAlignment(1);

    auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
      if (AMDGPU::isDynamicLDS(*GV)) {
        MaxDynamicAlignment =
            std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
      }
    };

    for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
      UpdateMaxAlignment(GV);
    }

    for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
      UpdateMaxAlignment(GV);
    }

    assert(func->hasName()); // Checked by caller
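
    // The representative variable is an external, zero length [0 x i8] in
    // the LDS address space; e.g. for a kernel named "k" (a sketch, with an
    // assumed maximum dynamic alignment of 8):
    //   @llvm.amdgcn.k.dynlds = external addrspace(3) global [0 x i8], align 8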
    auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
    GlobalVariable *N = new GlobalVariable(
        M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
        Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr,
        GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
    N->setAlignment(MaxDynamicAlignment);

    assert(AMDGPU::isDynamicLDS(*N));
    return N;
  }

  DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
      DenseSet<GlobalVariable *> const &DynamicVariables,
      std::vector<Function *> const &OrderedKernels) {
    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
    if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
      LLVMContext &Ctx = M.getContext();
      IRBuilder<> Builder(Ctx);
      Type *I32 = Type::getInt32Ty(Ctx);

      std::vector<Constant *> newDynamicLDS;

      // The table is built in the same order as OrderedKernels
      for (auto &func : OrderedKernels) {

        if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
          assert(isKernelLDS(func));
          if (!func->hasName()) {
            report_fatal_error("Anonymous kernels cannot use LDS variables");
          }

          GlobalVariable *N =
              buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);

          KernelToCreatedDynamicLDS[func] = N;

          markUsedByKernel(func, N);

          auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
          auto GEP = ConstantExpr::getGetElementPtr(
              emptyCharArray, N, ConstantInt::get(I32, 0), true);
          newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
        } else {
          newDynamicLDS.push_back(PoisonValue::get(I32));
        }
      }
      assert(OrderedKernels.size() == newDynamicLDS.size());

      ArrayType *t = ArrayType::get(I32, newDynamicLDS.size());
      Constant *init = ConstantArray::get(t, newDynamicLDS);
      GlobalVariable *table = new GlobalVariable(
          M, t, true, GlobalValue::InternalLinkage, init,
          "llvm.amdgcn.dynlds.offset.table", nullptr,
          GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
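
      // A sketch of the table for two kernels where only kernel id 0
      // allocates dynamic LDS (hypothetical, ptrtoint operand elided):
      //   @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant
      //       [2 x i32] [i32 ptrtoint (...), i32 poison]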

      for (GlobalVariable *GV : DynamicVariables) {
        for (Use &U : make_early_inc_range(GV->uses())) {
          auto *I = dyn_cast<Instruction>(U.getUser());
          if (!I)
            continue;
          if (isKernelLDS(I->getFunction()))
            continue;

          replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
        }
      }
    }
    return KernelToCreatedDynamicLDS;
  }

  bool runOnModule(Module &M) override {
    CallGraph CG = CallGraph(M);
    bool Changed = superAlignLDSGlobals(M);

    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

    Changed = true; // TODO: narrow this down

    // For each kernel, what variables does it access directly or through
    // callees?
    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

    // For each variable accessed through callees, which kernels access it?
    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
    for (auto &K : LDSUsesInfo.indirect_access) {
      Function *F = K.first;
      assert(isKernelLDS(F));
      for (GlobalVariable *GV : K.second) {
        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
      }
    }

    // Partition variables accessed indirectly into the different strategies
    DenseSet<GlobalVariable *> ModuleScopeVariables;
    DenseSet<GlobalVariable *> TableLookupVariables;
    DenseSet<GlobalVariable *> KernelAccessVariables;
    DenseSet<GlobalVariable *> DynamicVariables;
    partitionVariablesIntoIndirectStrategies(
        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
        DynamicVariables);

    // If a kernel accesses a variable that is going to be stored in the
    // module instance through a call then that kernel needs to allocate the
    // module instance
    const DenseSet<Function *> KernelsThatAllocateModuleLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        ModuleScopeVariables);
    const DenseSet<Function *> KernelsThatAllocateTableLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        TableLookupVariables);

    const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                        DynamicVariables);
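
    // Worked example of the partition above (hypothetical module): with
    // kernels {k0, k1}, a variable used indirectly by both kernels may be
    // placed in ModuleScopeVariables; one reachable from only k0 may land
    // in KernelAccessVariables (compile time known address, no table); a
    // variable whose address must differ per kernel goes through
    // TableLookupVariables.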

    GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
        M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);

    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
        lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
                                        KernelsThatAllocateModuleLDS,
                                        MaybeModuleScopeStruct);

    // Lower zero cost accesses to the kernel instances just created
    for (auto &GV : KernelAccessVariables) {
      auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
      assert(funcs.size() == 1); // Only one kernel can access it
      LDSVariableReplacement Replacement =
          KernelToReplacement[*(funcs.begin())];

      DenseSet<GlobalVariable *> Vec;
      Vec.insert(GV);

      replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
        return isa<Instruction>(U.getUser());
      });
    }

    // The ith element of this vector is kernel id i
    std::vector<Function *> OrderedKernels =
        assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS,
                                      KernelsThatIndirectlyAllocateDynamicLDS);

    if (!KernelsThatAllocateTableLDS.empty()) {
      LLVMContext &Ctx = M.getContext();
      IRBuilder<> Builder(Ctx);

      // The order must be consistent between the lookup table and accesses
      // to the lookup table
      auto TableLookupVariablesOrdered =
          sortByName(std::vector<GlobalVariable *>(
              TableLookupVariables.begin(), TableLookupVariables.end()));

      GlobalVariable *LookupTable = buildLookupTable(
          M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
      replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
                                               LookupTable);
    }

    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
        lowerDynamicLDSVariables(M, LDSUsesInfo,
                                 KernelsThatIndirectlyAllocateDynamicLDS,
                                 DynamicVariables, OrderedKernels);
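
    // Worked example of the address arithmetic below (hypothetical sizes):
    // a 24 byte module struct is allocated at offset 0; a kernel struct
    // with alignment 8 then starts at alignTo(24, 8) == 24; if that struct
    // is 10 bytes and the representative dynamic variable needs alignment
    // 4, the dynamic LDS starts at alignTo(34, 4) == 36.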

    // All kernel frames have been allocated. Calculate and record the
    // addresses.
    {
      const DataLayout &DL = M.getDataLayout();

      for (Function &Func : M.functions()) {
        if (Func.isDeclaration() || !isKernelLDS(&Func))
          continue;

        // All three of these are optional. The first variable is allocated
        // at zero. They are allocated by AMDGPUMachineFunction as one block.
        // Layout:
        // {
        //   module.lds
        //   alignment padding
        //   kernel instance
        //   alignment padding
        //   dynamic lds variables
        // }

        const bool AllocateModuleScopeStruct =
            MaybeModuleScopeStruct &&
            KernelsThatAllocateModuleLDS.contains(&Func);

        auto Replacement = KernelToReplacement.find(&Func);
        const bool AllocateKernelScopeStruct =
            Replacement != KernelToReplacement.end();

        const bool AllocateDynamicVariable =
            KernelToCreatedDynamicLDS.contains(&Func);

        uint32_t Offset = 0;

        if (AllocateModuleScopeStruct) {
          // Allocated at zero, recorded once on construction, not once per
          // kernel
          Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
        }

        if (AllocateKernelScopeStruct) {
          GlobalVariable *KernelStruct = Replacement->second.SGV;
          Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
          recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
          Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
        }
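
        // The final Offset is recorded further below as the
        // "amdgpu-lds-size" attribute; e.g. a kernel whose static frame
        // ends at 64 bytes gets "amdgpu-lds-size"="64", and "64,64" if it
        // also allocates dynamic LDS (min == max, see below).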

        // If there is dynamic allocation, the alignment needed is included
        // in the static frame size. There may be no reference to the dynamic
        // variable in the kernel itself, so without including it here, that
        // alignment padding could be missed.
        if (AllocateDynamicVariable) {
          GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
          Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
          recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
        }

        if (Offset != 0) {
          std::string Buffer;
          raw_string_ostream SS{Buffer};
          SS << format("%u", Offset);

          // Instead of explicitly marking kernels that access dynamic
          // variables using special case metadata, annotate with
          // min-lds == max-lds, i.e. that there is no more space available
          // for allocating more static LDS variables. That is the right
          // condition to prevent allocating more variables which would
          // collide with the addresses assigned to dynamic variables.
          if (AllocateDynamicVariable)
            SS << format(",%u", Offset);

          Func.addFnAttr("amdgpu-lds-size", Buffer);
        }
      }
    }

    for (auto &GV : make_early_inc_range(M.globals()))
      if (AMDGPU::isLDSVariableToLower(GV)) {
        // probably want to remove from used lists
        GV.removeDeadConstantUsers();
        if (GV.use_empty())
          GV.eraseFromParent();
      }

    return Changed;
  }

private:
  // Increase the alignment of LDS globals if necessary to maximise the
  // chance that we can use aligned LDS instructions to access them.
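  // e.g. an i32 is raised to align 4 (b32), an i64 to align 8 (b64), and
  // anything larger than 8 bytes to align 16 (b96/b128), per the mapping
  // below.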
  static bool superAlignLDSGlobals(Module &M) {
    const DataLayout &DL = M.getDataLayout();
    bool Changed = false;
    if (!SuperAlignLDSGlobals) {
      return Changed;
    }

    for (auto &GV : M.globals()) {
      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
        // Only changing alignment of LDS variables
        continue;
      }
      if (!GV.hasInitializer()) {
        // cuda/hip extern __shared__ variable, leave alignment alone
        continue;
      }

      Align Alignment = AMDGPU::getAlign(DL, &GV);
      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());

      if (GVSize > 8) {
        // We might want to use a b96 or b128 load/store
        Alignment = std::max(Alignment, Align(16));
      } else if (GVSize > 4) {
        // We might want to use a b64 load/store
        Alignment = std::max(Alignment, Align(8));
      } else if (GVSize > 2) {
        // We might want to use a b32 load/store
        Alignment = std::max(Alignment, Align(4));
      } else if (GVSize > 1) {
        // We might want to use a b16 load/store
        Alignment = std::max(Alignment, Align(2));
      }

      if (Alignment != AMDGPU::getAlign(DL, &GV)) {
        Changed = true;
        GV.setAlignment(Alignment);
      }
    }
    return Changed;
  }

  static LDSVariableReplacement createLDSVariableReplacement(
      Module &M, std::string VarName,
      DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
    // Create a struct instance containing LDSVarsToTransform and a map from
    // those variables to constant GEPs into it.
    // Variables may be introduced to meet alignment requirements. No
    // aliasing metadata is useful for these as they have no uses. They are
    // erased before return.
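
    // e.g. transforming {@a : i32 align 4, @b : i64 align 8} may produce
    //   %VarName.t = type { i64, i32 }
    // (the optimised layout typically places higher alignment fields
    // first), with @b mapped to a GEP to field 0 and @a to field 1.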

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    assert(!LDSVarsToTransform.empty());

    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
    LayoutFields.reserve(LDSVarsToTransform.size());
    {
      // The order of fields in this struct depends on the order of
      // variables in the argument, which varies when changing how they
      // are identified, leading to spurious test breakage.
      auto Sorted = sortByName(std::vector<GlobalVariable *>(
          LDSVarsToTransform.begin(), LDSVarsToTransform.end()));

      for (GlobalVariable *GV : Sorted) {
        OptimizedStructLayoutField F(GV,
                                     DL.getTypeAllocSize(GV->getValueType()),
                                     AMDGPU::getAlign(DL, GV));
        LayoutFields.emplace_back(F);
      }
    }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    BitVector IsPaddingField;
    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
    IsPaddingField.reserve(LDSVarsToTransform.size());
    {
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < LayoutFields.size(); I++) {
        GlobalVariable *FGV = static_cast<GlobalVariable *>(
            const_cast<void *>(LayoutFields[I].Id));
        Align DataAlign = LayoutFields[I].Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet the alignment requested.
          // Note (o + (a - (o % a))) % a == 0
          //      (offset + Padding  ) % align == 0

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
              false));
          IsPaddingField.push_back(true);
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        IsPaddingField.push_back(false);
        CurrentOffset += LayoutFields[I].Size;
      }
    }
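
    // e.g. with CurrentOffset == 6 and a field alignment of 4, Rem == 2,
    // so a [2 x i8] padding global is appended and the field is placed at
    // offset 8.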

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);

    DenseMap<GlobalVariable *, Constant *> Map;
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
      if (IsPaddingField[I]) {
        assert(GV->use_empty());
        GV->eraseFromParent();
      } else {
        Map[GV] = GEP;
      }
    }
    assert(Map.size() == LDSVarsToTransform.size());
    return {SGV, std::move(Map)};
  }

  template <typename PredicateTy>
  static void replaceLDSVariablesWithStruct(
      Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
      const LDSVariableReplacement &Replacement, PredicateTy Predicate) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // A hack... we need to insert the aliasing info in a predictable order
    // for lit tests. It would be better to have them in a stable order
    // already, ideally the same order they get allocated, which might mean
    // an ordered set container.
    auto LDSVarsToTransform = sortByName(std::vector<GlobalVariable *>(
        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));
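
    // Sketch of the aliasing metadata created below for a two field struct
    // (hypothetical metadata numbering): one anonymous domain, one scope
    // per field, and for each field an alias.scope of its own scope plus a
    // noalias list of the others:
    //   !0 = distinct !{!0}     ; domain
    //   !1 = distinct !{!1, !0} ; scope for field 0
    //   !2 = distinct !{!2, !0} ; scope for field 1
    //   load i32, ... !alias.scope !3, !noalias !4
    //   !3 = !{!1}
    //   !4 = !{!2}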

    // Create alias.scope and their lists. Each field in the new structure
    // does not alias any of the other fields.
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    const size_t NumberVars = LDSVarsToTransform.size();
    if (NumberVars > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(NumberVars);
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < NumberVars; I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }

    // Replace uses of the ith variable with a constantexpr to the
    // corresponding field of the instance that will be allocated by
    // AMDGPUMachineFunction
    for (size_t I = 0; I < NumberVars; I++) {
      GlobalVariable *GV = LDSVarsToTransform[I];
      Constant *GEP = Replacement.LDSVarsToConstantGEP.at(GV);

      GV->replaceUsesWithIf(GEP, Predicate);

      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
      uint64_t Offset = APOff.getZExtValue();

      Align A =
          commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);

      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }
  }

  static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
                                       const DataLayout &DL, MDNode *AliasScope,
                                       MDNode *NoAlias, unsigned MaxDepth = 5) {
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
                   : AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // None of the atomicrmw operations can work on pointers, but check
        // anyway in case that changes or we process a ConstantExpr.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
      }
    }
  }
};

} // namespace

char AMDGPULowerModuleLDS::ID = 0;

char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;

INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
                "Lower uses of LDS variables from non-kernel functions", false,
                false)

ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
  return new AMDGPULowerModuleLDS();
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
                                               : PreservedAnalyses::all();
}