//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "isl/union_map.h"

extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

//===----------------------------------------------------------------------===//
// Hidden command line flags. All of them default to "off" (or to the value
// given in cl::init) and are registered in the Polly option category.
//===----------------------------------------------------------------------===//

// --- Flags that request a dump of intermediate results. ---

static cl::opt<bool>
    DumpSchedule("polly-acc-dump-schedule",
                 cl::desc("Dump the computed GPU Schedule"), cl::Hidden,
                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool>
    DumpKernelIR("polly-acc-dump-kernel-ir",
                 cl::desc("Dump the kernel LLVM-IR"), cl::Hidden,
                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool>
    DumpKernelASM("polly-acc-dump-kernel-asm",
                  cl::desc("Dump the kernel assembly code"), cl::Hidden,
                  cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// --- Flags that influence the generated code. ---

static cl::opt<bool>
    FastMath("polly-acc-fastmath",
             cl::desc("Allow unsafe math optimizations"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool>
    SharedMemory("polly-acc-use-shared", cl::desc("Use shared memory"),
                 cl::Hidden, cl::init(false), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

static cl::opt<bool>
    PrivateMemory("polly-acc-use-private", cl::desc("Use private memory"),
                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                  cl::cat(PollyCategory));

static cl::opt<bool>
    ManagedMemory("polly-acc-codegen-managed-memory",
                  cl::desc("Generate Host kernel code assuming"
                           " that all memory has been"
                           " declared as managed memory"),
                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                  cl::cat(PollyCategory));

static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

10282f2af35STobias Grosser static cl::opt<int> 10382f2af35STobias Grosser MinCompute("polly-acc-mincompute", 10482f2af35STobias Grosser cl::desc("Minimal number of compute statements to run on GPU."), 10582f2af35STobias Grosser cl::Hidden, cl::init(10 * 512 * 512)); 10682f2af35STobias Grosser 10760c60025STobias Grosser /// Create the ast expressions for a ScopStmt. 10860c60025STobias Grosser /// 10960c60025STobias Grosser /// This function is a callback for to generate the ast expressions for each 11060c60025STobias Grosser /// of the scheduled ScopStmts. 11160c60025STobias Grosser static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 112edb885cbSTobias Grosser void *StmtT, isl_ast_build *Build, 11360c60025STobias Grosser isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 11460c60025STobias Grosser isl_id *Id, void *User), 11560c60025STobias Grosser void *UserIndex, 11660c60025STobias Grosser isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 117edb885cbSTobias Grosser void *UserExpr) { 11860c60025STobias Grosser 119edb885cbSTobias Grosser ScopStmt *Stmt = (ScopStmt *)StmtT; 12060c60025STobias Grosser 121edb885cbSTobias Grosser isl_ctx *Ctx; 122edb885cbSTobias Grosser 123edb885cbSTobias Grosser if (!Stmt || !Build) 124edb885cbSTobias Grosser return NULL; 125edb885cbSTobias Grosser 126edb885cbSTobias Grosser Ctx = isl_ast_build_get_ctx(Build); 127edb885cbSTobias Grosser isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 128edb885cbSTobias Grosser 129edb885cbSTobias Grosser for (MemoryAccess *Acc : *Stmt) { 130edb885cbSTobias Grosser isl_map *AddrFunc = Acc->getAddressFunction(); 131edb885cbSTobias Grosser AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 132edb885cbSTobias Grosser isl_id *RefId = Acc->getId(); 133edb885cbSTobias Grosser isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 134edb885cbSTobias Grosser isl_multi_pw_aff *MPA = 
isl_multi_pw_aff_from_pw_multi_aff(PMA); 135edb885cbSTobias Grosser MPA = isl_multi_pw_aff_coalesce(MPA); 136edb885cbSTobias Grosser MPA = FunctionIndex(MPA, RefId, UserIndex); 137edb885cbSTobias Grosser isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 138edb885cbSTobias Grosser Access = FunctionExpr(Access, RefId, UserExpr); 139edb885cbSTobias Grosser RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 140edb885cbSTobias Grosser } 141edb885cbSTobias Grosser 142edb885cbSTobias Grosser return RefToExpr; 14360c60025STobias Grosser } 144f384594dSTobias Grosser 145a90be207SSiddharth Bhat /// Given a LLVM Type, compute its size in bytes, 146a90be207SSiddharth Bhat static int computeSizeInBytes(const Type *T) { 147a90be207SSiddharth Bhat int bytes = T->getPrimitiveSizeInBits() / 8; 148a90be207SSiddharth Bhat if (bytes == 0) 149a90be207SSiddharth Bhat bytes = T->getScalarSizeInBits() / 8; 150a90be207SSiddharth Bhat return bytes; 151a90be207SSiddharth Bhat } 152a90be207SSiddharth Bhat 15338fc0aedSTobias Grosser /// Generate code for a GPU specific isl AST. 15438fc0aedSTobias Grosser /// 15538fc0aedSTobias Grosser /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 15638fc0aedSTobias Grosser /// generates code for general-prupose AST nodes, with special functionality 15738fc0aedSTobias Grosser /// for generating GPU specific user nodes. 
15838fc0aedSTobias Grosser /// 15938fc0aedSTobias Grosser /// @see GPUNodeBuilder::createUser 16038fc0aedSTobias Grosser class GPUNodeBuilder : public IslNodeBuilder { 16138fc0aedSTobias Grosser public: 1622d950f36SPhilip Pfaffe GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, 16338fc0aedSTobias Grosser const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 164acf80064SEli Friedman DominatorTree &DT, Scop &S, BasicBlock *StartBlock, 16517f01968SSiddharth Bhat gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) 1662d950f36SPhilip Pfaffe : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), 16717f01968SSiddharth Bhat Prog(Prog), Runtime(Runtime), Arch(Arch) { 168edb885cbSTobias Grosser getExprBuilder().setIDToSAI(&IDToSAI); 169edb885cbSTobias Grosser } 17038fc0aedSTobias Grosser 171fa7b0802STobias Grosser /// Create after-run-time-check initialization code. 172fa7b0802STobias Grosser void initializeAfterRTH(); 173fa7b0802STobias Grosser 174fa7b0802STobias Grosser /// Finalize the generated scop. 175fa7b0802STobias Grosser virtual void finalize(); 176fa7b0802STobias Grosser 1775857b701STobias Grosser /// Track if the full build process was successful. 1785857b701STobias Grosser /// 1795857b701STobias Grosser /// This value is set to false, if throughout the build process an error 1805857b701STobias Grosser /// occurred which prevents us from generating valid GPU code. 1815857b701STobias Grosser bool BuildSuccessful = true; 1825857b701STobias Grosser 183bc653f20STobias Grosser /// The maximal number of loops surrounding a sequential kernel. 184bc653f20STobias Grosser unsigned DeepestSequential = 0; 185bc653f20STobias Grosser 186bc653f20STobias Grosser /// The maximal number of loops surrounding a parallel kernel. 187bc653f20STobias Grosser unsigned DeepestParallel = 0; 188bc653f20STobias Grosser 18938fc0aedSTobias Grosser private: 19074dc3cb4STobias Grosser /// A vector of array base pointers for which a new ScopArrayInfo was created. 
19174dc3cb4STobias Grosser /// 19274dc3cb4STobias Grosser /// This vector is used to delete the ScopArrayInfo when it is not needed any 19374dc3cb4STobias Grosser /// more. 19474dc3cb4STobias Grosser std::vector<Value *> LocalArrays; 19574dc3cb4STobias Grosser 19613c78e4dSTobias Grosser /// A map from ScopArrays to their corresponding device allocations. 19713c78e4dSTobias Grosser std::map<ScopArrayInfo *, Value *> DeviceAllocations; 1987287aeddSTobias Grosser 199fa7b0802STobias Grosser /// The current GPU context. 200fa7b0802STobias Grosser Value *GPUContext; 201fa7b0802STobias Grosser 202b513b491STobias Grosser /// The set of isl_ids allocated in the kernel 203b513b491STobias Grosser std::vector<isl_id *> KernelIds; 204b513b491STobias Grosser 20532837fe3STobias Grosser /// A module containing GPU code. 20632837fe3STobias Grosser /// 20732837fe3STobias Grosser /// This pointer is only set in case we are currently generating GPU code. 20832837fe3STobias Grosser std::unique_ptr<Module> GPUModule; 20932837fe3STobias Grosser 21032837fe3STobias Grosser /// The GPU program we generate code for. 21132837fe3STobias Grosser gpu_prog *Prog; 21232837fe3STobias Grosser 21317f01968SSiddharth Bhat /// The GPU Runtime implementation to use (OpenCL or CUDA). 21417f01968SSiddharth Bhat GPURuntime Runtime; 21517f01968SSiddharth Bhat 21617f01968SSiddharth Bhat /// The GPU Architecture to target. 21717f01968SSiddharth Bhat GPUArch Arch; 21817f01968SSiddharth Bhat 219472f9654STobias Grosser /// Class to free isl_ids. 220472f9654STobias Grosser class IslIdDeleter { 221472f9654STobias Grosser public: 222472f9654STobias Grosser void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 223472f9654STobias Grosser }; 224472f9654STobias Grosser 225472f9654STobias Grosser /// A set containing all isl_ids allocated in a GPU kernel. 226472f9654STobias Grosser /// 227472f9654STobias Grosser /// By releasing this set all isl_ids will be freed. 
228472f9654STobias Grosser std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 229472f9654STobias Grosser 230edb885cbSTobias Grosser IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 231edb885cbSTobias Grosser 23238fc0aedSTobias Grosser /// Create code for user-defined AST nodes. 23338fc0aedSTobias Grosser /// 23438fc0aedSTobias Grosser /// These AST nodes can be of type: 23538fc0aedSTobias Grosser /// 23638fc0aedSTobias Grosser /// - ScopStmt: A computational statement (TODO) 23738fc0aedSTobias Grosser /// - Kernel: A GPU kernel call (TODO) 23813c78e4dSTobias Grosser /// - Data-Transfer: A GPU <-> CPU data-transfer 2395260c041STobias Grosser /// - In-kernel synchronization 2405260c041STobias Grosser /// - In-kernel memory copy statement 24138fc0aedSTobias Grosser /// 2421fb9b64dSTobias Grosser /// @param UserStmt The ast node to generate code for. 2431fb9b64dSTobias Grosser virtual void createUser(__isl_take isl_ast_node *UserStmt); 24432837fe3STobias Grosser 24513c78e4dSTobias Grosser enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; 24613c78e4dSTobias Grosser 24713c78e4dSTobias Grosser /// Create code for a data transfer statement 24813c78e4dSTobias Grosser /// 24913c78e4dSTobias Grosser /// @param TransferStmt The data transfer statement. 25013c78e4dSTobias Grosser /// @param Direction The direction in which to transfer data. 25113c78e4dSTobias Grosser void createDataTransfer(__isl_take isl_ast_node *TransferStmt, 25213c78e4dSTobias Grosser enum DataDirection Direction); 25313c78e4dSTobias Grosser 254edb885cbSTobias Grosser /// Find llvm::Values referenced in GPU kernel. 255edb885cbSTobias Grosser /// 256edb885cbSTobias Grosser /// @param Kernel The kernel to scan for llvm::Values 257edb885cbSTobias Grosser /// 258edb885cbSTobias Grosser /// @returns A set of values referenced by the kernel. 
259edb885cbSTobias Grosser SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); 260edb885cbSTobias Grosser 26179a947c2STobias Grosser /// Compute the sizes of the execution grid for a given kernel. 26279a947c2STobias Grosser /// 26379a947c2STobias Grosser /// @param Kernel The kernel to compute grid sizes for. 26479a947c2STobias Grosser /// 26579a947c2STobias Grosser /// @returns A tuple with grid sizes for X and Y dimension 26679a947c2STobias Grosser std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel); 26779a947c2STobias Grosser 268abed4969SSiddharth Bhat /// Creates a array that can be sent to the kernel on the device using a 269abed4969SSiddharth Bhat /// host pointer. This is required for managed memory, when we directly send 270abed4969SSiddharth Bhat /// host pointers to the device. 271abed4969SSiddharth Bhat /// \note 272abed4969SSiddharth Bhat /// This is to be used only with managed memory 273abed4969SSiddharth Bhat Value *getOrCreateManagedDeviceArray(gpu_array_info *Array, 274abed4969SSiddharth Bhat ScopArrayInfo *ArrayInfo); 275abed4969SSiddharth Bhat 27679a947c2STobias Grosser /// Compute the sizes of the thread blocks for a given kernel. 27779a947c2STobias Grosser /// 27879a947c2STobias Grosser /// @param Kernel The kernel to compute thread block sizes for. 27979a947c2STobias Grosser /// 28079a947c2STobias Grosser /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. 28179a947c2STobias Grosser std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel); 28279a947c2STobias Grosser 283a90be207SSiddharth Bhat /// Store a specific kernel launch parameter in the array of kernel launch 284a90be207SSiddharth Bhat /// parameters. 285a90be207SSiddharth Bhat /// 286a90be207SSiddharth Bhat /// @param Parameters The list of parameters in which to store. 287a90be207SSiddharth Bhat /// @param Param The kernel launch parameter to store. 
288a90be207SSiddharth Bhat /// @param Index The index in the parameter list, at which to store the 289a90be207SSiddharth Bhat /// parameter. 290a90be207SSiddharth Bhat void insertStoreParameter(Instruction *Parameters, Instruction *Param, 291a90be207SSiddharth Bhat int Index); 292a90be207SSiddharth Bhat 29379a947c2STobias Grosser /// Create kernel launch parameters. 29479a947c2STobias Grosser /// 29579a947c2STobias Grosser /// @param Kernel The kernel to create parameters for. 29679a947c2STobias Grosser /// @param F The kernel function that has been created. 29757693272STobias Grosser /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 29879a947c2STobias Grosser /// 29979a947c2STobias Grosser /// @returns A stack allocated array with pointers to the parameter 30079a947c2STobias Grosser /// values that are passed to the kernel. 30157693272STobias Grosser Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, 30257693272STobias Grosser SetVector<Value *> SubtreeValues); 30379a947c2STobias Grosser 304b513b491STobias Grosser /// Create declarations for kernel variable. 305b513b491STobias Grosser /// 306b513b491STobias Grosser /// This includes shared memory declarations. 307b513b491STobias Grosser /// 308b513b491STobias Grosser /// @param Kernel The kernel definition to create variables for. 309b513b491STobias Grosser /// @param FN The function into which to generate the variables. 310b513b491STobias Grosser void createKernelVariables(ppcg_kernel *Kernel, Function *FN); 311b513b491STobias Grosser 312c1c6a2a6STobias Grosser /// Add CUDA annotations to module. 313c1c6a2a6STobias Grosser /// 314c1c6a2a6STobias Grosser /// Add a set of CUDA annotations that declares the maximal block dimensions 315c1c6a2a6STobias Grosser /// that will be used to execute the CUDA kernel. 
This allows the NVIDIA 316c1c6a2a6STobias Grosser /// PTX compiler to bound the number of allocated registers to ensure the 317c1c6a2a6STobias Grosser /// resulting kernel is known to run with up to as many block dimensions 318c1c6a2a6STobias Grosser /// as specified here. 319c1c6a2a6STobias Grosser /// 320c1c6a2a6STobias Grosser /// @param M The module to add the annotations to. 321c1c6a2a6STobias Grosser /// @param BlockDimX The size of block dimension X. 322c1c6a2a6STobias Grosser /// @param BlockDimY The size of block dimension Y. 323c1c6a2a6STobias Grosser /// @param BlockDimZ The size of block dimension Z. 324c1c6a2a6STobias Grosser void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, 325c1c6a2a6STobias Grosser Value *BlockDimZ); 326c1c6a2a6STobias Grosser 32732837fe3STobias Grosser /// Create GPU kernel. 32832837fe3STobias Grosser /// 32932837fe3STobias Grosser /// Code generate the kernel described by @p KernelStmt. 33032837fe3STobias Grosser /// 33132837fe3STobias Grosser /// @param KernelStmt The ast node to generate kernel code for. 33232837fe3STobias Grosser void createKernel(__isl_take isl_ast_node *KernelStmt); 33332837fe3STobias Grosser 33413c78e4dSTobias Grosser /// Generate code that computes the size of an array. 33513c78e4dSTobias Grosser /// 33613c78e4dSTobias Grosser /// @param Array The array for which to compute a size. 33713c78e4dSTobias Grosser Value *getArraySize(gpu_array_info *Array); 33813c78e4dSTobias Grosser 339aaabbbf8STobias Grosser /// Generate code to compute the minimal offset at which an array is accessed. 340aaabbbf8STobias Grosser /// 341aaabbbf8STobias Grosser /// The offset of an array is the minimal array location accessed in a scop. 342aaabbbf8STobias Grosser /// 343aaabbbf8STobias Grosser /// Example: 344aaabbbf8STobias Grosser /// 345aaabbbf8STobias Grosser /// for (long i = 0; i < 100; i++) 346aaabbbf8STobias Grosser /// A[i + 42] += ... 
347aaabbbf8STobias Grosser /// 348aaabbbf8STobias Grosser /// getArrayOffset(A) results in 42. 349aaabbbf8STobias Grosser /// 350aaabbbf8STobias Grosser /// @param Array The array for which to compute the offset. 351aaabbbf8STobias Grosser /// @returns An llvm::Value that contains the offset of the array. 352aaabbbf8STobias Grosser Value *getArrayOffset(gpu_array_info *Array); 353aaabbbf8STobias Grosser 35400bb5a99STobias Grosser /// Prepare the kernel arguments for kernel code generation 35500bb5a99STobias Grosser /// 35600bb5a99STobias Grosser /// @param Kernel The kernel to generate code for. 35700bb5a99STobias Grosser /// @param FN The function created for the kernel. 35800bb5a99STobias Grosser void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); 35900bb5a99STobias Grosser 36032837fe3STobias Grosser /// Create kernel function. 36132837fe3STobias Grosser /// 36232837fe3STobias Grosser /// Create a kernel function located in a newly created module that can serve 36332837fe3STobias Grosser /// as target for device code generation. Set the Builder to point to the 36432837fe3STobias Grosser /// start block of this newly created function. 36532837fe3STobias Grosser /// 36632837fe3STobias Grosser /// @param Kernel The kernel to generate code for. 367edb885cbSTobias Grosser /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 368edb885cbSTobias Grosser void createKernelFunction(ppcg_kernel *Kernel, 369edb885cbSTobias Grosser SetVector<Value *> &SubtreeValues); 37032837fe3STobias Grosser 37132837fe3STobias Grosser /// Create the declaration of a kernel function. 37232837fe3STobias Grosser /// 37332837fe3STobias Grosser /// The kernel function takes as arguments: 37432837fe3STobias Grosser /// 37532837fe3STobias Grosser /// - One i8 pointer for each external array reference used in the kernel. 
376f6044bd0STobias Grosser /// - Host iterators 377c84a1995STobias Grosser /// - Parameters 37832837fe3STobias Grosser /// - Other LLVM Value references (TODO) 37932837fe3STobias Grosser /// 38032837fe3STobias Grosser /// @param Kernel The kernel to generate the function declaration for. 381edb885cbSTobias Grosser /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 382edb885cbSTobias Grosser /// 38332837fe3STobias Grosser /// @returns The newly declared function. 384edb885cbSTobias Grosser Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 385edb885cbSTobias Grosser SetVector<Value *> &SubtreeValues); 38632837fe3STobias Grosser 387472f9654STobias Grosser /// Insert intrinsic functions to obtain thread and block ids. 388472f9654STobias Grosser /// 389472f9654STobias Grosser /// @param The kernel to generate the intrinsic functions for. 390472f9654STobias Grosser void insertKernelIntrinsics(ppcg_kernel *Kernel); 391472f9654STobias Grosser 392b513b491STobias Grosser /// Create a global-to-shared or shared-to-global copy statement. 393b513b491STobias Grosser /// 394b513b491STobias Grosser /// @param CopyStmt The copy statement to generate code for 395b513b491STobias Grosser void createKernelCopy(ppcg_kernel_stmt *CopyStmt); 396b513b491STobias Grosser 397edb885cbSTobias Grosser /// Create code for a ScopStmt called in @p Expr. 398edb885cbSTobias Grosser /// 399edb885cbSTobias Grosser /// @param Expr The expression containing the call. 400edb885cbSTobias Grosser /// @param KernelStmt The kernel statement referenced in the call. 401edb885cbSTobias Grosser void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 402edb885cbSTobias Grosser 4035260c041STobias Grosser /// Create an in-kernel synchronization call. 4045260c041STobias Grosser void createKernelSync(); 4055260c041STobias Grosser 40674dc3cb4STobias Grosser /// Create a PTX assembly string for the current GPU kernel. 
40774dc3cb4STobias Grosser /// 40874dc3cb4STobias Grosser /// @returns A string containing the corresponding PTX assembly code. 40974dc3cb4STobias Grosser std::string createKernelASM(); 41074dc3cb4STobias Grosser 41174dc3cb4STobias Grosser /// Remove references from the dominator tree to the kernel function @p F. 41274dc3cb4STobias Grosser /// 41374dc3cb4STobias Grosser /// @param F The function to remove references to. 41474dc3cb4STobias Grosser void clearDominators(Function *F); 41574dc3cb4STobias Grosser 41674dc3cb4STobias Grosser /// Remove references from scalar evolution to the kernel function @p F. 41774dc3cb4STobias Grosser /// 41874dc3cb4STobias Grosser /// @param F The function to remove references to. 41974dc3cb4STobias Grosser void clearScalarEvolution(Function *F); 42074dc3cb4STobias Grosser 42174dc3cb4STobias Grosser /// Remove references from loop info to the kernel function @p F. 42274dc3cb4STobias Grosser /// 42374dc3cb4STobias Grosser /// @param F The function to remove references to. 42474dc3cb4STobias Grosser void clearLoops(Function *F); 42574dc3cb4STobias Grosser 42632837fe3STobias Grosser /// Finalize the generation of the kernel function. 42732837fe3STobias Grosser /// 42832837fe3STobias Grosser /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 42932837fe3STobias Grosser /// dump its IR to stderr. 43057793596STobias Grosser /// 43157793596STobias Grosser /// @returns The Assembly string of the kernel. 43257793596STobias Grosser std::string finalizeKernelFunction(); 433fa7b0802STobias Grosser 43451dfc275STobias Grosser /// Finalize the generation of the kernel arguments. 43551dfc275STobias Grosser /// 43651dfc275STobias Grosser /// This function ensures that not-read-only scalars used in a kernel are 43751dfc275STobias Grosser /// stored back to the global memory location they ared backed up with before 43851dfc275STobias Grosser /// the kernel terminates. 
43951dfc275STobias Grosser /// 44051dfc275STobias Grosser /// @params Kernel The kernel to finalize kernel arguments for. 44151dfc275STobias Grosser void finalizeKernelArguments(ppcg_kernel *Kernel); 44251dfc275STobias Grosser 4437287aeddSTobias Grosser /// Create code that allocates memory to store arrays on device. 444fa7b0802STobias Grosser void allocateDeviceArrays(); 445fa7b0802STobias Grosser 4467287aeddSTobias Grosser /// Free all allocated device arrays. 4477287aeddSTobias Grosser void freeDeviceArrays(); 4487287aeddSTobias Grosser 449fa7b0802STobias Grosser /// Create a call to initialize the GPU context. 450fa7b0802STobias Grosser /// 451fa7b0802STobias Grosser /// @returns A pointer to the newly initialized context. 452fa7b0802STobias Grosser Value *createCallInitContext(); 453fa7b0802STobias Grosser 45479a947c2STobias Grosser /// Create a call to get the device pointer for a kernel allocation. 45579a947c2STobias Grosser /// 45679a947c2STobias Grosser /// @param Allocation The Polly GPU allocation 45779a947c2STobias Grosser /// 45879a947c2STobias Grosser /// @returns The device parameter corresponding to this allocation. 45979a947c2STobias Grosser Value *createCallGetDevicePtr(Value *Allocation); 46079a947c2STobias Grosser 461fa7b0802STobias Grosser /// Create a call to free the GPU context. 462fa7b0802STobias Grosser /// 463fa7b0802STobias Grosser /// @param Context A pointer to an initialized GPU context. 464fa7b0802STobias Grosser void createCallFreeContext(Value *Context); 465fa7b0802STobias Grosser 4667287aeddSTobias Grosser /// Create a call to allocate memory on the device. 4677287aeddSTobias Grosser /// 4687287aeddSTobias Grosser /// @param Size The size of memory to allocate 4697287aeddSTobias Grosser /// 4707287aeddSTobias Grosser /// @returns A pointer that identifies this allocation. 
471fa7b0802STobias Grosser Value *createCallAllocateMemoryForDevice(Value *Size); 4727287aeddSTobias Grosser 4737287aeddSTobias Grosser /// Create a call to free a device array. 4747287aeddSTobias Grosser /// 4757287aeddSTobias Grosser /// @param Array The device array to free. 4767287aeddSTobias Grosser void createCallFreeDeviceMemory(Value *Array); 47713c78e4dSTobias Grosser 47813c78e4dSTobias Grosser /// Create a call to copy data from host to device. 47913c78e4dSTobias Grosser /// 48013c78e4dSTobias Grosser /// @param HostPtr A pointer to the host data that should be copied. 48113c78e4dSTobias Grosser /// @param DevicePtr A device pointer specifying the location to copy to. 48213c78e4dSTobias Grosser void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr, 48313c78e4dSTobias Grosser Value *Size); 48413c78e4dSTobias Grosser 48513c78e4dSTobias Grosser /// Create a call to copy data from device to host. 48613c78e4dSTobias Grosser /// 48713c78e4dSTobias Grosser /// @param DevicePtr A pointer to the device data that should be copied. 48813c78e4dSTobias Grosser /// @param HostPtr A host pointer specifying the location to copy to. 48913c78e4dSTobias Grosser void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, 49013c78e4dSTobias Grosser Value *Size); 49157793596STobias Grosser 492abed4969SSiddharth Bhat /// Create a call to synchronize Host & Device. 493abed4969SSiddharth Bhat /// \note 494abed4969SSiddharth Bhat /// This is to be used only with managed memory. 495abed4969SSiddharth Bhat void createCallSynchronizeDevice(); 496abed4969SSiddharth Bhat 49757793596STobias Grosser /// Create a call to get a kernel from an assembly string. 49857793596STobias Grosser /// 49957793596STobias Grosser /// @param Buffer The string describing the kernel. 50057793596STobias Grosser /// @param Entry The name of the kernel function to call. 
50157793596STobias Grosser /// 50257793596STobias Grosser /// @returns A pointer to a kernel object 50357793596STobias Grosser Value *createCallGetKernel(Value *Buffer, Value *Entry); 50457793596STobias Grosser 50557793596STobias Grosser /// Create a call to free a GPU kernel. 50657793596STobias Grosser /// 50757793596STobias Grosser /// @param GPUKernel THe kernel to free. 50857793596STobias Grosser void createCallFreeKernel(Value *GPUKernel); 50979a947c2STobias Grosser 51079a947c2STobias Grosser /// Create a call to launch a GPU kernel. 51179a947c2STobias Grosser /// 51279a947c2STobias Grosser /// @param GPUKernel The kernel to launch. 51379a947c2STobias Grosser /// @param GridDimX The size of the first grid dimension. 51479a947c2STobias Grosser /// @param GridDimY The size of the second grid dimension. 51579a947c2STobias Grosser /// @param GridBlockX The size of the first block dimension. 51679a947c2STobias Grosser /// @param GridBlockY The size of the second block dimension. 51779a947c2STobias Grosser /// @param GridBlockZ The size of the third block dimension. 51879a947c2STobias Grosser /// @param Paramters A pointer to an array that contains itself pointers to 51979a947c2STobias Grosser /// the parameter values passed for each kernel argument. 
52079a947c2STobias Grosser void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 52179a947c2STobias Grosser Value *GridDimY, Value *BlockDimX, 52279a947c2STobias Grosser Value *BlockDimY, Value *BlockDimZ, 52379a947c2STobias Grosser Value *Parameters); 5241fb9b64dSTobias Grosser }; 5251fb9b64dSTobias Grosser 526fa7b0802STobias Grosser void GPUNodeBuilder::initializeAfterRTH() { 527750160e2STobias Grosser BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(), 528750160e2STobias Grosser &*Builder.GetInsertPoint(), &DT, &LI); 529750160e2STobias Grosser NewBB->setName("polly.acc.initialize"); 530750160e2STobias Grosser Builder.SetInsertPoint(&NewBB->front()); 531750160e2STobias Grosser 532fa7b0802STobias Grosser GPUContext = createCallInitContext(); 533abed4969SSiddharth Bhat 534abed4969SSiddharth Bhat if (!ManagedMemory) 535fa7b0802STobias Grosser allocateDeviceArrays(); 536fa7b0802STobias Grosser } 537fa7b0802STobias Grosser 538fa7b0802STobias Grosser void GPUNodeBuilder::finalize() { 539abed4969SSiddharth Bhat if (!ManagedMemory) 5407287aeddSTobias Grosser freeDeviceArrays(); 541abed4969SSiddharth Bhat 542fa7b0802STobias Grosser createCallFreeContext(GPUContext); 543fa7b0802STobias Grosser IslNodeBuilder::finalize(); 544fa7b0802STobias Grosser } 545fa7b0802STobias Grosser 546fa7b0802STobias Grosser void GPUNodeBuilder::allocateDeviceArrays() { 547abed4969SSiddharth Bhat assert(!ManagedMemory && "Managed memory will directly send host pointers " 548abed4969SSiddharth Bhat "to the kernel. 
There is no need for device arrays"); 549fa7b0802STobias Grosser isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 550fa7b0802STobias Grosser 551fa7b0802STobias Grosser for (int i = 0; i < Prog->n_array; ++i) { 552fa7b0802STobias Grosser gpu_array_info *Array = &Prog->array[i]; 55313c78e4dSTobias Grosser auto *ScopArray = (ScopArrayInfo *)Array->user; 5547287aeddSTobias Grosser std::string DevArrayName("p_dev_array_"); 5557287aeddSTobias Grosser DevArrayName.append(Array->name); 556fa7b0802STobias Grosser 55713c78e4dSTobias Grosser Value *ArraySize = getArraySize(Array); 558aaabbbf8STobias Grosser Value *Offset = getArrayOffset(Array); 559aaabbbf8STobias Grosser if (Offset) 560aaabbbf8STobias Grosser ArraySize = Builder.CreateSub( 561aaabbbf8STobias Grosser ArraySize, 562aaabbbf8STobias Grosser Builder.CreateMul(Offset, 563aaabbbf8STobias Grosser Builder.getInt64(ScopArray->getElemSizeInBytes()))); 5647287aeddSTobias Grosser Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 5657287aeddSTobias Grosser DevArray->setName(DevArrayName); 56613c78e4dSTobias Grosser DeviceAllocations[ScopArray] = DevArray; 567fa7b0802STobias Grosser } 568fa7b0802STobias Grosser 569fa7b0802STobias Grosser isl_ast_build_free(Build); 570fa7b0802STobias Grosser } 571fa7b0802STobias Grosser 572c1c6a2a6STobias Grosser void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, 573c1c6a2a6STobias Grosser Value *BlockDimY, Value *BlockDimZ) { 574c1c6a2a6STobias Grosser auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); 575c1c6a2a6STobias Grosser 576c1c6a2a6STobias Grosser for (auto &F : *M) { 577c1c6a2a6STobias Grosser if (F.getCallingConv() != CallingConv::PTX_Kernel) 578c1c6a2a6STobias Grosser continue; 579c1c6a2a6STobias Grosser 580c1c6a2a6STobias Grosser Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; 581c1c6a2a6STobias Grosser 582c1c6a2a6STobias Grosser Metadata *Elements[] = { 583c1c6a2a6STobias Grosser ValueAsMetadata::get(&F), 
MDString::get(M->getContext(), "maxntidx"), 584c1c6a2a6STobias Grosser ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), 585c1c6a2a6STobias Grosser ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), 586c1c6a2a6STobias Grosser ValueAsMetadata::get(V[2]), 587c1c6a2a6STobias Grosser }; 588c1c6a2a6STobias Grosser MDNode *Node = MDNode::get(M->getContext(), Elements); 589c1c6a2a6STobias Grosser AnnotationNode->addOperand(Node); 590c1c6a2a6STobias Grosser } 591c1c6a2a6STobias Grosser } 592c1c6a2a6STobias Grosser 5937287aeddSTobias Grosser void GPUNodeBuilder::freeDeviceArrays() { 594abed4969SSiddharth Bhat assert(!ManagedMemory && "Managed memory does not use device arrays"); 59513c78e4dSTobias Grosser for (auto &Array : DeviceAllocations) 59613c78e4dSTobias Grosser createCallFreeDeviceMemory(Array.second); 5977287aeddSTobias Grosser } 5987287aeddSTobias Grosser 59957793596STobias Grosser Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { 60057793596STobias Grosser const char *Name = "polly_getKernel"; 60157793596STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 60257793596STobias Grosser Function *F = M->getFunction(Name); 60357793596STobias Grosser 60457793596STobias Grosser // If F is not available, declare it. 
60557793596STobias Grosser if (!F) { 60657793596STobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 60757793596STobias Grosser std::vector<Type *> Args; 60857793596STobias Grosser Args.push_back(Builder.getInt8PtrTy()); 60957793596STobias Grosser Args.push_back(Builder.getInt8PtrTy()); 61057793596STobias Grosser FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 61157793596STobias Grosser F = Function::Create(Ty, Linkage, Name, M); 61257793596STobias Grosser } 61357793596STobias Grosser 61457793596STobias Grosser return Builder.CreateCall(F, {Buffer, Entry}); 61557793596STobias Grosser } 61657793596STobias Grosser 61779a947c2STobias Grosser Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { 61879a947c2STobias Grosser const char *Name = "polly_getDevicePtr"; 61979a947c2STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 62079a947c2STobias Grosser Function *F = M->getFunction(Name); 62179a947c2STobias Grosser 62279a947c2STobias Grosser // If F is not available, declare it. 
62379a947c2STobias Grosser if (!F) { 62479a947c2STobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 62579a947c2STobias Grosser std::vector<Type *> Args; 62679a947c2STobias Grosser Args.push_back(Builder.getInt8PtrTy()); 62779a947c2STobias Grosser FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 62879a947c2STobias Grosser F = Function::Create(Ty, Linkage, Name, M); 62979a947c2STobias Grosser } 63079a947c2STobias Grosser 63179a947c2STobias Grosser return Builder.CreateCall(F, {Allocation}); 63279a947c2STobias Grosser } 63379a947c2STobias Grosser 63479a947c2STobias Grosser void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 63579a947c2STobias Grosser Value *GridDimY, Value *BlockDimX, 63679a947c2STobias Grosser Value *BlockDimY, Value *BlockDimZ, 63779a947c2STobias Grosser Value *Parameters) { 63879a947c2STobias Grosser const char *Name = "polly_launchKernel"; 63979a947c2STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 64079a947c2STobias Grosser Function *F = M->getFunction(Name); 64179a947c2STobias Grosser 64279a947c2STobias Grosser // If F is not available, declare it. 
64379a947c2STobias Grosser if (!F) { 64479a947c2STobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 64579a947c2STobias Grosser std::vector<Type *> Args; 64679a947c2STobias Grosser Args.push_back(Builder.getInt8PtrTy()); 64779a947c2STobias Grosser Args.push_back(Builder.getInt32Ty()); 64879a947c2STobias Grosser Args.push_back(Builder.getInt32Ty()); 64979a947c2STobias Grosser Args.push_back(Builder.getInt32Ty()); 65079a947c2STobias Grosser Args.push_back(Builder.getInt32Ty()); 65179a947c2STobias Grosser Args.push_back(Builder.getInt32Ty()); 65279a947c2STobias Grosser Args.push_back(Builder.getInt8PtrTy()); 65379a947c2STobias Grosser FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 65479a947c2STobias Grosser F = Function::Create(Ty, Linkage, Name, M); 65579a947c2STobias Grosser } 65679a947c2STobias Grosser 657ff40087aSTobias Grosser Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 65879a947c2STobias Grosser BlockDimZ, Parameters}); 65979a947c2STobias Grosser } 66079a947c2STobias Grosser 66157793596STobias Grosser void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { 66257793596STobias Grosser const char *Name = "polly_freeKernel"; 66357793596STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 66457793596STobias Grosser Function *F = M->getFunction(Name); 66557793596STobias Grosser 66657793596STobias Grosser // If F is not available, declare it. 
66757793596STobias Grosser if (!F) { 66857793596STobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 66957793596STobias Grosser std::vector<Type *> Args; 67057793596STobias Grosser Args.push_back(Builder.getInt8PtrTy()); 67157793596STobias Grosser FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 67257793596STobias Grosser F = Function::Create(Ty, Linkage, Name, M); 67357793596STobias Grosser } 67457793596STobias Grosser 67557793596STobias Grosser Builder.CreateCall(F, {GPUKernel}); 67657793596STobias Grosser } 67757793596STobias Grosser 6787287aeddSTobias Grosser void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 679abed4969SSiddharth Bhat assert(!ManagedMemory && "Managed memory does not allocate or free memory " 680abed4969SSiddharth Bhat "for device"); 6817287aeddSTobias Grosser const char *Name = "polly_freeDeviceMemory"; 6827287aeddSTobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 6837287aeddSTobias Grosser Function *F = M->getFunction(Name); 6847287aeddSTobias Grosser 6857287aeddSTobias Grosser // If F is not available, declare it. 
6867287aeddSTobias Grosser if (!F) { 6877287aeddSTobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 6887287aeddSTobias Grosser std::vector<Type *> Args; 6897287aeddSTobias Grosser Args.push_back(Builder.getInt8PtrTy()); 6907287aeddSTobias Grosser FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 6917287aeddSTobias Grosser F = Function::Create(Ty, Linkage, Name, M); 6927287aeddSTobias Grosser } 6937287aeddSTobias Grosser 6947287aeddSTobias Grosser Builder.CreateCall(F, {Array}); 6957287aeddSTobias Grosser } 6967287aeddSTobias Grosser 697fa7b0802STobias Grosser Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 698abed4969SSiddharth Bhat assert(!ManagedMemory && "Managed memory does not allocate or free memory " 699abed4969SSiddharth Bhat "for device"); 700fa7b0802STobias Grosser const char *Name = "polly_allocateMemoryForDevice"; 701fa7b0802STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 702fa7b0802STobias Grosser Function *F = M->getFunction(Name); 703fa7b0802STobias Grosser 704fa7b0802STobias Grosser // If F is not available, declare it. 
705fa7b0802STobias Grosser if (!F) { 706fa7b0802STobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 707fa7b0802STobias Grosser std::vector<Type *> Args; 708fa7b0802STobias Grosser Args.push_back(Builder.getInt64Ty()); 709fa7b0802STobias Grosser FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 710fa7b0802STobias Grosser F = Function::Create(Ty, Linkage, Name, M); 711fa7b0802STobias Grosser } 712fa7b0802STobias Grosser 713fa7b0802STobias Grosser return Builder.CreateCall(F, {Size}); 714fa7b0802STobias Grosser } 715fa7b0802STobias Grosser 71613c78e4dSTobias Grosser void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, 71713c78e4dSTobias Grosser Value *DeviceData, 71813c78e4dSTobias Grosser Value *Size) { 719abed4969SSiddharth Bhat assert(!ManagedMemory && "Managed memory does not transfer memory between " 720abed4969SSiddharth Bhat "device and host"); 72113c78e4dSTobias Grosser const char *Name = "polly_copyFromHostToDevice"; 72213c78e4dSTobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 72313c78e4dSTobias Grosser Function *F = M->getFunction(Name); 72413c78e4dSTobias Grosser 72513c78e4dSTobias Grosser // If F is not available, declare it. 
72613c78e4dSTobias Grosser if (!F) { 72713c78e4dSTobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 72813c78e4dSTobias Grosser std::vector<Type *> Args; 72913c78e4dSTobias Grosser Args.push_back(Builder.getInt8PtrTy()); 73013c78e4dSTobias Grosser Args.push_back(Builder.getInt8PtrTy()); 73113c78e4dSTobias Grosser Args.push_back(Builder.getInt64Ty()); 73213c78e4dSTobias Grosser FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 73313c78e4dSTobias Grosser F = Function::Create(Ty, Linkage, Name, M); 73413c78e4dSTobias Grosser } 73513c78e4dSTobias Grosser 73613c78e4dSTobias Grosser Builder.CreateCall(F, {HostData, DeviceData, Size}); 73713c78e4dSTobias Grosser } 73813c78e4dSTobias Grosser 73913c78e4dSTobias Grosser void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, 74013c78e4dSTobias Grosser Value *HostData, 74113c78e4dSTobias Grosser Value *Size) { 742abed4969SSiddharth Bhat assert(!ManagedMemory && "Managed memory does not transfer memory between " 743abed4969SSiddharth Bhat "device and host"); 74413c78e4dSTobias Grosser const char *Name = "polly_copyFromDeviceToHost"; 74513c78e4dSTobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 74613c78e4dSTobias Grosser Function *F = M->getFunction(Name); 74713c78e4dSTobias Grosser 74813c78e4dSTobias Grosser // If F is not available, declare it. 
74913c78e4dSTobias Grosser if (!F) { 75013c78e4dSTobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 75113c78e4dSTobias Grosser std::vector<Type *> Args; 75213c78e4dSTobias Grosser Args.push_back(Builder.getInt8PtrTy()); 75313c78e4dSTobias Grosser Args.push_back(Builder.getInt8PtrTy()); 75413c78e4dSTobias Grosser Args.push_back(Builder.getInt64Ty()); 75513c78e4dSTobias Grosser FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 75613c78e4dSTobias Grosser F = Function::Create(Ty, Linkage, Name, M); 75713c78e4dSTobias Grosser } 75813c78e4dSTobias Grosser 75913c78e4dSTobias Grosser Builder.CreateCall(F, {DeviceData, HostData, Size}); 76013c78e4dSTobias Grosser } 76113c78e4dSTobias Grosser 762abed4969SSiddharth Bhat void GPUNodeBuilder::createCallSynchronizeDevice() { 763abed4969SSiddharth Bhat assert(ManagedMemory && "explicit synchronization is only necessary for " 764abed4969SSiddharth Bhat "managed memory"); 765abed4969SSiddharth Bhat const char *Name = "polly_synchronizeDevice"; 766abed4969SSiddharth Bhat Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 767abed4969SSiddharth Bhat Function *F = M->getFunction(Name); 768abed4969SSiddharth Bhat 769abed4969SSiddharth Bhat // If F is not available, declare it. 
770abed4969SSiddharth Bhat if (!F) { 771abed4969SSiddharth Bhat GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 772abed4969SSiddharth Bhat FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); 773abed4969SSiddharth Bhat F = Function::Create(Ty, Linkage, Name, M); 774abed4969SSiddharth Bhat } 775abed4969SSiddharth Bhat 776abed4969SSiddharth Bhat Builder.CreateCall(F); 777abed4969SSiddharth Bhat } 778abed4969SSiddharth Bhat 779fa7b0802STobias Grosser Value *GPUNodeBuilder::createCallInitContext() { 78017f01968SSiddharth Bhat const char *Name; 78117f01968SSiddharth Bhat 78217f01968SSiddharth Bhat switch (Runtime) { 78317f01968SSiddharth Bhat case GPURuntime::CUDA: 78417f01968SSiddharth Bhat Name = "polly_initContextCUDA"; 78517f01968SSiddharth Bhat break; 78617f01968SSiddharth Bhat case GPURuntime::OpenCL: 78717f01968SSiddharth Bhat Name = "polly_initContextCL"; 78817f01968SSiddharth Bhat break; 78917f01968SSiddharth Bhat } 79017f01968SSiddharth Bhat 791fa7b0802STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 792fa7b0802STobias Grosser Function *F = M->getFunction(Name); 793fa7b0802STobias Grosser 794fa7b0802STobias Grosser // If F is not available, declare it. 
795fa7b0802STobias Grosser if (!F) { 796fa7b0802STobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 797fa7b0802STobias Grosser std::vector<Type *> Args; 798fa7b0802STobias Grosser FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 799fa7b0802STobias Grosser F = Function::Create(Ty, Linkage, Name, M); 800fa7b0802STobias Grosser } 801fa7b0802STobias Grosser 802fa7b0802STobias Grosser return Builder.CreateCall(F, {}); 803fa7b0802STobias Grosser } 804fa7b0802STobias Grosser 805fa7b0802STobias Grosser void GPUNodeBuilder::createCallFreeContext(Value *Context) { 806fa7b0802STobias Grosser const char *Name = "polly_freeContext"; 807fa7b0802STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 808fa7b0802STobias Grosser Function *F = M->getFunction(Name); 809fa7b0802STobias Grosser 810fa7b0802STobias Grosser // If F is not available, declare it. 811fa7b0802STobias Grosser if (!F) { 812fa7b0802STobias Grosser GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 813fa7b0802STobias Grosser std::vector<Type *> Args; 814fa7b0802STobias Grosser Args.push_back(Builder.getInt8PtrTy()); 815fa7b0802STobias Grosser FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 816fa7b0802STobias Grosser F = Function::Create(Ty, Linkage, Name, M); 817fa7b0802STobias Grosser } 818fa7b0802STobias Grosser 819fa7b0802STobias Grosser Builder.CreateCall(F, {Context}); 820fa7b0802STobias Grosser } 821fa7b0802STobias Grosser 8225260c041STobias Grosser /// Check if one string is a prefix of another. 8235260c041STobias Grosser /// 8245260c041STobias Grosser /// @param String The string in which to look for the prefix. 8255260c041STobias Grosser /// @param Prefix The prefix to look for. 
8265260c041STobias Grosser static bool isPrefix(std::string String, std::string Prefix) { 8275260c041STobias Grosser return String.find(Prefix) == 0; 8285260c041STobias Grosser } 8295260c041STobias Grosser 83013c78e4dSTobias Grosser Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 83113c78e4dSTobias Grosser isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 83213c78e4dSTobias Grosser Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 83313c78e4dSTobias Grosser 83413c78e4dSTobias Grosser if (!gpu_array_is_scalar(Array)) { 83513c78e4dSTobias Grosser auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 83613c78e4dSTobias Grosser isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 83713c78e4dSTobias Grosser 83813c78e4dSTobias Grosser for (unsigned int i = 1; i < Array->n_index; i++) { 83913c78e4dSTobias Grosser isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 84013c78e4dSTobias Grosser isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 84113c78e4dSTobias Grosser Res = isl_ast_expr_mul(Res, Expr); 84213c78e4dSTobias Grosser } 84313c78e4dSTobias Grosser 84413c78e4dSTobias Grosser Value *NumElements = ExprBuilder.create(Res); 845b79f4d39STobias Grosser if (NumElements->getType() != ArraySize->getType()) 846b79f4d39STobias Grosser NumElements = Builder.CreateSExt(NumElements, ArraySize->getType()); 84713c78e4dSTobias Grosser ArraySize = Builder.CreateMul(ArraySize, NumElements); 84813c78e4dSTobias Grosser } 84913c78e4dSTobias Grosser isl_ast_build_free(Build); 85013c78e4dSTobias Grosser return ArraySize; 85113c78e4dSTobias Grosser } 85213c78e4dSTobias Grosser 853aaabbbf8STobias Grosser Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { 854aaabbbf8STobias Grosser if (gpu_array_is_scalar(Array)) 855aaabbbf8STobias Grosser return nullptr; 856aaabbbf8STobias Grosser 857aaabbbf8STobias Grosser isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 
858aaabbbf8STobias Grosser 859aaabbbf8STobias Grosser isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent)); 860aaabbbf8STobias Grosser 861aaabbbf8STobias Grosser isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min)); 862aaabbbf8STobias Grosser 863aaabbbf8STobias Grosser for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) 864aaabbbf8STobias Grosser ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0); 865aaabbbf8STobias Grosser 866aaabbbf8STobias Grosser if (isl_set_is_subset(Min, ZeroSet)) { 867aaabbbf8STobias Grosser isl_set_free(Min); 868aaabbbf8STobias Grosser isl_set_free(ZeroSet); 869aaabbbf8STobias Grosser isl_ast_build_free(Build); 870aaabbbf8STobias Grosser return nullptr; 871aaabbbf8STobias Grosser } 872aaabbbf8STobias Grosser isl_set_free(ZeroSet); 873aaabbbf8STobias Grosser 874aaabbbf8STobias Grosser isl_ast_expr *Result = 875aaabbbf8STobias Grosser isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0)); 876aaabbbf8STobias Grosser 877aaabbbf8STobias Grosser for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) { 878aaabbbf8STobias Grosser if (i > 0) { 879aaabbbf8STobias Grosser isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]); 880aaabbbf8STobias Grosser isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 881aaabbbf8STobias Grosser Result = isl_ast_expr_mul(Result, BExpr); 882aaabbbf8STobias Grosser } 883aaabbbf8STobias Grosser isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i); 884aaabbbf8STobias Grosser isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin); 885aaabbbf8STobias Grosser Result = isl_ast_expr_add(Result, MExpr); 886aaabbbf8STobias Grosser } 887aaabbbf8STobias Grosser 888aaabbbf8STobias Grosser Value *ResultValue = ExprBuilder.create(Result); 889aaabbbf8STobias Grosser isl_set_free(Min); 890aaabbbf8STobias Grosser isl_ast_build_free(Build); 891aaabbbf8STobias Grosser 892aaabbbf8STobias Grosser return ResultValue; 893aaabbbf8STobias Grosser } 894aaabbbf8STobias 
Grosser 895abed4969SSiddharth Bhat Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array, 896abed4969SSiddharth Bhat ScopArrayInfo *ArrayInfo) { 897abed4969SSiddharth Bhat 898abed4969SSiddharth Bhat assert(ManagedMemory && "Only used when you wish to get a host " 899abed4969SSiddharth Bhat "pointer for sending data to the kernel, " 900abed4969SSiddharth Bhat "with managed memory"); 901abed4969SSiddharth Bhat std::map<ScopArrayInfo *, Value *>::iterator it; 902abed4969SSiddharth Bhat if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) { 903abed4969SSiddharth Bhat return it->second; 904abed4969SSiddharth Bhat } else { 905abed4969SSiddharth Bhat Value *HostPtr; 906abed4969SSiddharth Bhat 907abed4969SSiddharth Bhat if (gpu_array_is_scalar(Array)) 908abed4969SSiddharth Bhat HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo); 909abed4969SSiddharth Bhat else 910abed4969SSiddharth Bhat HostPtr = ArrayInfo->getBasePtr(); 911abed4969SSiddharth Bhat 912abed4969SSiddharth Bhat Value *Offset = getArrayOffset(Array); 913abed4969SSiddharth Bhat if (Offset) { 914abed4969SSiddharth Bhat HostPtr = Builder.CreatePointerCast( 915abed4969SSiddharth Bhat HostPtr, ArrayInfo->getElementType()->getPointerTo()); 916abed4969SSiddharth Bhat HostPtr = Builder.CreateGEP(HostPtr, Offset); 917abed4969SSiddharth Bhat } 918abed4969SSiddharth Bhat 919abed4969SSiddharth Bhat HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 920abed4969SSiddharth Bhat DeviceAllocations[ArrayInfo] = HostPtr; 921abed4969SSiddharth Bhat return HostPtr; 922abed4969SSiddharth Bhat } 923abed4969SSiddharth Bhat } 924abed4969SSiddharth Bhat 92513c78e4dSTobias Grosser void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 92613c78e4dSTobias Grosser enum DataDirection Direction) { 927abed4969SSiddharth Bhat assert(!ManagedMemory && "Managed memory needs no data transfers"); 92813c78e4dSTobias Grosser isl_ast_expr *Expr = 
isl_ast_node_user_get_expr(TransferStmt); 92913c78e4dSTobias Grosser isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 93013c78e4dSTobias Grosser isl_id *Id = isl_ast_expr_get_id(Arg); 93113c78e4dSTobias Grosser auto Array = (gpu_array_info *)isl_id_get_user(Id); 93213c78e4dSTobias Grosser auto ScopArray = (ScopArrayInfo *)(Array->user); 93313c78e4dSTobias Grosser 93413c78e4dSTobias Grosser Value *Size = getArraySize(Array); 935aaabbbf8STobias Grosser Value *Offset = getArrayOffset(Array); 93613c78e4dSTobias Grosser Value *DevPtr = DeviceAllocations[ScopArray]; 93713c78e4dSTobias Grosser 938b06ff457STobias Grosser Value *HostPtr; 939b06ff457STobias Grosser 940b06ff457STobias Grosser if (gpu_array_is_scalar(Array)) 941b06ff457STobias Grosser HostPtr = BlockGen.getOrCreateAlloca(ScopArray); 942b06ff457STobias Grosser else 943b06ff457STobias Grosser HostPtr = ScopArray->getBasePtr(); 94413c78e4dSTobias Grosser 945aaabbbf8STobias Grosser if (Offset) { 946aaabbbf8STobias Grosser HostPtr = Builder.CreatePointerCast( 947aaabbbf8STobias Grosser HostPtr, ScopArray->getElementType()->getPointerTo()); 948aaabbbf8STobias Grosser HostPtr = Builder.CreateGEP(HostPtr, Offset); 949aaabbbf8STobias Grosser } 950aaabbbf8STobias Grosser 95113c78e4dSTobias Grosser HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 95213c78e4dSTobias Grosser 953aaabbbf8STobias Grosser if (Offset) { 954aaabbbf8STobias Grosser Size = Builder.CreateSub( 955ff40087aSTobias Grosser Size, Builder.CreateMul( 956ff40087aSTobias Grosser Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); 957aaabbbf8STobias Grosser } 958aaabbbf8STobias Grosser 95913c78e4dSTobias Grosser if (Direction == HOST_TO_DEVICE) 96013c78e4dSTobias Grosser createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 96113c78e4dSTobias Grosser else 96213c78e4dSTobias Grosser createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 96313c78e4dSTobias Grosser 96413c78e4dSTobias Grosser isl_id_free(Id); 
96513c78e4dSTobias Grosser isl_ast_expr_free(Arg); 96613c78e4dSTobias Grosser isl_ast_expr_free(Expr); 96713c78e4dSTobias Grosser isl_ast_node_free(TransferStmt); 96813c78e4dSTobias Grosser } 96913c78e4dSTobias Grosser 9701fb9b64dSTobias Grosser void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 97132837fe3STobias Grosser isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 97232837fe3STobias Grosser isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 97332837fe3STobias Grosser isl_id *Id = isl_ast_expr_get_id(StmtExpr); 97432837fe3STobias Grosser isl_id_free(Id); 97532837fe3STobias Grosser isl_ast_expr_free(StmtExpr); 97632837fe3STobias Grosser 97732837fe3STobias Grosser const char *Str = isl_id_get_name(Id); 97832837fe3STobias Grosser if (!strcmp(Str, "kernel")) { 97932837fe3STobias Grosser createKernel(UserStmt); 98032837fe3STobias Grosser isl_ast_expr_free(Expr); 98132837fe3STobias Grosser return; 98232837fe3STobias Grosser } 98332837fe3STobias Grosser 98413c78e4dSTobias Grosser if (isPrefix(Str, "to_device")) { 985abed4969SSiddharth Bhat if (!ManagedMemory) 98613c78e4dSTobias Grosser createDataTransfer(UserStmt, HOST_TO_DEVICE); 987abed4969SSiddharth Bhat else 988abed4969SSiddharth Bhat isl_ast_node_free(UserStmt); 989abed4969SSiddharth Bhat 99032837fe3STobias Grosser isl_ast_expr_free(Expr); 99113c78e4dSTobias Grosser return; 99213c78e4dSTobias Grosser } 99313c78e4dSTobias Grosser 99413c78e4dSTobias Grosser if (isPrefix(Str, "from_device")) { 995abed4969SSiddharth Bhat if (!ManagedMemory) { 99613c78e4dSTobias Grosser createDataTransfer(UserStmt, DEVICE_TO_HOST); 997abed4969SSiddharth Bhat } else { 998abed4969SSiddharth Bhat createCallSynchronizeDevice(); 999abed4969SSiddharth Bhat isl_ast_node_free(UserStmt); 1000abed4969SSiddharth Bhat } 100113c78e4dSTobias Grosser isl_ast_expr_free(Expr); 100238fc0aedSTobias Grosser return; 100338fc0aedSTobias Grosser } 100438fc0aedSTobias Grosser 10055260c041STobias Grosser isl_id *Anno = 
isl_ast_node_get_annotation(UserStmt); 10065260c041STobias Grosser struct ppcg_kernel_stmt *KernelStmt = 10075260c041STobias Grosser (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 10085260c041STobias Grosser isl_id_free(Anno); 10095260c041STobias Grosser 10105260c041STobias Grosser switch (KernelStmt->type) { 10115260c041STobias Grosser case ppcg_kernel_domain: 1012edb885cbSTobias Grosser createScopStmt(Expr, KernelStmt); 10135260c041STobias Grosser isl_ast_node_free(UserStmt); 10145260c041STobias Grosser return; 10155260c041STobias Grosser case ppcg_kernel_copy: 1016b513b491STobias Grosser createKernelCopy(KernelStmt); 10175260c041STobias Grosser isl_ast_expr_free(Expr); 10185260c041STobias Grosser isl_ast_node_free(UserStmt); 10195260c041STobias Grosser return; 10205260c041STobias Grosser case ppcg_kernel_sync: 10215260c041STobias Grosser createKernelSync(); 10225260c041STobias Grosser isl_ast_expr_free(Expr); 10235260c041STobias Grosser isl_ast_node_free(UserStmt); 10245260c041STobias Grosser return; 10255260c041STobias Grosser } 10265260c041STobias Grosser 10275260c041STobias Grosser isl_ast_expr_free(Expr); 10285260c041STobias Grosser isl_ast_node_free(UserStmt); 10295260c041STobias Grosser return; 10305260c041STobias Grosser } 1031b513b491STobias Grosser void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { 1032b513b491STobias Grosser isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); 1033b513b491STobias Grosser LocalIndex = isl_ast_expr_address_of(LocalIndex); 1034b513b491STobias Grosser Value *LocalAddr = ExprBuilder.create(LocalIndex); 1035b513b491STobias Grosser isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index); 1036b513b491STobias Grosser Index = isl_ast_expr_address_of(Index); 1037b513b491STobias Grosser Value *GlobalAddr = ExprBuilder.create(Index); 1038b513b491STobias Grosser 1039b513b491STobias Grosser if (KernelStmt->u.c.read) { 1040b513b491STobias Grosser LoadInst *Load = 
Builder.CreateLoad(GlobalAddr, "shared.read"); 1041b513b491STobias Grosser Builder.CreateStore(Load, LocalAddr); 1042b513b491STobias Grosser } else { 1043b513b491STobias Grosser LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write"); 1044b513b491STobias Grosser Builder.CreateStore(Load, GlobalAddr); 1045b513b491STobias Grosser } 1046b513b491STobias Grosser } 10475260c041STobias Grosser 1048edb885cbSTobias Grosser void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 1049edb885cbSTobias Grosser ppcg_kernel_stmt *KernelStmt) { 1050edb885cbSTobias Grosser auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1051edb885cbSTobias Grosser isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 1052edb885cbSTobias Grosser 1053edb885cbSTobias Grosser LoopToScevMapT LTS; 1054edb885cbSTobias Grosser LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 1055edb885cbSTobias Grosser 1056edb885cbSTobias Grosser createSubstitutions(Expr, Stmt, LTS); 1057edb885cbSTobias Grosser 1058edb885cbSTobias Grosser if (Stmt->isBlockStmt()) 1059edb885cbSTobias Grosser BlockGen.copyStmt(*Stmt, LTS, Indexes); 1060edb885cbSTobias Grosser else 1061a82c4b5dSTobias Grosser RegionGen.copyStmt(*Stmt, LTS, Indexes); 1062edb885cbSTobias Grosser } 1063edb885cbSTobias Grosser 10645260c041STobias Grosser void GPUNodeBuilder::createKernelSync() { 10655260c041STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 106617f01968SSiddharth Bhat 106717f01968SSiddharth Bhat Function *Sync; 106817f01968SSiddharth Bhat 106917f01968SSiddharth Bhat switch (Arch) { 107017f01968SSiddharth Bhat case GPUArch::NVPTX64: 107117f01968SSiddharth Bhat Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 107217f01968SSiddharth Bhat break; 107317f01968SSiddharth Bhat } 107417f01968SSiddharth Bhat 10755260c041STobias Grosser Builder.CreateCall(Sync, {}); 10765260c041STobias Grosser } 10775260c041STobias Grosser 1078edb885cbSTobias Grosser /// Collect llvm::Values 
referenced from @p Node 1079edb885cbSTobias Grosser /// 1080edb885cbSTobias Grosser /// This function only applies to isl_ast_nodes that are user_nodes referring 1081edb885cbSTobias Grosser /// to a ScopStmt. All other node types are ignore. 1082edb885cbSTobias Grosser /// 1083edb885cbSTobias Grosser /// @param Node The node to collect references for. 1084edb885cbSTobias Grosser /// @param User A user pointer used as storage for the data that is collected. 1085edb885cbSTobias Grosser /// 1086edb885cbSTobias Grosser /// @returns isl_bool_true if data could be collected successfully. 1087edb885cbSTobias Grosser isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 1088edb885cbSTobias Grosser if (isl_ast_node_get_type(Node) != isl_ast_node_user) 1089edb885cbSTobias Grosser return isl_bool_true; 1090edb885cbSTobias Grosser 1091edb885cbSTobias Grosser isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 1092edb885cbSTobias Grosser isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 1093edb885cbSTobias Grosser isl_id *Id = isl_ast_expr_get_id(StmtExpr); 1094edb885cbSTobias Grosser const char *Str = isl_id_get_name(Id); 1095edb885cbSTobias Grosser isl_id_free(Id); 1096edb885cbSTobias Grosser isl_ast_expr_free(StmtExpr); 1097edb885cbSTobias Grosser isl_ast_expr_free(Expr); 1098edb885cbSTobias Grosser 1099edb885cbSTobias Grosser if (!isPrefix(Str, "Stmt")) 1100edb885cbSTobias Grosser return isl_bool_true; 1101edb885cbSTobias Grosser 1102edb885cbSTobias Grosser Id = isl_ast_node_get_annotation(Node); 1103edb885cbSTobias Grosser auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 1104edb885cbSTobias Grosser auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1105edb885cbSTobias Grosser isl_id_free(Id); 1106edb885cbSTobias Grosser 110700bb5a99STobias Grosser addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); 1108edb885cbSTobias Grosser 1109edb885cbSTobias Grosser return isl_bool_true; 1110edb885cbSTobias Grosser } 
1111edb885cbSTobias Grosser 1112edb885cbSTobias Grosser SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 1113edb885cbSTobias Grosser SetVector<Value *> SubtreeValues; 1114edb885cbSTobias Grosser SetVector<const SCEV *> SCEVs; 1115edb885cbSTobias Grosser SetVector<const Loop *> Loops; 1116edb885cbSTobias Grosser SubtreeReferences References = { 1117edb885cbSTobias Grosser LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 1118edb885cbSTobias Grosser 1119edb885cbSTobias Grosser for (const auto &I : IDToValue) 1120edb885cbSTobias Grosser SubtreeValues.insert(I.second); 1121edb885cbSTobias Grosser 1122edb885cbSTobias Grosser isl_ast_node_foreach_descendant_top_down( 1123edb885cbSTobias Grosser Kernel->tree, collectReferencesInGPUStmt, &References); 1124edb885cbSTobias Grosser 1125edb885cbSTobias Grosser for (const SCEV *Expr : SCEVs) 1126edb885cbSTobias Grosser findValues(Expr, SE, SubtreeValues); 1127edb885cbSTobias Grosser 1128edb885cbSTobias Grosser for (auto &SAI : S.arrays()) 1129d7754a12SRoman Gareev SubtreeValues.remove(SAI->getBasePtr()); 1130edb885cbSTobias Grosser 1131edb885cbSTobias Grosser isl_space *Space = S.getParamSpace(); 1132edb885cbSTobias Grosser for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 1133edb885cbSTobias Grosser isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 1134edb885cbSTobias Grosser assert(IDToValue.count(Id)); 1135edb885cbSTobias Grosser Value *Val = IDToValue[Id]; 1136edb885cbSTobias Grosser SubtreeValues.remove(Val); 1137edb885cbSTobias Grosser isl_id_free(Id); 1138edb885cbSTobias Grosser } 1139edb885cbSTobias Grosser isl_space_free(Space); 1140edb885cbSTobias Grosser 1141edb885cbSTobias Grosser for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 1142edb885cbSTobias Grosser isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1143edb885cbSTobias Grosser assert(IDToValue.count(Id)); 1144edb885cbSTobias Grosser Value *Val = 
IDToValue[Id]; 1145edb885cbSTobias Grosser SubtreeValues.remove(Val); 1146edb885cbSTobias Grosser isl_id_free(Id); 1147edb885cbSTobias Grosser } 1148edb885cbSTobias Grosser 1149edb885cbSTobias Grosser return SubtreeValues; 1150edb885cbSTobias Grosser } 1151edb885cbSTobias Grosser 115274dc3cb4STobias Grosser void GPUNodeBuilder::clearDominators(Function *F) { 115374dc3cb4STobias Grosser DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 115474dc3cb4STobias Grosser std::vector<BasicBlock *> Nodes; 115574dc3cb4STobias Grosser for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 115674dc3cb4STobias Grosser Nodes.push_back(I->getBlock()); 115774dc3cb4STobias Grosser 115874dc3cb4STobias Grosser for (BasicBlock *BB : Nodes) 115974dc3cb4STobias Grosser DT.eraseNode(BB); 116074dc3cb4STobias Grosser } 116174dc3cb4STobias Grosser 116274dc3cb4STobias Grosser void GPUNodeBuilder::clearScalarEvolution(Function *F) { 116374dc3cb4STobias Grosser for (BasicBlock &BB : *F) { 116474dc3cb4STobias Grosser Loop *L = LI.getLoopFor(&BB); 116574dc3cb4STobias Grosser if (L) 116674dc3cb4STobias Grosser SE.forgetLoop(L); 116774dc3cb4STobias Grosser } 116874dc3cb4STobias Grosser } 116974dc3cb4STobias Grosser 117074dc3cb4STobias Grosser void GPUNodeBuilder::clearLoops(Function *F) { 117174dc3cb4STobias Grosser for (BasicBlock &BB : *F) { 117274dc3cb4STobias Grosser Loop *L = LI.getLoopFor(&BB); 117374dc3cb4STobias Grosser if (L) 117474dc3cb4STobias Grosser SE.forgetLoop(L); 117574dc3cb4STobias Grosser LI.removeBlock(&BB); 117674dc3cb4STobias Grosser } 117774dc3cb4STobias Grosser } 117874dc3cb4STobias Grosser 117979a947c2STobias Grosser std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { 118079a947c2STobias Grosser std::vector<Value *> Sizes; 118179a947c2STobias Grosser isl_ast_build *Context = isl_ast_build_from_context(S.getContext()); 118279a947c2STobias Grosser 118379a947c2STobias Grosser for (long i = 0; i < Kernel->n_grid; i++) { 
118479a947c2STobias Grosser isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i); 118579a947c2STobias Grosser isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size); 118679a947c2STobias Grosser Value *Res = ExprBuilder.create(GridSize); 118779a947c2STobias Grosser Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 118879a947c2STobias Grosser Sizes.push_back(Res); 118979a947c2STobias Grosser } 119079a947c2STobias Grosser isl_ast_build_free(Context); 119179a947c2STobias Grosser 119279a947c2STobias Grosser for (long i = Kernel->n_grid; i < 3; i++) 119379a947c2STobias Grosser Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 119479a947c2STobias Grosser 119579a947c2STobias Grosser return std::make_tuple(Sizes[0], Sizes[1]); 119679a947c2STobias Grosser } 119779a947c2STobias Grosser 119879a947c2STobias Grosser std::tuple<Value *, Value *, Value *> 119979a947c2STobias Grosser GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { 120079a947c2STobias Grosser std::vector<Value *> Sizes; 120179a947c2STobias Grosser 120279a947c2STobias Grosser for (long i = 0; i < Kernel->n_block; i++) { 120379a947c2STobias Grosser Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); 120479a947c2STobias Grosser Sizes.push_back(Res); 120579a947c2STobias Grosser } 120679a947c2STobias Grosser 120779a947c2STobias Grosser for (long i = Kernel->n_block; i < 3; i++) 120879a947c2STobias Grosser Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 120979a947c2STobias Grosser 121079a947c2STobias Grosser return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); 121179a947c2STobias Grosser } 121279a947c2STobias Grosser 1213a90be207SSiddharth Bhat void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters, 1214a90be207SSiddharth Bhat Instruction *Param, int Index) { 1215a90be207SSiddharth Bhat Value *Slot = Builder.CreateGEP( 1216a90be207SSiddharth Bhat Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 
1217a90be207SSiddharth Bhat Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1218a90be207SSiddharth Bhat Builder.CreateStore(ParamTyped, Slot); 1219a90be207SSiddharth Bhat } 1220a90be207SSiddharth Bhat 122157693272STobias Grosser Value * 122257693272STobias Grosser GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, 122357693272STobias Grosser SetVector<Value *> SubtreeValues) { 1224a90be207SSiddharth Bhat const int NumArgs = F->arg_size(); 1225a90be207SSiddharth Bhat std::vector<int> ArgSizes(NumArgs); 1226a90be207SSiddharth Bhat 1227a90be207SSiddharth Bhat Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); 122879a947c2STobias Grosser 122979a947c2STobias Grosser BasicBlock *EntryBlock = 123079a947c2STobias Grosser &Builder.GetInsertBlock()->getParent()->getEntryBlock(); 123167726b32STobias Grosser auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); 123279a947c2STobias Grosser std::string Launch = "polly_launch_" + std::to_string(Kernel->id); 123367726b32STobias Grosser Instruction *Parameters = new AllocaInst( 123467726b32STobias Grosser ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator()); 123579a947c2STobias Grosser 123679a947c2STobias Grosser int Index = 0; 123779a947c2STobias Grosser for (long i = 0; i < Prog->n_array; i++) { 123879a947c2STobias Grosser if (!ppcg_kernel_requires_array_argument(Kernel, i)) 123979a947c2STobias Grosser continue; 124079a947c2STobias Grosser 124179a947c2STobias Grosser isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 124279a947c2STobias Grosser const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 124379a947c2STobias Grosser 1244a90be207SSiddharth Bhat ArgSizes[Index] = SAI->getElemSizeInBytes(); 1245a90be207SSiddharth Bhat 1246abed4969SSiddharth Bhat Value *DevArray = nullptr; 1247abed4969SSiddharth Bhat if (ManagedMemory) { 1248abed4969SSiddharth Bhat DevArray = getOrCreateManagedDeviceArray( 
1249abed4969SSiddharth Bhat &Prog->array[i], const_cast<ScopArrayInfo *>(SAI)); 1250abed4969SSiddharth Bhat } else { 1251abed4969SSiddharth Bhat DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)]; 125279a947c2STobias Grosser DevArray = createCallGetDevicePtr(DevArray); 1253abed4969SSiddharth Bhat } 1254abed4969SSiddharth Bhat assert(DevArray != nullptr && "Array to be offloaded to device not " 1255abed4969SSiddharth Bhat "initialized"); 1256aaabbbf8STobias Grosser Value *Offset = getArrayOffset(&Prog->array[i]); 1257aaabbbf8STobias Grosser 1258aaabbbf8STobias Grosser if (Offset) { 1259aaabbbf8STobias Grosser DevArray = Builder.CreatePointerCast( 1260aaabbbf8STobias Grosser DevArray, SAI->getElementType()->getPointerTo()); 1261aaabbbf8STobias Grosser DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset)); 1262aaabbbf8STobias Grosser DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); 1263aaabbbf8STobias Grosser } 1264fe74a7a1STobias Grosser Value *Slot = Builder.CreateGEP( 1265fe74a7a1STobias Grosser Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1266aaabbbf8STobias Grosser 1267fe74a7a1STobias Grosser if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1268abed4969SSiddharth Bhat Value *ValPtr = nullptr; 1269abed4969SSiddharth Bhat if (ManagedMemory) 1270abed4969SSiddharth Bhat ValPtr = DevArray; 1271abed4969SSiddharth Bhat else 1272abed4969SSiddharth Bhat ValPtr = BlockGen.getOrCreateAlloca(SAI); 1273abed4969SSiddharth Bhat 1274abed4969SSiddharth Bhat assert(ValPtr != nullptr && "ValPtr that should point to a valid object" 1275abed4969SSiddharth Bhat " to be stored into Parameters"); 1276fe74a7a1STobias Grosser Value *ValPtrCast = 1277fe74a7a1STobias Grosser Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); 1278fe74a7a1STobias Grosser Builder.CreateStore(ValPtrCast, Slot); 1279fe74a7a1STobias Grosser } else { 128067726b32STobias Grosser Instruction *Param = 128167726b32STobias Grosser new 
AllocaInst(Builder.getInt8PtrTy(), AddressSpace, 128267726b32STobias Grosser Launch + "_param_" + std::to_string(Index), 128379a947c2STobias Grosser EntryBlock->getTerminator()); 128479a947c2STobias Grosser Builder.CreateStore(DevArray, Param); 128579a947c2STobias Grosser Value *ParamTyped = 128679a947c2STobias Grosser Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 128779a947c2STobias Grosser Builder.CreateStore(ParamTyped, Slot); 1288fe74a7a1STobias Grosser } 128979a947c2STobias Grosser Index++; 129079a947c2STobias Grosser } 129179a947c2STobias Grosser 1292a490147cSTobias Grosser int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1293a490147cSTobias Grosser 1294a490147cSTobias Grosser for (long i = 0; i < NumHostIters; i++) { 1295a490147cSTobias Grosser isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1296a490147cSTobias Grosser Value *Val = IDToValue[Id]; 1297a490147cSTobias Grosser isl_id_free(Id); 1298a90be207SSiddharth Bhat 1299a90be207SSiddharth Bhat ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1300a90be207SSiddharth Bhat 130167726b32STobias Grosser Instruction *Param = 130267726b32STobias Grosser new AllocaInst(Val->getType(), AddressSpace, 130367726b32STobias Grosser Launch + "_param_" + std::to_string(Index), 1304a490147cSTobias Grosser EntryBlock->getTerminator()); 1305a490147cSTobias Grosser Builder.CreateStore(Val, Param); 1306a90be207SSiddharth Bhat insertStoreParameter(Parameters, Param, Index); 1307a490147cSTobias Grosser Index++; 1308a490147cSTobias Grosser } 1309a490147cSTobias Grosser 1310d8b94bcaSTobias Grosser int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1311d8b94bcaSTobias Grosser 1312d8b94bcaSTobias Grosser for (long i = 0; i < NumVars; i++) { 1313d8b94bcaSTobias Grosser isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1314d8b94bcaSTobias Grosser Value *Val = IDToValue[Id]; 1315d8b94bcaSTobias Grosser isl_id_free(Id); 1316a90be207SSiddharth Bhat 
1317a90be207SSiddharth Bhat ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1318a90be207SSiddharth Bhat 131967726b32STobias Grosser Instruction *Param = 132067726b32STobias Grosser new AllocaInst(Val->getType(), AddressSpace, 132167726b32STobias Grosser Launch + "_param_" + std::to_string(Index), 1322d8b94bcaSTobias Grosser EntryBlock->getTerminator()); 1323d8b94bcaSTobias Grosser Builder.CreateStore(Val, Param); 1324a90be207SSiddharth Bhat insertStoreParameter(Parameters, Param, Index); 1325d8b94bcaSTobias Grosser Index++; 1326d8b94bcaSTobias Grosser } 1327d8b94bcaSTobias Grosser 132857693272STobias Grosser for (auto Val : SubtreeValues) { 1329a90be207SSiddharth Bhat ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1330a90be207SSiddharth Bhat 133167726b32STobias Grosser Instruction *Param = 133267726b32STobias Grosser new AllocaInst(Val->getType(), AddressSpace, 133367726b32STobias Grosser Launch + "_param_" + std::to_string(Index), 133457693272STobias Grosser EntryBlock->getTerminator()); 133557693272STobias Grosser Builder.CreateStore(Val, Param); 1336a90be207SSiddharth Bhat insertStoreParameter(Parameters, Param, Index); 1337a90be207SSiddharth Bhat Index++; 1338a90be207SSiddharth Bhat } 1339a90be207SSiddharth Bhat 1340a90be207SSiddharth Bhat for (int i = 0; i < NumArgs; i++) { 1341a90be207SSiddharth Bhat Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); 1342a90be207SSiddharth Bhat Instruction *Param = 1343a90be207SSiddharth Bhat new AllocaInst(Builder.getInt32Ty(), AddressSpace, 1344a90be207SSiddharth Bhat Launch + "_param_size_" + std::to_string(i), 1345a90be207SSiddharth Bhat EntryBlock->getTerminator()); 1346a90be207SSiddharth Bhat Builder.CreateStore(Val, Param); 1347a90be207SSiddharth Bhat insertStoreParameter(Parameters, Param, Index); 134857693272STobias Grosser Index++; 134957693272STobias Grosser } 135057693272STobias Grosser 135179a947c2STobias Grosser auto Location = EntryBlock->getTerminator(); 135279a947c2STobias Grosser 
return new BitCastInst(Parameters, Builder.getInt8PtrTy(), 135379a947c2STobias Grosser Launch + "_params_i8ptr", Location); 135479a947c2STobias Grosser } 135579a947c2STobias Grosser 135632837fe3STobias Grosser void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 135732837fe3STobias Grosser isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 135832837fe3STobias Grosser ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 135932837fe3STobias Grosser isl_id_free(Id); 136032837fe3STobias Grosser isl_ast_node_free(KernelStmt); 136132837fe3STobias Grosser 1362bc653f20STobias Grosser if (Kernel->n_grid > 1) 1363bc653f20STobias Grosser DeepestParallel = 1364bc653f20STobias Grosser std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set)); 1365bc653f20STobias Grosser else 1366bc653f20STobias Grosser DeepestSequential = 1367bc653f20STobias Grosser std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set)); 1368bc653f20STobias Grosser 1369c1c6a2a6STobias Grosser Value *BlockDimX, *BlockDimY, *BlockDimZ; 1370c1c6a2a6STobias Grosser std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); 1371c1c6a2a6STobias Grosser 1372edb885cbSTobias Grosser SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); 1373edb885cbSTobias Grosser 137432837fe3STobias Grosser assert(Kernel->tree && "Device AST of kernel node is empty"); 137532837fe3STobias Grosser 137632837fe3STobias Grosser Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 1377472f9654STobias Grosser IslExprBuilder::IDToValueTy HostIDs = IDToValue; 1378edb885cbSTobias Grosser ValueMapT HostValueMap = ValueMap; 1379587f1f57STobias Grosser BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; 1380b06ff457STobias Grosser ScalarMap.clear(); 138132837fe3STobias Grosser 1382edb885cbSTobias Grosser SetVector<const Loop *> Loops; 1383edb885cbSTobias Grosser 1384edb885cbSTobias Grosser // Create for all loops we depend on values that contain the current loop 
1385edb885cbSTobias Grosser // iteration. These values are necessary to generate code for SCEVs that 1386edb885cbSTobias Grosser // depend on such loops. As a result we need to pass them to the subfunction. 1387edb885cbSTobias Grosser for (const Loop *L : Loops) { 1388edb885cbSTobias Grosser const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 1389edb885cbSTobias Grosser SE.getUnknown(Builder.getInt64(1)), 1390edb885cbSTobias Grosser L, SCEV::FlagAnyWrap); 1391edb885cbSTobias Grosser Value *V = generateSCEV(OuterLIV); 1392edb885cbSTobias Grosser OutsideLoopIterations[L] = SE.getUnknown(V); 1393edb885cbSTobias Grosser SubtreeValues.insert(V); 1394edb885cbSTobias Grosser } 1395edb885cbSTobias Grosser 1396edb885cbSTobias Grosser createKernelFunction(Kernel, SubtreeValues); 139732837fe3STobias Grosser 139859ab0705STobias Grosser create(isl_ast_node_copy(Kernel->tree)); 139959ab0705STobias Grosser 140051dfc275STobias Grosser finalizeKernelArguments(Kernel); 140174dc3cb4STobias Grosser Function *F = Builder.GetInsertBlock()->getParent(); 1402c1c6a2a6STobias Grosser addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); 140374dc3cb4STobias Grosser clearDominators(F); 140474dc3cb4STobias Grosser clearScalarEvolution(F); 140574dc3cb4STobias Grosser clearLoops(F); 140674dc3cb4STobias Grosser 1407472f9654STobias Grosser IDToValue = HostIDs; 140832837fe3STobias Grosser 1409b06ff457STobias Grosser ValueMap = std::move(HostValueMap); 1410b06ff457STobias Grosser ScalarMap = std::move(HostScalarMap); 1411edb885cbSTobias Grosser EscapeMap.clear(); 1412edb885cbSTobias Grosser IDToSAI.clear(); 141374dc3cb4STobias Grosser Annotator.resetAlternativeAliasBases(); 141474dc3cb4STobias Grosser for (auto &BasePtr : LocalArrays) 14154d5a9172STobias Grosser S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array); 141674dc3cb4STobias Grosser LocalArrays.clear(); 1417edb885cbSTobias Grosser 141851dfc275STobias Grosser std::string ASMString = 
finalizeKernelFunction(); 141951dfc275STobias Grosser Builder.SetInsertPoint(&HostInsertPoint); 142057693272STobias Grosser Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); 142179a947c2STobias Grosser 142257793596STobias Grosser std::string Name = "kernel_" + std::to_string(Kernel->id); 142357793596STobias Grosser Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); 142457793596STobias Grosser Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); 142557793596STobias Grosser Value *GPUKernel = createCallGetKernel(KernelString, NameString); 142679a947c2STobias Grosser 142779a947c2STobias Grosser Value *GridDimX, *GridDimY; 142879a947c2STobias Grosser std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); 142979a947c2STobias Grosser 143079a947c2STobias Grosser createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 143179a947c2STobias Grosser BlockDimZ, Parameters); 143257793596STobias Grosser createCallFreeKernel(GPUKernel); 1433b513b491STobias Grosser 1434b513b491STobias Grosser for (auto Id : KernelIds) 1435b513b491STobias Grosser isl_id_free(Id); 1436b513b491STobias Grosser 1437b513b491STobias Grosser KernelIds.clear(); 143832837fe3STobias Grosser } 143932837fe3STobias Grosser 144032837fe3STobias Grosser /// Compute the DataLayout string for the NVPTX backend. 144132837fe3STobias Grosser /// 144232837fe3STobias Grosser /// @param is64Bit Are we looking for a 64 bit architecture? 
144332837fe3STobias Grosser static std::string computeNVPTXDataLayout(bool is64Bit) { 1444d277fedaSSiddharth Bhat std::string Ret = ""; 144532837fe3STobias Grosser 1446d277fedaSSiddharth Bhat if (!is64Bit) { 1447d277fedaSSiddharth Bhat Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1448d277fedaSSiddharth Bhat "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1449d277fedaSSiddharth Bhat "64-v128:128:128-n16:32:64"; 1450d277fedaSSiddharth Bhat } else { 1451d277fedaSSiddharth Bhat Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1452d277fedaSSiddharth Bhat "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1453d277fedaSSiddharth Bhat "64-v128:128:128-n16:32:64"; 1454d277fedaSSiddharth Bhat } 145532837fe3STobias Grosser 145632837fe3STobias Grosser return Ret; 145732837fe3STobias Grosser } 145832837fe3STobias Grosser 1459edb885cbSTobias Grosser Function * 1460edb885cbSTobias Grosser GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 1461edb885cbSTobias Grosser SetVector<Value *> &SubtreeValues) { 146232837fe3STobias Grosser std::vector<Type *> Args; 146332837fe3STobias Grosser std::string Identifier = "kernel_" + std::to_string(Kernel->id); 146432837fe3STobias Grosser 146532837fe3STobias Grosser for (long i = 0; i < Prog->n_array; i++) { 146632837fe3STobias Grosser if (!ppcg_kernel_requires_array_argument(Kernel, i)) 146732837fe3STobias Grosser continue; 146832837fe3STobias Grosser 1469fe74a7a1STobias Grosser if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1470fe74a7a1STobias Grosser isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1471fe74a7a1STobias Grosser const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 1472fe74a7a1STobias Grosser Args.push_back(SAI->getElementType()); 1473fe74a7a1STobias Grosser } else { 1474d277fedaSSiddharth Bhat static const int UseGlobalMemory = 1; 1475d277fedaSSiddharth Bhat Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); 147632837fe3STobias Grosser } 
1477fe74a7a1STobias Grosser } 147832837fe3STobias Grosser 1479f6044bd0STobias Grosser int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1480f6044bd0STobias Grosser 1481f6044bd0STobias Grosser for (long i = 0; i < NumHostIters; i++) 1482f6044bd0STobias Grosser Args.push_back(Builder.getInt64Ty()); 1483f6044bd0STobias Grosser 1484c84a1995STobias Grosser int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1485c84a1995STobias Grosser 1486cf66ef26STobias Grosser for (long i = 0; i < NumVars; i++) { 1487cf66ef26STobias Grosser isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1488cf66ef26STobias Grosser Value *Val = IDToValue[Id]; 1489cf66ef26STobias Grosser isl_id_free(Id); 1490cf66ef26STobias Grosser Args.push_back(Val->getType()); 1491cf66ef26STobias Grosser } 1492c84a1995STobias Grosser 1493edb885cbSTobias Grosser for (auto *V : SubtreeValues) 1494edb885cbSTobias Grosser Args.push_back(V->getType()); 1495edb885cbSTobias Grosser 149632837fe3STobias Grosser auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 149732837fe3STobias Grosser auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 149832837fe3STobias Grosser GPUModule.get()); 149917f01968SSiddharth Bhat 150017f01968SSiddharth Bhat switch (Arch) { 150117f01968SSiddharth Bhat case GPUArch::NVPTX64: 150232837fe3STobias Grosser FN->setCallingConv(CallingConv::PTX_Kernel); 150317f01968SSiddharth Bhat break; 150417f01968SSiddharth Bhat } 150532837fe3STobias Grosser 150632837fe3STobias Grosser auto Arg = FN->arg_begin(); 150732837fe3STobias Grosser for (long i = 0; i < Kernel->n_array; i++) { 150832837fe3STobias Grosser if (!ppcg_kernel_requires_array_argument(Kernel, i)) 150932837fe3STobias Grosser continue; 151032837fe3STobias Grosser 1511edb885cbSTobias Grosser Arg->setName(Kernel->array[i].array->name); 1512edb885cbSTobias Grosser 1513edb885cbSTobias Grosser isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1514edb885cbSTobias 
Grosser const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1515edb885cbSTobias Grosser Type *EleTy = SAI->getElementType(); 1516edb885cbSTobias Grosser Value *Val = &*Arg; 1517edb885cbSTobias Grosser SmallVector<const SCEV *, 4> Sizes; 1518edb885cbSTobias Grosser isl_ast_build *Build = 1519edb885cbSTobias Grosser isl_ast_build_from_context(isl_set_copy(Prog->context)); 1520f5aff704SRoman Gareev Sizes.push_back(nullptr); 1521edb885cbSTobias Grosser for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 1522edb885cbSTobias Grosser isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 1523edb885cbSTobias Grosser Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); 1524edb885cbSTobias Grosser auto V = ExprBuilder.create(DimSize); 1525edb885cbSTobias Grosser Sizes.push_back(SE.getSCEV(V)); 1526edb885cbSTobias Grosser } 1527edb885cbSTobias Grosser const ScopArrayInfo *SAIRep = 15284d5a9172STobias Grosser S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array); 152974dc3cb4STobias Grosser LocalArrays.push_back(Val); 1530edb885cbSTobias Grosser 1531edb885cbSTobias Grosser isl_ast_build_free(Build); 1532b513b491STobias Grosser KernelIds.push_back(Id); 1533edb885cbSTobias Grosser IDToSAI[Id] = SAIRep; 153432837fe3STobias Grosser Arg++; 153532837fe3STobias Grosser } 153632837fe3STobias Grosser 1537f6044bd0STobias Grosser for (long i = 0; i < NumHostIters; i++) { 1538f6044bd0STobias Grosser isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1539f6044bd0STobias Grosser Arg->setName(isl_id_get_name(Id)); 1540f6044bd0STobias Grosser IDToValue[Id] = &*Arg; 1541f6044bd0STobias Grosser KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1542f6044bd0STobias Grosser Arg++; 1543f6044bd0STobias Grosser } 1544f6044bd0STobias Grosser 1545c84a1995STobias Grosser for (long i = 0; i < NumVars; i++) { 1546c84a1995STobias Grosser isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1547c84a1995STobias Grosser 
Arg->setName(isl_id_get_name(Id)); 154812453403STobias Grosser Value *Val = IDToValue[Id]; 154912453403STobias Grosser ValueMap[Val] = &*Arg; 1550c84a1995STobias Grosser IDToValue[Id] = &*Arg; 1551c84a1995STobias Grosser KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1552c84a1995STobias Grosser Arg++; 1553c84a1995STobias Grosser } 1554c84a1995STobias Grosser 1555edb885cbSTobias Grosser for (auto *V : SubtreeValues) { 1556edb885cbSTobias Grosser Arg->setName(V->getName()); 1557edb885cbSTobias Grosser ValueMap[V] = &*Arg; 1558edb885cbSTobias Grosser Arg++; 1559edb885cbSTobias Grosser } 1560edb885cbSTobias Grosser 156132837fe3STobias Grosser return FN; 156232837fe3STobias Grosser } 156332837fe3STobias Grosser 1564472f9654STobias Grosser void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 156517f01968SSiddharth Bhat Intrinsic::ID IntrinsicsBID[2]; 156617f01968SSiddharth Bhat Intrinsic::ID IntrinsicsTID[3]; 1567472f9654STobias Grosser 156817f01968SSiddharth Bhat switch (Arch) { 156917f01968SSiddharth Bhat case GPUArch::NVPTX64: 157017f01968SSiddharth Bhat IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; 157117f01968SSiddharth Bhat IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; 157217f01968SSiddharth Bhat 157317f01968SSiddharth Bhat IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; 157417f01968SSiddharth Bhat IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; 157517f01968SSiddharth Bhat IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; 157617f01968SSiddharth Bhat break; 157717f01968SSiddharth Bhat } 1578472f9654STobias Grosser 1579472f9654STobias Grosser auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 1580472f9654STobias Grosser std::string Name = isl_id_get_name(Id); 1581472f9654STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1582472f9654STobias Grosser Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 1583472f9654STobias Grosser Value 
*Val = Builder.CreateCall(IntrinsicFn, {}); 1584472f9654STobias Grosser Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 1585472f9654STobias Grosser IDToValue[Id] = Val; 1586472f9654STobias Grosser KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1587472f9654STobias Grosser }; 1588472f9654STobias Grosser 1589472f9654STobias Grosser for (int i = 0; i < Kernel->n_grid; ++i) { 1590472f9654STobias Grosser isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 1591472f9654STobias Grosser addId(Id, IntrinsicsBID[i]); 1592472f9654STobias Grosser } 1593472f9654STobias Grosser 1594472f9654STobias Grosser for (int i = 0; i < Kernel->n_block; ++i) { 1595472f9654STobias Grosser isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 1596472f9654STobias Grosser addId(Id, IntrinsicsTID[i]); 1597472f9654STobias Grosser } 1598472f9654STobias Grosser } 1599472f9654STobias Grosser 160000bb5a99STobias Grosser void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { 160100bb5a99STobias Grosser auto Arg = FN->arg_begin(); 160200bb5a99STobias Grosser for (long i = 0; i < Kernel->n_array; i++) { 160300bb5a99STobias Grosser if (!ppcg_kernel_requires_array_argument(Kernel, i)) 160400bb5a99STobias Grosser continue; 160500bb5a99STobias Grosser 160600bb5a99STobias Grosser isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 160700bb5a99STobias Grosser const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 160800bb5a99STobias Grosser isl_id_free(Id); 160900bb5a99STobias Grosser 161000bb5a99STobias Grosser if (SAI->getNumberOfDimensions() > 0) { 161100bb5a99STobias Grosser Arg++; 161200bb5a99STobias Grosser continue; 161300bb5a99STobias Grosser } 161400bb5a99STobias Grosser 1615fe74a7a1STobias Grosser Value *Val = &*Arg; 1616fe74a7a1STobias Grosser 1617fe74a7a1STobias Grosser if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { 161800bb5a99STobias Grosser Type *TypePtr = 
SAI->getElementType()->getPointerTo(); 1619fe74a7a1STobias Grosser Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); 1620fe74a7a1STobias Grosser Val = Builder.CreateLoad(TypedArgPtr); 1621fe74a7a1STobias Grosser } 1622fe74a7a1STobias Grosser 1623fe74a7a1STobias Grosser Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 162400bb5a99STobias Grosser Builder.CreateStore(Val, Alloca); 162500bb5a99STobias Grosser 162600bb5a99STobias Grosser Arg++; 162700bb5a99STobias Grosser } 162800bb5a99STobias Grosser } 162900bb5a99STobias Grosser 163051dfc275STobias Grosser void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { 163151dfc275STobias Grosser auto *FN = Builder.GetInsertBlock()->getParent(); 163251dfc275STobias Grosser auto Arg = FN->arg_begin(); 163351dfc275STobias Grosser 163451dfc275STobias Grosser bool StoredScalar = false; 163551dfc275STobias Grosser for (long i = 0; i < Kernel->n_array; i++) { 163651dfc275STobias Grosser if (!ppcg_kernel_requires_array_argument(Kernel, i)) 163751dfc275STobias Grosser continue; 163851dfc275STobias Grosser 163951dfc275STobias Grosser isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 164051dfc275STobias Grosser const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 164151dfc275STobias Grosser isl_id_free(Id); 164251dfc275STobias Grosser 164351dfc275STobias Grosser if (SAI->getNumberOfDimensions() > 0) { 164451dfc275STobias Grosser Arg++; 164551dfc275STobias Grosser continue; 164651dfc275STobias Grosser } 164751dfc275STobias Grosser 164851dfc275STobias Grosser if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 164951dfc275STobias Grosser Arg++; 165051dfc275STobias Grosser continue; 165151dfc275STobias Grosser } 165251dfc275STobias Grosser 165351dfc275STobias Grosser Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 165451dfc275STobias Grosser Value *ArgPtr = &*Arg; 165551dfc275STobias Grosser Type *TypePtr = SAI->getElementType()->getPointerTo(); 165651dfc275STobias Grosser 
Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); 165751dfc275STobias Grosser Value *Val = Builder.CreateLoad(Alloca); 165851dfc275STobias Grosser Builder.CreateStore(Val, TypedArgPtr); 165951dfc275STobias Grosser StoredScalar = true; 166051dfc275STobias Grosser 166151dfc275STobias Grosser Arg++; 166251dfc275STobias Grosser } 166351dfc275STobias Grosser 166451dfc275STobias Grosser if (StoredScalar) 166551dfc275STobias Grosser /// In case more than one thread contains scalar stores, the generated 166651dfc275STobias Grosser /// code might be incorrect, if we only store at the end of the kernel. 166751dfc275STobias Grosser /// To support this case we need to store these scalars back at each 166851dfc275STobias Grosser /// memory store or at least before each kernel barrier. 166951dfc275STobias Grosser if (Kernel->n_block != 0 || Kernel->n_grid != 0) 167051dfc275STobias Grosser BuildSuccessful = 0; 167151dfc275STobias Grosser } 167251dfc275STobias Grosser 1673b513b491STobias Grosser void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { 1674b513b491STobias Grosser Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1675b513b491STobias Grosser 1676b513b491STobias Grosser for (int i = 0; i < Kernel->n_var; ++i) { 1677b513b491STobias Grosser struct ppcg_kernel_var &Var = Kernel->var[i]; 1678b513b491STobias Grosser isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set); 1679b513b491STobias Grosser Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType(); 1680b513b491STobias Grosser 1681f919d8b3STobias Grosser Type *ArrayTy = EleTy; 1682b513b491STobias Grosser SmallVector<const SCEV *, 4> Sizes; 1683b513b491STobias Grosser 1684f5aff704SRoman Gareev Sizes.push_back(nullptr); 1685928d7573STobias Grosser for (unsigned int j = 1; j < Var.array->n_index; ++j) { 1686b513b491STobias Grosser isl_val *Val = isl_vec_get_element_val(Var.size, j); 1687f919d8b3STobias Grosser long Bound = isl_val_get_num_si(Val); 
1688b513b491STobias Grosser isl_val_free(Val); 1689b513b491STobias Grosser Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound)); 1690928d7573STobias Grosser } 1691928d7573STobias Grosser 1692928d7573STobias Grosser for (int j = Var.array->n_index - 1; j >= 0; --j) { 1693928d7573STobias Grosser isl_val *Val = isl_vec_get_element_val(Var.size, j); 1694928d7573STobias Grosser long Bound = isl_val_get_num_si(Val); 1695928d7573STobias Grosser isl_val_free(Val); 1696b513b491STobias Grosser ArrayTy = ArrayType::get(ArrayTy, Bound); 1697b513b491STobias Grosser } 1698b513b491STobias Grosser 1699130ca30fSTobias Grosser const ScopArrayInfo *SAI; 1700130ca30fSTobias Grosser Value *Allocation; 1701130ca30fSTobias Grosser if (Var.type == ppcg_access_shared) { 1702130ca30fSTobias Grosser auto GlobalVar = new GlobalVariable( 1703130ca30fSTobias Grosser *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, 1704130ca30fSTobias Grosser nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3); 1705130ca30fSTobias Grosser GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8); 1706f919d8b3STobias Grosser GlobalVar->setInitializer(Constant::getNullValue(ArrayTy)); 1707f919d8b3STobias Grosser 1708130ca30fSTobias Grosser Allocation = GlobalVar; 1709130ca30fSTobias Grosser } else if (Var.type == ppcg_access_private) { 1710130ca30fSTobias Grosser Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array"); 1711130ca30fSTobias Grosser } else { 1712130ca30fSTobias Grosser llvm_unreachable("unknown variable type"); 1713130ca30fSTobias Grosser } 17144d5a9172STobias Grosser SAI = 17154d5a9172STobias Grosser S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array); 1716b513b491STobias Grosser Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr); 1717130ca30fSTobias Grosser IDToValue[Id] = Allocation; 1718130ca30fSTobias Grosser LocalArrays.push_back(Allocation); 1719b513b491STobias Grosser KernelIds.push_back(Id); 1720b513b491STobias Grosser 
IDToSAI[Id] = SAI; 1721b513b491STobias Grosser } 1722b513b491STobias Grosser } 1723b513b491STobias Grosser 1724edb885cbSTobias Grosser void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, 1725edb885cbSTobias Grosser SetVector<Value *> &SubtreeValues) { 172632837fe3STobias Grosser std::string Identifier = "kernel_" + std::to_string(Kernel->id); 172732837fe3STobias Grosser GPUModule.reset(new Module(Identifier, Builder.getContext())); 172817f01968SSiddharth Bhat 172917f01968SSiddharth Bhat switch (Arch) { 173017f01968SSiddharth Bhat case GPUArch::NVPTX64: 173117f01968SSiddharth Bhat if (Runtime == GPURuntime::CUDA) 173232837fe3STobias Grosser GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 173317f01968SSiddharth Bhat else if (Runtime == GPURuntime::OpenCL) 173417f01968SSiddharth Bhat GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); 173532837fe3STobias Grosser GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); 173617f01968SSiddharth Bhat break; 173717f01968SSiddharth Bhat } 173832837fe3STobias Grosser 1739edb885cbSTobias Grosser Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); 174032837fe3STobias Grosser 174159ab0705STobias Grosser BasicBlock *PrevBlock = Builder.GetInsertBlock(); 174232837fe3STobias Grosser auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); 174332837fe3STobias Grosser 174459ab0705STobias Grosser DT.addNewBlock(EntryBlock, PrevBlock); 174559ab0705STobias Grosser 174632837fe3STobias Grosser Builder.SetInsertPoint(EntryBlock); 174732837fe3STobias Grosser Builder.CreateRetVoid(); 174832837fe3STobias Grosser Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); 1749472f9654STobias Grosser 1750629109b6STobias Grosser ScopDetection::markFunctionAsInvalid(FN); 1751629109b6STobias Grosser 175200bb5a99STobias Grosser prepareKernelArguments(Kernel, FN); 1753b513b491STobias Grosser createKernelVariables(Kernel, FN); 1754472f9654STobias Grosser 
insertKernelIntrinsics(Kernel); 175532837fe3STobias Grosser } 175632837fe3STobias Grosser 175774dc3cb4STobias Grosser std::string GPUNodeBuilder::createKernelASM() { 175817f01968SSiddharth Bhat llvm::Triple GPUTriple; 175917f01968SSiddharth Bhat 176017f01968SSiddharth Bhat switch (Arch) { 176117f01968SSiddharth Bhat case GPUArch::NVPTX64: 176217f01968SSiddharth Bhat switch (Runtime) { 176317f01968SSiddharth Bhat case GPURuntime::CUDA: 176417f01968SSiddharth Bhat GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); 176517f01968SSiddharth Bhat break; 176617f01968SSiddharth Bhat case GPURuntime::OpenCL: 176717f01968SSiddharth Bhat GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); 176817f01968SSiddharth Bhat break; 176917f01968SSiddharth Bhat } 177017f01968SSiddharth Bhat break; 177117f01968SSiddharth Bhat } 177217f01968SSiddharth Bhat 177374dc3cb4STobias Grosser std::string ErrMsg; 177474dc3cb4STobias Grosser auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); 177574dc3cb4STobias Grosser 177674dc3cb4STobias Grosser if (!GPUTarget) { 177774dc3cb4STobias Grosser errs() << ErrMsg << "\n"; 177874dc3cb4STobias Grosser return ""; 177974dc3cb4STobias Grosser } 178074dc3cb4STobias Grosser 178174dc3cb4STobias Grosser TargetOptions Options; 178274dc3cb4STobias Grosser Options.UnsafeFPMath = FastMath; 178317f01968SSiddharth Bhat 178417f01968SSiddharth Bhat std::string subtarget; 178517f01968SSiddharth Bhat 178617f01968SSiddharth Bhat switch (Arch) { 178717f01968SSiddharth Bhat case GPUArch::NVPTX64: 178817f01968SSiddharth Bhat subtarget = CudaVersion; 178917f01968SSiddharth Bhat break; 179017f01968SSiddharth Bhat } 179117f01968SSiddharth Bhat 179217f01968SSiddharth Bhat std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine( 179317f01968SSiddharth Bhat GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>())); 179474dc3cb4STobias Grosser 179574dc3cb4STobias Grosser SmallString<0> ASMString; 
179674dc3cb4STobias Grosser raw_svector_ostream ASMStream(ASMString); 179774dc3cb4STobias Grosser llvm::legacy::PassManager PM; 179874dc3cb4STobias Grosser 179974dc3cb4STobias Grosser PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); 180074dc3cb4STobias Grosser 180174dc3cb4STobias Grosser if (TargetM->addPassesToEmitFile( 180274dc3cb4STobias Grosser PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { 180374dc3cb4STobias Grosser errs() << "The target does not support generation of this file type!\n"; 180474dc3cb4STobias Grosser return ""; 180574dc3cb4STobias Grosser } 180674dc3cb4STobias Grosser 180774dc3cb4STobias Grosser PM.run(*GPUModule); 180874dc3cb4STobias Grosser 180974dc3cb4STobias Grosser return ASMStream.str(); 181074dc3cb4STobias Grosser } 181174dc3cb4STobias Grosser 181257793596STobias Grosser std::string GPUNodeBuilder::finalizeKernelFunction() { 18135857b701STobias Grosser if (verifyModule(*GPUModule)) { 18145857b701STobias Grosser BuildSuccessful = false; 18155857b701STobias Grosser return ""; 18165857b701STobias Grosser } 181732837fe3STobias Grosser 181832837fe3STobias Grosser if (DumpKernelIR) 181932837fe3STobias Grosser outs() << *GPUModule << "\n"; 182032837fe3STobias Grosser 18219a18d559STobias Grosser // Optimize module. 
18229a18d559STobias Grosser llvm::legacy::PassManager OptPasses; 18239a18d559STobias Grosser PassManagerBuilder PassBuilder; 18249a18d559STobias Grosser PassBuilder.OptLevel = 3; 18259a18d559STobias Grosser PassBuilder.SizeLevel = 0; 18269a18d559STobias Grosser PassBuilder.populateModulePassManager(OptPasses); 18279a18d559STobias Grosser OptPasses.run(*GPUModule); 18289a18d559STobias Grosser 182974dc3cb4STobias Grosser std::string Assembly = createKernelASM(); 183074dc3cb4STobias Grosser 183174dc3cb4STobias Grosser if (DumpKernelASM) 183274dc3cb4STobias Grosser outs() << Assembly << "\n"; 183374dc3cb4STobias Grosser 183432837fe3STobias Grosser GPUModule.release(); 1835472f9654STobias Grosser KernelIDs.clear(); 183657793596STobias Grosser 183757793596STobias Grosser return Assembly; 183832837fe3STobias Grosser } 183932837fe3STobias Grosser 18409dfe4e7cSTobias Grosser namespace { 18419dfe4e7cSTobias Grosser class PPCGCodeGeneration : public ScopPass { 18429dfe4e7cSTobias Grosser public: 18439dfe4e7cSTobias Grosser static char ID; 18449dfe4e7cSTobias Grosser 184517f01968SSiddharth Bhat GPURuntime Runtime = GPURuntime::CUDA; 184617f01968SSiddharth Bhat 184717f01968SSiddharth Bhat GPUArch Architecture = GPUArch::NVPTX64; 184817f01968SSiddharth Bhat 1849e938517eSTobias Grosser /// The scop that is currently processed. 1850e938517eSTobias Grosser Scop *S; 1851e938517eSTobias Grosser 185238fc0aedSTobias Grosser LoopInfo *LI; 185338fc0aedSTobias Grosser DominatorTree *DT; 185438fc0aedSTobias Grosser ScalarEvolution *SE; 185538fc0aedSTobias Grosser const DataLayout *DL; 185638fc0aedSTobias Grosser RegionInfo *RI; 185738fc0aedSTobias Grosser 18589dfe4e7cSTobias Grosser PPCGCodeGeneration() : ScopPass(ID) {} 18599dfe4e7cSTobias Grosser 1860e938517eSTobias Grosser /// Construct compilation options for PPCG. 1861e938517eSTobias Grosser /// 1862e938517eSTobias Grosser /// @returns The compilation options. 
1863e938517eSTobias Grosser ppcg_options *createPPCGOptions() { 1864e938517eSTobias Grosser auto DebugOptions = 1865e938517eSTobias Grosser (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 1866e938517eSTobias Grosser auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 1867e938517eSTobias Grosser 1868e938517eSTobias Grosser DebugOptions->dump_schedule_constraints = false; 1869e938517eSTobias Grosser DebugOptions->dump_schedule = false; 1870e938517eSTobias Grosser DebugOptions->dump_final_schedule = false; 1871e938517eSTobias Grosser DebugOptions->dump_sizes = false; 18728950ceadSTobias Grosser DebugOptions->verbose = false; 1873e938517eSTobias Grosser 1874e938517eSTobias Grosser Options->debug = DebugOptions; 1875e938517eSTobias Grosser 1876e938517eSTobias Grosser Options->reschedule = true; 1877e938517eSTobias Grosser Options->scale_tile_loops = false; 1878e938517eSTobias Grosser Options->wrap = false; 1879e938517eSTobias Grosser 1880e938517eSTobias Grosser Options->non_negative_parameters = false; 1881e938517eSTobias Grosser Options->ctx = nullptr; 1882e938517eSTobias Grosser Options->sizes = nullptr; 1883e938517eSTobias Grosser 18844eaedde5STobias Grosser Options->tile_size = 32; 18854eaedde5STobias Grosser 1886130ca30fSTobias Grosser Options->use_private_memory = PrivateMemory; 1887b513b491STobias Grosser Options->use_shared_memory = SharedMemory; 1888b513b491STobias Grosser Options->max_shared_memory = 48 * 1024; 1889e938517eSTobias Grosser 1890e938517eSTobias Grosser Options->target = PPCG_TARGET_CUDA; 1891e938517eSTobias Grosser Options->openmp = false; 1892e938517eSTobias Grosser Options->linearize_device_arrays = true; 1893e938517eSTobias Grosser Options->live_range_reordering = false; 1894e938517eSTobias Grosser 1895e938517eSTobias Grosser Options->opencl_compiler_options = nullptr; 1896e938517eSTobias Grosser Options->opencl_use_gpu = false; 1897e938517eSTobias Grosser Options->opencl_n_include_file = 0; 1898e938517eSTobias Grosser 
Options->opencl_include_files = nullptr; 1899e938517eSTobias Grosser Options->opencl_print_kernel_types = false; 1900e938517eSTobias Grosser Options->opencl_embed_kernel_code = false; 1901e938517eSTobias Grosser 1902e938517eSTobias Grosser Options->save_schedule_file = nullptr; 1903e938517eSTobias Grosser Options->load_schedule_file = nullptr; 1904e938517eSTobias Grosser 1905e938517eSTobias Grosser return Options; 1906e938517eSTobias Grosser } 1907e938517eSTobias Grosser 1908f384594dSTobias Grosser /// Get a tagged access relation containing all accesses of type @p AccessTy. 1909f384594dSTobias Grosser /// 1910f384594dSTobias Grosser /// Instead of a normal access of the form: 1911f384594dSTobias Grosser /// 1912f384594dSTobias Grosser /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 1913f384594dSTobias Grosser /// 1914f384594dSTobias Grosser /// a tagged access has the form 1915f384594dSTobias Grosser /// 1916f384594dSTobias Grosser /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 1917f384594dSTobias Grosser /// 1918f384594dSTobias Grosser /// where 'id' is an additional space that references the memory access that 1919f384594dSTobias Grosser /// triggered the access. 1920f384594dSTobias Grosser /// 1921f384594dSTobias Grosser /// @param AccessTy The type of the memory accesses to collect. 1922f384594dSTobias Grosser /// 1923f384594dSTobias Grosser /// @return The relation describing all tagged memory accesses. 
1924f384594dSTobias Grosser isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 1925f384594dSTobias Grosser isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 1926f384594dSTobias Grosser 1927f384594dSTobias Grosser for (auto &Stmt : *S) 1928f384594dSTobias Grosser for (auto &Acc : Stmt) 1929f384594dSTobias Grosser if (Acc->getType() == AccessTy) { 1930f384594dSTobias Grosser isl_map *Relation = Acc->getAccessRelation(); 1931f384594dSTobias Grosser Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 1932f384594dSTobias Grosser 1933f384594dSTobias Grosser isl_space *Space = isl_map_get_space(Relation); 1934f384594dSTobias Grosser Space = isl_space_range(Space); 1935f384594dSTobias Grosser Space = isl_space_from_range(Space); 19366293ba69STobias Grosser Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1937f384594dSTobias Grosser isl_map *Universe = isl_map_universe(Space); 1938f384594dSTobias Grosser Relation = isl_map_domain_product(Relation, Universe); 1939f384594dSTobias Grosser Accesses = isl_union_map_add_map(Accesses, Relation); 1940f384594dSTobias Grosser } 1941f384594dSTobias Grosser 1942f384594dSTobias Grosser return Accesses; 1943f384594dSTobias Grosser } 1944f384594dSTobias Grosser 1945f384594dSTobias Grosser /// Get the set of all read accesses, tagged with the access id. 1946f384594dSTobias Grosser /// 1947f384594dSTobias Grosser /// @see getTaggedAccesses 1948f384594dSTobias Grosser isl_union_map *getTaggedReads() { 1949f384594dSTobias Grosser return getTaggedAccesses(MemoryAccess::READ); 1950f384594dSTobias Grosser } 1951f384594dSTobias Grosser 1952f384594dSTobias Grosser /// Get the set of all may (and must) accesses, tagged with the access id. 
1953f384594dSTobias Grosser /// 1954f384594dSTobias Grosser /// @see getTaggedAccesses 1955f384594dSTobias Grosser isl_union_map *getTaggedMayWrites() { 1956f384594dSTobias Grosser return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 1957f384594dSTobias Grosser getTaggedAccesses(MemoryAccess::MUST_WRITE)); 1958f384594dSTobias Grosser } 1959f384594dSTobias Grosser 1960f384594dSTobias Grosser /// Get the set of all must accesses, tagged with the access id. 1961f384594dSTobias Grosser /// 1962f384594dSTobias Grosser /// @see getTaggedAccesses 1963f384594dSTobias Grosser isl_union_map *getTaggedMustWrites() { 1964f384594dSTobias Grosser return getTaggedAccesses(MemoryAccess::MUST_WRITE); 1965f384594dSTobias Grosser } 1966f384594dSTobias Grosser 1967aef5196fSTobias Grosser /// Collect parameter and array names as isl_ids. 1968aef5196fSTobias Grosser /// 1969aef5196fSTobias Grosser /// To reason about the different parameters and arrays used, ppcg requires 1970aef5196fSTobias Grosser /// a list of all isl_ids in use. As PPCG traditionally performs 1971aef5196fSTobias Grosser /// source-to-source compilation each of these isl_ids is mapped to the 1972aef5196fSTobias Grosser /// expression that represents it. As we do not have a corresponding 1973aef5196fSTobias Grosser /// expression in Polly, we just map each id to a 'zero' expression to match 1974aef5196fSTobias Grosser /// the data format that ppcg expects. 1975aef5196fSTobias Grosser /// 1976aef5196fSTobias Grosser /// @returns Retun a map from collected ids to 'zero' ast expressions. 
1977aef5196fSTobias Grosser __isl_give isl_id_to_ast_expr *getNames() { 1978aef5196fSTobias Grosser auto *Names = isl_id_to_ast_expr_alloc( 1979bd81a7eeSTobias Grosser S->getIslCtx(), 1980bd81a7eeSTobias Grosser S->getNumParams() + std::distance(S->array_begin(), S->array_end())); 1981aef5196fSTobias Grosser auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx())); 1982aef5196fSTobias Grosser auto *Space = S->getParamSpace(); 1983aef5196fSTobias Grosser 1984aef5196fSTobias Grosser for (int I = 0, E = S->getNumParams(); I < E; ++I) { 1985aef5196fSTobias Grosser isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I); 1986aef5196fSTobias Grosser Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1987aef5196fSTobias Grosser } 1988aef5196fSTobias Grosser 1989aef5196fSTobias Grosser for (auto &Array : S->arrays()) { 1990d7754a12SRoman Gareev auto Id = Array->getBasePtrId(); 1991aef5196fSTobias Grosser Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1992aef5196fSTobias Grosser } 1993aef5196fSTobias Grosser 1994aef5196fSTobias Grosser isl_space_free(Space); 1995aef5196fSTobias Grosser isl_ast_expr_free(Zero); 1996aef5196fSTobias Grosser 1997aef5196fSTobias Grosser return Names; 1998aef5196fSTobias Grosser } 1999aef5196fSTobias Grosser 2000e938517eSTobias Grosser /// Create a new PPCG scop from the current scop. 2001e938517eSTobias Grosser /// 2002f384594dSTobias Grosser /// The PPCG scop is initialized with data from the current polly::Scop. From 2003f384594dSTobias Grosser /// this initial data, the data-dependences in the PPCG scop are initialized. 2004f384594dSTobias Grosser /// We do not use Polly's dependence analysis for now, to ensure we match 2005f384594dSTobias Grosser /// the PPCG default behaviour more closely. 2006e938517eSTobias Grosser /// 2007e938517eSTobias Grosser /// @returns A new ppcg scop. 
2008e938517eSTobias Grosser ppcg_scop *createPPCGScop() { 2009e938517eSTobias Grosser auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); 2010e938517eSTobias Grosser 2011e938517eSTobias Grosser PPCGScop->options = createPPCGOptions(); 2012e938517eSTobias Grosser 2013e938517eSTobias Grosser PPCGScop->start = 0; 2014e938517eSTobias Grosser PPCGScop->end = 0; 2015e938517eSTobias Grosser 2016f384594dSTobias Grosser PPCGScop->context = S->getContext(); 2017f384594dSTobias Grosser PPCGScop->domain = S->getDomains(); 2018e938517eSTobias Grosser PPCGScop->call = nullptr; 2019f384594dSTobias Grosser PPCGScop->tagged_reads = getTaggedReads(); 2020f384594dSTobias Grosser PPCGScop->reads = S->getReads(); 2021e938517eSTobias Grosser PPCGScop->live_in = nullptr; 2022f384594dSTobias Grosser PPCGScop->tagged_may_writes = getTaggedMayWrites(); 2023f384594dSTobias Grosser PPCGScop->may_writes = S->getWrites(); 2024f384594dSTobias Grosser PPCGScop->tagged_must_writes = getTaggedMustWrites(); 2025f384594dSTobias Grosser PPCGScop->must_writes = S->getMustWrites(); 2026e938517eSTobias Grosser PPCGScop->live_out = nullptr; 2027f384594dSTobias Grosser PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace()); 2028e938517eSTobias Grosser PPCGScop->tagger = nullptr; 2029e938517eSTobias Grosser 2030e938517eSTobias Grosser PPCGScop->independence = nullptr; 2031e938517eSTobias Grosser PPCGScop->dep_flow = nullptr; 2032e938517eSTobias Grosser PPCGScop->tagged_dep_flow = nullptr; 2033e938517eSTobias Grosser PPCGScop->dep_false = nullptr; 2034e938517eSTobias Grosser PPCGScop->dep_forced = nullptr; 2035e938517eSTobias Grosser PPCGScop->dep_order = nullptr; 2036e938517eSTobias Grosser PPCGScop->tagged_dep_order = nullptr; 2037e938517eSTobias Grosser 2038f384594dSTobias Grosser PPCGScop->schedule = S->getScheduleTree(); 2039aef5196fSTobias Grosser PPCGScop->names = getNames(); 2040e938517eSTobias Grosser 2041e938517eSTobias Grosser PPCGScop->pet = nullptr; 2042e938517eSTobias Grosser 
2043f384594dSTobias Grosser compute_tagger(PPCGScop); 2044f384594dSTobias Grosser compute_dependences(PPCGScop); 2045f384594dSTobias Grosser 2046e938517eSTobias Grosser return PPCGScop; 2047e938517eSTobias Grosser } 2048e938517eSTobias Grosser 204960f63b49STobias Grosser /// Collect the array acesses in a statement. 205060f63b49STobias Grosser /// 205160f63b49STobias Grosser /// @param Stmt The statement for which to collect the accesses. 205260f63b49STobias Grosser /// 205360f63b49STobias Grosser /// @returns A list of array accesses. 205460f63b49STobias Grosser gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { 205560f63b49STobias Grosser gpu_stmt_access *Accesses = nullptr; 205660f63b49STobias Grosser 205760f63b49STobias Grosser for (MemoryAccess *Acc : Stmt) { 205860f63b49STobias Grosser auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access); 205960f63b49STobias Grosser Access->read = Acc->isRead(); 206060f63b49STobias Grosser Access->write = Acc->isWrite(); 206160f63b49STobias Grosser Access->access = Acc->getAccessRelation(); 206260f63b49STobias Grosser isl_space *Space = isl_map_get_space(Access->access); 206360f63b49STobias Grosser Space = isl_space_range(Space); 206460f63b49STobias Grosser Space = isl_space_from_range(Space); 20656293ba69STobias Grosser Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 206660f63b49STobias Grosser isl_map *Universe = isl_map_universe(Space); 206760f63b49STobias Grosser Access->tagged_access = 206860f63b49STobias Grosser isl_map_domain_product(Acc->getAccessRelation(), Universe); 2069b513b491STobias Grosser Access->exact_write = !Acc->isMayWrite(); 207060f63b49STobias Grosser Access->ref_id = Acc->getId(); 207160f63b49STobias Grosser Access->next = Accesses; 2072b513b491STobias Grosser Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); 207360f63b49STobias Grosser Accesses = Access; 207460f63b49STobias Grosser } 207560f63b49STobias Grosser 207660f63b49STobias Grosser return 
Accesses; 207760f63b49STobias Grosser } 207860f63b49STobias Grosser 207969b46751STobias Grosser /// Collect the list of GPU statements. 208069b46751STobias Grosser /// 208169b46751STobias Grosser /// Each statement has an id, a pointer to the underlying data structure, 208269b46751STobias Grosser /// as well as a list with all memory accesses. 208369b46751STobias Grosser /// 208469b46751STobias Grosser /// TODO: Initialize the list of memory accesses. 208569b46751STobias Grosser /// 208669b46751STobias Grosser /// @returns A linked-list of statements. 208769b46751STobias Grosser gpu_stmt *getStatements() { 208869b46751STobias Grosser gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, 208969b46751STobias Grosser std::distance(S->begin(), S->end())); 209069b46751STobias Grosser 209169b46751STobias Grosser int i = 0; 209269b46751STobias Grosser for (auto &Stmt : *S) { 209369b46751STobias Grosser gpu_stmt *GPUStmt = &Stmts[i]; 209469b46751STobias Grosser 209569b46751STobias Grosser GPUStmt->id = Stmt.getDomainId(); 209669b46751STobias Grosser 209769b46751STobias Grosser // We use the pet stmt pointer to keep track of the Polly statements. 209869b46751STobias Grosser GPUStmt->stmt = (pet_stmt *)&Stmt; 209960f63b49STobias Grosser GPUStmt->accesses = getStmtAccesses(Stmt); 210069b46751STobias Grosser i++; 210169b46751STobias Grosser } 210269b46751STobias Grosser 210369b46751STobias Grosser return Stmts; 210469b46751STobias Grosser } 210569b46751STobias Grosser 210660f63b49STobias Grosser /// Derive the extent of an array. 210760f63b49STobias Grosser /// 2108d58acf86STobias Grosser /// The extent of an array is the set of elements that are within the 2109d58acf86STobias Grosser /// accessed array. For the inner dimensions, the extent constraints are 2110d58acf86STobias Grosser /// 0 and the size of the corresponding array dimension. 
For the first 2111d58acf86STobias Grosser /// (outermost) dimension, the extent constraints are the minimal and maximal 2112d58acf86STobias Grosser /// subscript value for the first dimension. 211360f63b49STobias Grosser /// 211460f63b49STobias Grosser /// @param Array The array to derive the extent for. 211560f63b49STobias Grosser /// 211660f63b49STobias Grosser /// @returns An isl_set describing the extent of the array. 211760f63b49STobias Grosser __isl_give isl_set *getExtent(ScopArrayInfo *Array) { 2118d58acf86STobias Grosser unsigned NumDims = Array->getNumberOfDimensions(); 211960f63b49STobias Grosser isl_union_map *Accesses = S->getAccesses(); 212060f63b49STobias Grosser Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains()); 2121d58acf86STobias Grosser Accesses = isl_union_map_detect_equalities(Accesses); 212260f63b49STobias Grosser isl_union_set *AccessUSet = isl_union_map_range(Accesses); 2123d58acf86STobias Grosser AccessUSet = isl_union_set_coalesce(AccessUSet); 2124d58acf86STobias Grosser AccessUSet = isl_union_set_detect_equalities(AccessUSet); 2125d58acf86STobias Grosser AccessUSet = isl_union_set_coalesce(AccessUSet); 2126d58acf86STobias Grosser 2127d58acf86STobias Grosser if (isl_union_set_is_empty(AccessUSet)) { 2128d58acf86STobias Grosser isl_union_set_free(AccessUSet); 2129d58acf86STobias Grosser return isl_set_empty(Array->getSpace()); 2130d58acf86STobias Grosser } 2131d58acf86STobias Grosser 2132d58acf86STobias Grosser if (Array->getNumberOfDimensions() == 0) { 2133d58acf86STobias Grosser isl_union_set_free(AccessUSet); 2134d58acf86STobias Grosser return isl_set_universe(Array->getSpace()); 2135d58acf86STobias Grosser } 2136d58acf86STobias Grosser 213760f63b49STobias Grosser isl_set *AccessSet = 213860f63b49STobias Grosser isl_union_set_extract_set(AccessUSet, Array->getSpace()); 213960f63b49STobias Grosser 2140d58acf86STobias Grosser isl_union_set_free(AccessUSet); 2141d58acf86STobias Grosser isl_local_space *LS = 
isl_local_space_from_space(Array->getSpace()); 2142d58acf86STobias Grosser 2143d58acf86STobias Grosser isl_pw_aff *Val = 2144d58acf86STobias Grosser isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0)); 2145d58acf86STobias Grosser 2146d58acf86STobias Grosser isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0); 2147d58acf86STobias Grosser isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0); 2148d58acf86STobias Grosser OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in, 2149d58acf86STobias Grosser isl_pw_aff_dim(Val, isl_dim_in)); 2150d58acf86STobias Grosser OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in, 2151d58acf86STobias Grosser isl_pw_aff_dim(Val, isl_dim_in)); 2152d58acf86STobias Grosser OuterMin = 2153d58acf86STobias Grosser isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId()); 2154d58acf86STobias Grosser OuterMax = 2155d58acf86STobias Grosser isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId()); 2156d58acf86STobias Grosser 2157d58acf86STobias Grosser isl_set *Extent = isl_set_universe(Array->getSpace()); 2158d58acf86STobias Grosser 2159d58acf86STobias Grosser Extent = isl_set_intersect( 2160d58acf86STobias Grosser Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val))); 2161d58acf86STobias Grosser Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val)); 2162d58acf86STobias Grosser 2163d58acf86STobias Grosser for (unsigned i = 1; i < NumDims; ++i) 2164d58acf86STobias Grosser Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0); 2165d58acf86STobias Grosser 2166b7f68b8cSSiddharth Bhat for (unsigned i = 0; i < NumDims; ++i) { 2167d58acf86STobias Grosser isl_pw_aff *PwAff = 2168d58acf86STobias Grosser const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i)); 2169b7f68b8cSSiddharth Bhat 2170b7f68b8cSSiddharth Bhat // isl_pw_aff can be NULL for zero dimension. Only in the case of a 2171b7f68b8cSSiddharth Bhat // Fortran array will we have a legitimate dimension. 
2172b7f68b8cSSiddharth Bhat if (!PwAff) { 2173b7f68b8cSSiddharth Bhat assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension"); 2174b7f68b8cSSiddharth Bhat continue; 2175b7f68b8cSSiddharth Bhat } 2176b7f68b8cSSiddharth Bhat 2177d58acf86STobias Grosser isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain( 2178d58acf86STobias Grosser isl_local_space_from_space(Array->getSpace()), isl_dim_set, i)); 2179d58acf86STobias Grosser PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in, 2180d58acf86STobias Grosser isl_pw_aff_dim(Val, isl_dim_in)); 2181d58acf86STobias Grosser PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in, 2182d58acf86STobias Grosser isl_pw_aff_get_tuple_id(Val, isl_dim_in)); 2183d58acf86STobias Grosser auto *Set = isl_pw_aff_gt_set(PwAff, Val); 2184d58acf86STobias Grosser Extent = isl_set_intersect(Set, Extent); 2185d58acf86STobias Grosser } 2186d58acf86STobias Grosser 2187d58acf86STobias Grosser return Extent; 218860f63b49STobias Grosser } 218960f63b49STobias Grosser 219060f63b49STobias Grosser /// Derive the bounds of an array. 219160f63b49STobias Grosser /// 219260f63b49STobias Grosser /// For the first dimension we derive the bound of the array from the extent 219360f63b49STobias Grosser /// of this dimension. For inner dimensions we obtain their size directly from 219460f63b49STobias Grosser /// ScopArrayInfo. 219560f63b49STobias Grosser /// 219660f63b49STobias Grosser /// @param PPCGArray The array to compute bounds for. 219760f63b49STobias Grosser /// @param Array The polly array from which to take the information. 
219860f63b49STobias Grosser void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) { 219960f63b49STobias Grosser if (PPCGArray.n_index > 0) { 220002293ed7STobias Grosser if (isl_set_is_empty(PPCGArray.extent)) { 220102293ed7STobias Grosser isl_set *Dom = isl_set_copy(PPCGArray.extent); 220202293ed7STobias Grosser isl_local_space *LS = isl_local_space_from_space( 220302293ed7STobias Grosser isl_space_params(isl_set_get_space(Dom))); 220402293ed7STobias Grosser isl_set_free(Dom); 220502293ed7STobias Grosser isl_aff *Zero = isl_aff_zero_on_domain(LS); 220602293ed7STobias Grosser PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero); 220702293ed7STobias Grosser } else { 220860f63b49STobias Grosser isl_set *Dom = isl_set_copy(PPCGArray.extent); 220960f63b49STobias Grosser Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1); 221060f63b49STobias Grosser isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0); 221160f63b49STobias Grosser isl_set_free(Dom); 221260f63b49STobias Grosser Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound)); 221302293ed7STobias Grosser isl_local_space *LS = 221402293ed7STobias Grosser isl_local_space_from_space(isl_set_get_space(Dom)); 221560f63b49STobias Grosser isl_aff *One = isl_aff_zero_on_domain(LS); 221660f63b49STobias Grosser One = isl_aff_add_constant_si(One, 1); 221760f63b49STobias Grosser Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One)); 221860f63b49STobias Grosser Bound = isl_pw_aff_gist(Bound, S->getContext()); 221960f63b49STobias Grosser PPCGArray.bound[0] = Bound; 222060f63b49STobias Grosser } 222102293ed7STobias Grosser } 222260f63b49STobias Grosser 222360f63b49STobias Grosser for (unsigned i = 1; i < PPCGArray.n_index; ++i) { 222460f63b49STobias Grosser isl_pw_aff *Bound = Array->getDimensionSizePw(i); 222560f63b49STobias Grosser auto LS = isl_pw_aff_get_domain_space(Bound); 222660f63b49STobias Grosser auto Aff = isl_multi_aff_zero(LS); 222760f63b49STobias Grosser Bound = 
isl_pw_aff_pullback_multi_aff(Bound, Aff); 222860f63b49STobias Grosser PPCGArray.bound[i] = Bound; 222960f63b49STobias Grosser } 223060f63b49STobias Grosser } 223160f63b49STobias Grosser 223260f63b49STobias Grosser /// Create the arrays for @p PPCGProg. 223360f63b49STobias Grosser /// 223460f63b49STobias Grosser /// @param PPCGProg The program to compute the arrays for. 223560f63b49STobias Grosser void createArrays(gpu_prog *PPCGProg) { 223660f63b49STobias Grosser int i = 0; 2237d7754a12SRoman Gareev for (auto &Array : S->arrays()) { 223860f63b49STobias Grosser std::string TypeName; 223960f63b49STobias Grosser raw_string_ostream OS(TypeName); 224060f63b49STobias Grosser 224160f63b49STobias Grosser OS << *Array->getElementType(); 224260f63b49STobias Grosser TypeName = OS.str(); 224360f63b49STobias Grosser 224460f63b49STobias Grosser gpu_array_info &PPCGArray = PPCGProg->array[i]; 224560f63b49STobias Grosser 224660f63b49STobias Grosser PPCGArray.space = Array->getSpace(); 224760f63b49STobias Grosser PPCGArray.type = strdup(TypeName.c_str()); 224860f63b49STobias Grosser PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8; 224960f63b49STobias Grosser PPCGArray.name = strdup(Array->getName().c_str()); 225060f63b49STobias Grosser PPCGArray.extent = nullptr; 225160f63b49STobias Grosser PPCGArray.n_index = Array->getNumberOfDimensions(); 225260f63b49STobias Grosser PPCGArray.bound = 225360f63b49STobias Grosser isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index); 225460f63b49STobias Grosser PPCGArray.extent = getExtent(Array); 225560f63b49STobias Grosser PPCGArray.n_ref = 0; 225660f63b49STobias Grosser PPCGArray.refs = nullptr; 225760f63b49STobias Grosser PPCGArray.accessed = true; 2258fe74a7a1STobias Grosser PPCGArray.read_only_scalar = 2259fe74a7a1STobias Grosser Array->isReadOnly() && Array->getNumberOfDimensions() == 0; 226060f63b49STobias Grosser PPCGArray.has_compound_element = false; 226160f63b49STobias Grosser PPCGArray.local = false; 
226260f63b49STobias Grosser PPCGArray.declare_local = false; 226360f63b49STobias Grosser PPCGArray.global = false; 226460f63b49STobias Grosser PPCGArray.linearize = false; 226560f63b49STobias Grosser PPCGArray.dep_order = nullptr; 226613c78e4dSTobias Grosser PPCGArray.user = Array; 226760f63b49STobias Grosser 226860f63b49STobias Grosser setArrayBounds(PPCGArray, Array); 22692d010dafSTobias Grosser i++; 2270b9fc860aSTobias Grosser 2271b9fc860aSTobias Grosser collect_references(PPCGProg, &PPCGArray); 227260f63b49STobias Grosser } 227360f63b49STobias Grosser } 227460f63b49STobias Grosser 227560f63b49STobias Grosser /// Create an identity map between the arrays in the scop. 227660f63b49STobias Grosser /// 227760f63b49STobias Grosser /// @returns An identity map between the arrays in the scop. 227860f63b49STobias Grosser isl_union_map *getArrayIdentity() { 227960f63b49STobias Grosser isl_union_map *Maps = isl_union_map_empty(S->getParamSpace()); 228060f63b49STobias Grosser 2281d7754a12SRoman Gareev for (auto &Array : S->arrays()) { 228260f63b49STobias Grosser isl_space *Space = Array->getSpace(); 228360f63b49STobias Grosser Space = isl_space_map_from_set(Space); 228460f63b49STobias Grosser isl_map *Identity = isl_map_identity(Space); 228560f63b49STobias Grosser Maps = isl_union_map_add_map(Maps, Identity); 228660f63b49STobias Grosser } 228760f63b49STobias Grosser 228860f63b49STobias Grosser return Maps; 228960f63b49STobias Grosser } 229060f63b49STobias Grosser 2291e938517eSTobias Grosser /// Create a default-initialized PPCG GPU program. 2292e938517eSTobias Grosser /// 2293e938517eSTobias Grosser /// @returns A new gpu grogram description. 
2294e938517eSTobias Grosser gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { 2295e938517eSTobias Grosser 2296e938517eSTobias Grosser if (!PPCGScop) 2297e938517eSTobias Grosser return nullptr; 2298e938517eSTobias Grosser 2299e938517eSTobias Grosser auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog); 2300e938517eSTobias Grosser 2301e938517eSTobias Grosser PPCGProg->ctx = S->getIslCtx(); 2302e938517eSTobias Grosser PPCGProg->scop = PPCGScop; 2303aef5196fSTobias Grosser PPCGProg->context = isl_set_copy(PPCGScop->context); 230460f63b49STobias Grosser PPCGProg->read = isl_union_map_copy(PPCGScop->reads); 230560f63b49STobias Grosser PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); 230660f63b49STobias Grosser PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); 230760f63b49STobias Grosser PPCGProg->tagged_must_kill = 230860f63b49STobias Grosser isl_union_map_copy(PPCGScop->tagged_must_kills); 230960f63b49STobias Grosser PPCGProg->to_inner = getArrayIdentity(); 231060f63b49STobias Grosser PPCGProg->to_outer = getArrayIdentity(); 2311e938517eSTobias Grosser PPCGProg->any_to_outer = nullptr; 2312e938517eSTobias Grosser PPCGProg->array_order = nullptr; 231369b46751STobias Grosser PPCGProg->n_stmts = std::distance(S->begin(), S->end()); 231469b46751STobias Grosser PPCGProg->stmts = getStatements(); 231560f63b49STobias Grosser PPCGProg->n_array = std::distance(S->array_begin(), S->array_end()); 231660f63b49STobias Grosser PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info, 231760f63b49STobias Grosser PPCGProg->n_array); 231860f63b49STobias Grosser 231960f63b49STobias Grosser createArrays(PPCGProg); 2320e938517eSTobias Grosser 2321d58acf86STobias Grosser PPCGProg->may_persist = compute_may_persist(PPCGProg); 2322d58acf86STobias Grosser 2323e938517eSTobias Grosser return PPCGProg; 2324e938517eSTobias Grosser } 2325e938517eSTobias Grosser 232669b46751STobias Grosser struct PrintGPUUserData { 232769b46751STobias Grosser 
struct cuda_info *CudaInfo; 232869b46751STobias Grosser struct gpu_prog *PPCGProg; 232969b46751STobias Grosser std::vector<ppcg_kernel *> Kernels; 233069b46751STobias Grosser }; 233169b46751STobias Grosser 233269b46751STobias Grosser /// Print a user statement node in the host code. 233369b46751STobias Grosser /// 233469b46751STobias Grosser /// We use ppcg's printing facilities to print the actual statement and 233569b46751STobias Grosser /// additionally build up a list of all kernels that are encountered in the 233669b46751STobias Grosser /// host ast. 233769b46751STobias Grosser /// 233869b46751STobias Grosser /// @param P The printer to print to 233969b46751STobias Grosser /// @param Options The printing options to use 234069b46751STobias Grosser /// @param Node The node to print 234169b46751STobias Grosser /// @param User A user pointer to carry additional data. This pointer is 234269b46751STobias Grosser /// expected to be of type PrintGPUUserData. 234369b46751STobias Grosser /// 234469b46751STobias Grosser /// @returns A printer to which the output has been printed. 234569b46751STobias Grosser static __isl_give isl_printer * 234669b46751STobias Grosser printHostUser(__isl_take isl_printer *P, 234769b46751STobias Grosser __isl_take isl_ast_print_options *Options, 234869b46751STobias Grosser __isl_take isl_ast_node *Node, void *User) { 234969b46751STobias Grosser auto Data = (struct PrintGPUUserData *)User; 235069b46751STobias Grosser auto Id = isl_ast_node_get_annotation(Node); 235169b46751STobias Grosser 235269b46751STobias Grosser if (Id) { 235320251734STobias Grosser bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 235420251734STobias Grosser 235520251734STobias Grosser // If this is a user statement, format it ourselves as ppcg would 235620251734STobias Grosser // otherwise try to call pet functionality that is not available in 235720251734STobias Grosser // Polly. 
235820251734STobias Grosser if (IsUser) { 235920251734STobias Grosser P = isl_printer_start_line(P); 236020251734STobias Grosser P = isl_printer_print_ast_node(P, Node); 236120251734STobias Grosser P = isl_printer_end_line(P); 236220251734STobias Grosser isl_id_free(Id); 236320251734STobias Grosser isl_ast_print_options_free(Options); 236420251734STobias Grosser return P; 236520251734STobias Grosser } 236620251734STobias Grosser 236769b46751STobias Grosser auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 236869b46751STobias Grosser isl_id_free(Id); 236969b46751STobias Grosser Data->Kernels.push_back(Kernel); 237069b46751STobias Grosser } 237169b46751STobias Grosser 237269b46751STobias Grosser return print_host_user(P, Options, Node, User); 237369b46751STobias Grosser } 237469b46751STobias Grosser 237569b46751STobias Grosser /// Print C code corresponding to the control flow in @p Kernel. 237669b46751STobias Grosser /// 237769b46751STobias Grosser /// @param Kernel The kernel to print 237869b46751STobias Grosser void printKernel(ppcg_kernel *Kernel) { 237969b46751STobias Grosser auto *P = isl_printer_to_str(S->getIslCtx()); 238069b46751STobias Grosser P = isl_printer_set_output_format(P, ISL_FORMAT_C); 238169b46751STobias Grosser auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 238269b46751STobias Grosser P = isl_ast_node_print(Kernel->tree, P, Options); 238369b46751STobias Grosser char *String = isl_printer_get_str(P); 238469b46751STobias Grosser printf("%s\n", String); 238569b46751STobias Grosser free(String); 238669b46751STobias Grosser isl_printer_free(P); 238769b46751STobias Grosser } 238869b46751STobias Grosser 238969b46751STobias Grosser /// Print C code corresponding to the GPU code described by @p Tree. 239069b46751STobias Grosser /// 239169b46751STobias Grosser /// @param Tree An AST describing GPU code 239269b46751STobias Grosser /// @param PPCGProg The PPCG program from which @Tree has been constructed. 
239369b46751STobias Grosser void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 239469b46751STobias Grosser auto *P = isl_printer_to_str(S->getIslCtx()); 239569b46751STobias Grosser P = isl_printer_set_output_format(P, ISL_FORMAT_C); 239669b46751STobias Grosser 239769b46751STobias Grosser PrintGPUUserData Data; 239869b46751STobias Grosser Data.PPCGProg = PPCGProg; 239969b46751STobias Grosser 240069b46751STobias Grosser auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 240169b46751STobias Grosser Options = 240269b46751STobias Grosser isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 240369b46751STobias Grosser P = isl_ast_node_print(Tree, P, Options); 240469b46751STobias Grosser char *String = isl_printer_get_str(P); 240569b46751STobias Grosser printf("# host\n"); 240669b46751STobias Grosser printf("%s\n", String); 240769b46751STobias Grosser free(String); 240869b46751STobias Grosser isl_printer_free(P); 240969b46751STobias Grosser 241069b46751STobias Grosser for (auto Kernel : Data.Kernels) { 241169b46751STobias Grosser printf("# kernel%d\n", Kernel->id); 241269b46751STobias Grosser printKernel(Kernel); 241369b46751STobias Grosser } 241469b46751STobias Grosser } 241569b46751STobias Grosser 2416f384594dSTobias Grosser // Generate a GPU program using PPCG. 2417f384594dSTobias Grosser // 2418f384594dSTobias Grosser // GPU mapping consists of multiple steps: 2419f384594dSTobias Grosser // 2420f384594dSTobias Grosser // 1) Compute new schedule for the program. 2421f384594dSTobias Grosser // 2) Map schedule to GPU (TODO) 2422f384594dSTobias Grosser // 3) Generate code for new schedule (TODO) 2423f384594dSTobias Grosser // 2424f384594dSTobias Grosser // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 2425f384594dSTobias Grosser // is mostly CPU specific. Instead, we use PPCG's GPU code generation 2426f384594dSTobias Grosser // strategy directly from this pass. 
2427f384594dSTobias Grosser gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 2428f384594dSTobias Grosser 2429f384594dSTobias Grosser auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 2430f384594dSTobias Grosser 2431f384594dSTobias Grosser PPCGGen->ctx = S->getIslCtx(); 2432f384594dSTobias Grosser PPCGGen->options = PPCGScop->options; 2433f384594dSTobias Grosser PPCGGen->print = nullptr; 2434f384594dSTobias Grosser PPCGGen->print_user = nullptr; 243560c60025STobias Grosser PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 2436f384594dSTobias Grosser PPCGGen->prog = PPCGProg; 2437f384594dSTobias Grosser PPCGGen->tree = nullptr; 2438f384594dSTobias Grosser PPCGGen->types.n = 0; 2439f384594dSTobias Grosser PPCGGen->types.name = nullptr; 2440f384594dSTobias Grosser PPCGGen->sizes = nullptr; 2441f384594dSTobias Grosser PPCGGen->used_sizes = nullptr; 2442f384594dSTobias Grosser PPCGGen->kernel_id = 0; 2443f384594dSTobias Grosser 2444f384594dSTobias Grosser // Set scheduling strategy to same strategy PPCG is using. 
2445f384594dSTobias Grosser isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 2446f384594dSTobias Grosser isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 24472341fe9eSTobias Grosser isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 2448f384594dSTobias Grosser 2449f384594dSTobias Grosser isl_schedule *Schedule = get_schedule(PPCGGen); 2450f384594dSTobias Grosser 2451aef5196fSTobias Grosser int has_permutable = has_any_permutable_node(Schedule); 2452aef5196fSTobias Grosser 245369b46751STobias Grosser if (!has_permutable || has_permutable < 0) { 2454aef5196fSTobias Grosser Schedule = isl_schedule_free(Schedule); 245569b46751STobias Grosser } else { 2456aef5196fSTobias Grosser Schedule = map_to_device(PPCGGen, Schedule); 245769b46751STobias Grosser PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 245869b46751STobias Grosser } 2459aef5196fSTobias Grosser 2460f384594dSTobias Grosser if (DumpSchedule) { 2461f384594dSTobias Grosser isl_printer *P = isl_printer_to_str(S->getIslCtx()); 2462f384594dSTobias Grosser P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 2463f384594dSTobias Grosser P = isl_printer_print_str(P, "Schedule\n"); 2464f384594dSTobias Grosser P = isl_printer_print_str(P, "========\n"); 2465f384594dSTobias Grosser if (Schedule) 2466f384594dSTobias Grosser P = isl_printer_print_schedule(P, Schedule); 2467f384594dSTobias Grosser else 2468f384594dSTobias Grosser P = isl_printer_print_str(P, "No schedule found\n"); 2469f384594dSTobias Grosser 2470f384594dSTobias Grosser printf("%s\n", isl_printer_get_str(P)); 2471f384594dSTobias Grosser isl_printer_free(P); 2472f384594dSTobias Grosser } 2473f384594dSTobias Grosser 247469b46751STobias Grosser if (DumpCode) { 247569b46751STobias Grosser printf("Code\n"); 247669b46751STobias Grosser printf("====\n"); 247769b46751STobias Grosser if (PPCGGen->tree) 247869b46751STobias Grosser printGPUTree(PPCGGen->tree, PPCGProg); 247969b46751STobias Grosser else 
248069b46751STobias Grosser printf("No code generated\n"); 248169b46751STobias Grosser } 248269b46751STobias Grosser 2483f384594dSTobias Grosser isl_schedule_free(Schedule); 2484f384594dSTobias Grosser 2485f384594dSTobias Grosser return PPCGGen; 2486f384594dSTobias Grosser } 2487f384594dSTobias Grosser 2488f384594dSTobias Grosser /// Free gpu_gen structure. 2489f384594dSTobias Grosser /// 2490f384594dSTobias Grosser /// @param PPCGGen The ppcg_gen object to free. 2491f384594dSTobias Grosser void freePPCGGen(gpu_gen *PPCGGen) { 2492f384594dSTobias Grosser isl_ast_node_free(PPCGGen->tree); 2493f384594dSTobias Grosser isl_union_map_free(PPCGGen->sizes); 2494f384594dSTobias Grosser isl_union_map_free(PPCGGen->used_sizes); 2495f384594dSTobias Grosser free(PPCGGen); 2496f384594dSTobias Grosser } 2497f384594dSTobias Grosser 2498b307ed4dSTobias Grosser /// Free the options in the ppcg scop structure. 2499b307ed4dSTobias Grosser /// 2500b307ed4dSTobias Grosser /// ppcg is not freeing these options for us. To avoid leaks we do this 2501b307ed4dSTobias Grosser /// ourselves. 2502b307ed4dSTobias Grosser /// 2503b307ed4dSTobias Grosser /// @param PPCGScop The scop referencing the options to free. 2504b307ed4dSTobias Grosser void freeOptions(ppcg_scop *PPCGScop) { 2505b307ed4dSTobias Grosser free(PPCGScop->options->debug); 2506b307ed4dSTobias Grosser PPCGScop->options->debug = nullptr; 2507b307ed4dSTobias Grosser free(PPCGScop->options); 2508b307ed4dSTobias Grosser PPCGScop->options = nullptr; 2509b307ed4dSTobias Grosser } 2510b307ed4dSTobias Grosser 251182f2af35STobias Grosser /// Approximate the number of points in the set. 251282f2af35STobias Grosser /// 251382f2af35STobias Grosser /// This function returns an ast expression that overapproximates the number 251482f2af35STobias Grosser /// of points in an isl set through the rectangular hull surrounding this set. 251582f2af35STobias Grosser /// 251682f2af35STobias Grosser /// @param Set The set to count. 
251782f2af35STobias Grosser /// @param Build The isl ast build object to use for creating the ast 251882f2af35STobias Grosser /// expression. 251982f2af35STobias Grosser /// 252082f2af35STobias Grosser /// @returns An approximation of the number of points in the set. 252182f2af35STobias Grosser __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set, 252282f2af35STobias Grosser __isl_keep isl_ast_build *Build) { 252382f2af35STobias Grosser 252482f2af35STobias Grosser isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1); 252582f2af35STobias Grosser auto *Expr = isl_ast_expr_from_val(isl_val_copy(One)); 252682f2af35STobias Grosser 252782f2af35STobias Grosser isl_space *Space = isl_set_get_space(Set); 252882f2af35STobias Grosser Space = isl_space_params(Space); 252982f2af35STobias Grosser auto *Univ = isl_set_universe(Space); 253082f2af35STobias Grosser isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One); 253182f2af35STobias Grosser 253282f2af35STobias Grosser for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) { 253382f2af35STobias Grosser isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i); 253482f2af35STobias Grosser isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i); 253582f2af35STobias Grosser isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min); 253682f2af35STobias Grosser DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff)); 253782f2af35STobias Grosser auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize); 253882f2af35STobias Grosser Expr = isl_ast_expr_mul(Expr, DimSizeExpr); 253982f2af35STobias Grosser } 254082f2af35STobias Grosser 254182f2af35STobias Grosser isl_set_free(Set); 254282f2af35STobias Grosser isl_pw_aff_free(OneAff); 254382f2af35STobias Grosser 254482f2af35STobias Grosser return Expr; 254582f2af35STobias Grosser } 254682f2af35STobias Grosser 254782f2af35STobias Grosser /// Approximate a number of dynamic instructions executed by a given 254882f2af35STobias Grosser /// statement. 
254982f2af35STobias Grosser /// 255082f2af35STobias Grosser /// @param Stmt The statement for which to compute the number of dynamic 255182f2af35STobias Grosser /// instructions. 255282f2af35STobias Grosser /// @param Build The isl ast build object to use for creating the ast 255382f2af35STobias Grosser /// expression. 255482f2af35STobias Grosser /// @returns An approximation of the number of dynamic instructions executed 255582f2af35STobias Grosser /// by @p Stmt. 255682f2af35STobias Grosser __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt, 255782f2af35STobias Grosser __isl_keep isl_ast_build *Build) { 255882f2af35STobias Grosser auto Iterations = approxPointsInSet(Stmt.getDomain(), Build); 255982f2af35STobias Grosser 256082f2af35STobias Grosser long InstCount = 0; 256182f2af35STobias Grosser 256282f2af35STobias Grosser if (Stmt.isBlockStmt()) { 256382f2af35STobias Grosser auto *BB = Stmt.getBasicBlock(); 256482f2af35STobias Grosser InstCount = std::distance(BB->begin(), BB->end()); 256582f2af35STobias Grosser } else { 256682f2af35STobias Grosser auto *R = Stmt.getRegion(); 256782f2af35STobias Grosser 256882f2af35STobias Grosser for (auto *BB : R->blocks()) { 256982f2af35STobias Grosser InstCount += std::distance(BB->begin(), BB->end()); 257082f2af35STobias Grosser } 257182f2af35STobias Grosser } 257282f2af35STobias Grosser 257382f2af35STobias Grosser isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount); 257482f2af35STobias Grosser auto *InstExpr = isl_ast_expr_from_val(InstVal); 257582f2af35STobias Grosser return isl_ast_expr_mul(InstExpr, Iterations); 257682f2af35STobias Grosser } 257782f2af35STobias Grosser 257882f2af35STobias Grosser /// Approximate dynamic instructions executed in scop. 257982f2af35STobias Grosser /// 258082f2af35STobias Grosser /// @param S The scop for which to approximate dynamic instructions. 
258182f2af35STobias Grosser /// @param Build The isl ast build object to use for creating the ast 258282f2af35STobias Grosser /// expression. 258382f2af35STobias Grosser /// @returns An approximation of the number of dynamic instructions executed 258482f2af35STobias Grosser /// in @p S. 258582f2af35STobias Grosser __isl_give isl_ast_expr * 258682f2af35STobias Grosser getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) { 258782f2af35STobias Grosser isl_ast_expr *Instructions; 258882f2af35STobias Grosser 258982f2af35STobias Grosser isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0); 259082f2af35STobias Grosser Instructions = isl_ast_expr_from_val(Zero); 259182f2af35STobias Grosser 259282f2af35STobias Grosser for (ScopStmt &Stmt : S) { 259382f2af35STobias Grosser isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build); 259482f2af35STobias Grosser Instructions = isl_ast_expr_add(Instructions, StmtInstructions); 259582f2af35STobias Grosser } 259682f2af35STobias Grosser return Instructions; 259782f2af35STobias Grosser } 259882f2af35STobias Grosser 259982f2af35STobias Grosser /// Create a check that ensures sufficient compute in scop. 260082f2af35STobias Grosser /// 260182f2af35STobias Grosser /// @param S The scop for which to ensure sufficient compute. 260282f2af35STobias Grosser /// @param Build The isl ast build object to use for creating the ast 260382f2af35STobias Grosser /// expression. 260482f2af35STobias Grosser /// @returns An expression that evaluates to TRUE in case of sufficient 260582f2af35STobias Grosser /// compute and to FALSE, otherwise. 
260682f2af35STobias Grosser __isl_give isl_ast_expr * 260782f2af35STobias Grosser createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) { 260882f2af35STobias Grosser auto Iterations = getNumberOfIterations(S, Build); 260982f2af35STobias Grosser auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute); 261082f2af35STobias Grosser auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal); 261182f2af35STobias Grosser return isl_ast_expr_ge(Iterations, MinComputeExpr); 261282f2af35STobias Grosser } 261382f2af35STobias Grosser 261438fc0aedSTobias Grosser /// Generate code for a given GPU AST described by @p Root. 261538fc0aedSTobias Grosser /// 261632837fe3STobias Grosser /// @param Root An isl_ast_node pointing to the root of the GPU AST. 261732837fe3STobias Grosser /// @param Prog The GPU Program to generate code for. 261832837fe3STobias Grosser void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { 261938fc0aedSTobias Grosser ScopAnnotator Annotator; 262038fc0aedSTobias Grosser Annotator.buildAliasScopes(*S); 262138fc0aedSTobias Grosser 262238fc0aedSTobias Grosser Region *R = &S->getRegion(); 262338fc0aedSTobias Grosser 262438fc0aedSTobias Grosser simplifyRegion(R, DT, LI, RI); 262538fc0aedSTobias Grosser 262638fc0aedSTobias Grosser BasicBlock *EnteringBB = R->getEnteringBlock(); 262738fc0aedSTobias Grosser 262838fc0aedSTobias Grosser PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator); 262938fc0aedSTobias Grosser 263038fc0aedSTobias Grosser // Only build the run-time condition and parameters _after_ having 263138fc0aedSTobias Grosser // introduced the conditional branch. 
This is important as the conditional 263238fc0aedSTobias Grosser // branch will guard the original scop from new induction variables that 263338fc0aedSTobias Grosser // the SCEVExpander may introduce while code generating the parameters and 263438fc0aedSTobias Grosser // which may introduce scalar dependences that prevent us from correctly 263538fc0aedSTobias Grosser // code generating this scop. 263638fc0aedSTobias Grosser BasicBlock *StartBlock = 26372d950f36SPhilip Pfaffe executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); 263838fc0aedSTobias Grosser 26392d950f36SPhilip Pfaffe GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, 264017f01968SSiddharth Bhat StartBlock, Prog, Runtime, Architecture); 2641acf80064SEli Friedman 264238fc0aedSTobias Grosser // TODO: Handle LICM 264338fc0aedSTobias Grosser auto SplitBlock = StartBlock->getSinglePredecessor(); 264438fc0aedSTobias Grosser Builder.SetInsertPoint(SplitBlock->getTerminator()); 264538fc0aedSTobias Grosser NodeBuilder.addParameters(S->getContext()); 2646cb1aef8dSTobias Grosser 2647cb1aef8dSTobias Grosser isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx()); 2648*2b852e2eSPhilip Pfaffe isl_ast_expr *Condition = IslAst::buildRunCondition(*S, Build); 264982f2af35STobias Grosser isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build); 265082f2af35STobias Grosser Condition = isl_ast_expr_and(Condition, SufficientCompute); 2651cb1aef8dSTobias Grosser isl_ast_build_free(Build); 2652cb1aef8dSTobias Grosser 2653cb1aef8dSTobias Grosser Value *RTC = NodeBuilder.createRTC(Condition); 2654cb1aef8dSTobias Grosser Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC); 2655cb1aef8dSTobias Grosser 265638fc0aedSTobias Grosser Builder.SetInsertPoint(&*StartBlock->begin()); 2657fa7b0802STobias Grosser 2658fa7b0802STobias Grosser NodeBuilder.initializeAfterRTH(); 265938fc0aedSTobias Grosser NodeBuilder.create(Root); 26608ed5e599STobias Grosser NodeBuilder.finalize(); 
26615857b701STobias Grosser 2662bc653f20STobias Grosser /// In case a sequential kernel has more surrounding loops as any parallel 2663bc653f20STobias Grosser /// kernel, the SCoP is probably mostly sequential. Hence, there is no 2664de244eb4STobias Grosser /// point in running it on a GPU. 2665bc653f20STobias Grosser if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel) 2666bc653f20STobias Grosser SplitBlock->getTerminator()->setOperand(0, Builder.getFalse()); 2667bc653f20STobias Grosser 26685857b701STobias Grosser if (!NodeBuilder.BuildSuccessful) 26695857b701STobias Grosser SplitBlock->getTerminator()->setOperand(0, Builder.getFalse()); 267038fc0aedSTobias Grosser } 267138fc0aedSTobias Grosser 2672e938517eSTobias Grosser bool runOnScop(Scop &CurrentScop) override { 2673e938517eSTobias Grosser S = &CurrentScop; 267438fc0aedSTobias Grosser LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 267538fc0aedSTobias Grosser DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 267638fc0aedSTobias Grosser SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 26777b5a4dfdSTobias Grosser DL = &S->getRegion().getEntry()->getModule()->getDataLayout(); 267838fc0aedSTobias Grosser RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); 2679e938517eSTobias Grosser 26802d58a64eSTobias Grosser // We currently do not support scops with invariant loads. 
26812d58a64eSTobias Grosser if (S->hasInvariantAccesses()) 26822d58a64eSTobias Grosser return false; 26832d58a64eSTobias Grosser 2684e938517eSTobias Grosser auto PPCGScop = createPPCGScop(); 2685e938517eSTobias Grosser auto PPCGProg = createPPCGProg(PPCGScop); 2686f384594dSTobias Grosser auto PPCGGen = generateGPU(PPCGScop, PPCGProg); 268738fc0aedSTobias Grosser 268838fc0aedSTobias Grosser if (PPCGGen->tree) 268932837fe3STobias Grosser generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); 269038fc0aedSTobias Grosser 2691b307ed4dSTobias Grosser freeOptions(PPCGScop); 2692f384594dSTobias Grosser freePPCGGen(PPCGGen); 2693e938517eSTobias Grosser gpu_prog_free(PPCGProg); 2694e938517eSTobias Grosser ppcg_scop_free(PPCGScop); 2695e938517eSTobias Grosser 2696e938517eSTobias Grosser return true; 2697e938517eSTobias Grosser } 26989dfe4e7cSTobias Grosser 26999dfe4e7cSTobias Grosser void printScop(raw_ostream &, Scop &) const override {} 27009dfe4e7cSTobias Grosser 27019dfe4e7cSTobias Grosser void getAnalysisUsage(AnalysisUsage &AU) const override { 27029dfe4e7cSTobias Grosser AU.addRequired<DominatorTreeWrapperPass>(); 27039dfe4e7cSTobias Grosser AU.addRequired<RegionInfoPass>(); 27049dfe4e7cSTobias Grosser AU.addRequired<ScalarEvolutionWrapperPass>(); 27055cc87e3aSPhilip Pfaffe AU.addRequired<ScopDetectionWrapperPass>(); 27069dfe4e7cSTobias Grosser AU.addRequired<ScopInfoRegionPass>(); 27079dfe4e7cSTobias Grosser AU.addRequired<LoopInfoWrapperPass>(); 27089dfe4e7cSTobias Grosser 27099dfe4e7cSTobias Grosser AU.addPreserved<AAResultsWrapperPass>(); 27109dfe4e7cSTobias Grosser AU.addPreserved<BasicAAWrapperPass>(); 27119dfe4e7cSTobias Grosser AU.addPreserved<LoopInfoWrapperPass>(); 27129dfe4e7cSTobias Grosser AU.addPreserved<DominatorTreeWrapperPass>(); 27139dfe4e7cSTobias Grosser AU.addPreserved<GlobalsAAWrapperPass>(); 27145cc87e3aSPhilip Pfaffe AU.addPreserved<ScopDetectionWrapperPass>(); 27159dfe4e7cSTobias Grosser AU.addPreserved<ScalarEvolutionWrapperPass>(); 
27169dfe4e7cSTobias Grosser AU.addPreserved<SCEVAAWrapperPass>(); 27179dfe4e7cSTobias Grosser 27189dfe4e7cSTobias Grosser // FIXME: We do not yet add regions for the newly generated code to the 27199dfe4e7cSTobias Grosser // region tree. 27209dfe4e7cSTobias Grosser AU.addPreserved<RegionInfoPass>(); 27219dfe4e7cSTobias Grosser AU.addPreserved<ScopInfoRegionPass>(); 27229dfe4e7cSTobias Grosser } 27239dfe4e7cSTobias Grosser }; 272424222c73STobias Grosser } // namespace 27259dfe4e7cSTobias Grosser 27269dfe4e7cSTobias Grosser char PPCGCodeGeneration::ID = 1; 27279dfe4e7cSTobias Grosser 272817f01968SSiddharth Bhat Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { 272917f01968SSiddharth Bhat PPCGCodeGeneration *generator = new PPCGCodeGeneration(); 273017f01968SSiddharth Bhat generator->Runtime = Runtime; 273117f01968SSiddharth Bhat generator->Architecture = Arch; 273217f01968SSiddharth Bhat return generator; 273317f01968SSiddharth Bhat } 27349dfe4e7cSTobias Grosser 27359dfe4e7cSTobias Grosser INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", 27369dfe4e7cSTobias Grosser "Polly - Apply PPCG translation to SCOP", false, false) 27379dfe4e7cSTobias Grosser INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 27389dfe4e7cSTobias Grosser INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 27399dfe4e7cSTobias Grosser INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 27409dfe4e7cSTobias Grosser INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); 27419dfe4e7cSTobias Grosser INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); 27425cc87e3aSPhilip Pfaffe INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); 27439dfe4e7cSTobias Grosser INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", 27449dfe4e7cSTobias Grosser "Polly - Apply PPCG translation to SCOP", false, false) 2745