1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg 11 // GPU mapping strategy. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "polly/CodeGen/PPCGCodeGeneration.h" 16 #include "polly/CodeGen/IslAst.h" 17 #include "polly/CodeGen/IslNodeBuilder.h" 18 #include "polly/CodeGen/Utils.h" 19 #include "polly/DependenceInfo.h" 20 #include "polly/LinkAllPasses.h" 21 #include "polly/Options.h" 22 #include "polly/ScopDetection.h" 23 #include "polly/ScopInfo.h" 24 #include "polly/Support/SCEVValidator.h" 25 #include "llvm/ADT/PostOrderIterator.h" 26 #include "llvm/Analysis/AliasAnalysis.h" 27 #include "llvm/Analysis/BasicAliasAnalysis.h" 28 #include "llvm/Analysis/GlobalsModRef.h" 29 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 30 #include "llvm/Analysis/TargetLibraryInfo.h" 31 #include "llvm/Analysis/TargetTransformInfo.h" 32 #include "llvm/IR/LegacyPassManager.h" 33 #include "llvm/IR/Verifier.h" 34 #include "llvm/Support/TargetRegistry.h" 35 #include "llvm/Support/TargetSelect.h" 36 #include "llvm/Target/TargetMachine.h" 37 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 38 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 39 40 #include "isl/union_map.h" 41 42 extern "C" { 43 #include "ppcg/cuda.h" 44 #include "ppcg/gpu.h" 45 #include "ppcg/gpu_print.h" 46 #include "ppcg/ppcg.h" 47 #include "ppcg/schedule.h" 48 } 49 50 #include "llvm/Support/Debug.h" 51 52 using namespace polly; 53 using namespace llvm; 54 55 #define DEBUG_TYPE "polly-codegen-ppcg" 56 57 static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule", 58 cl::desc("Dump the computed GPU Schedule"), 59 cl::Hidden, cl::init(false), cl::ZeroOrMore, 60 cl::cat(PollyCategory)); 61 62 static cl::opt<bool> 63 DumpCode("polly-acc-dump-code", 64 cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, 65 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 66 67 static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir", 68 cl::desc("Dump the kernel LLVM-IR"), 69 cl::Hidden, cl::init(false), cl::ZeroOrMore, 70 cl::cat(PollyCategory)); 71 72 static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm", 73 cl::desc("Dump the kernel assembly code"), 74 cl::Hidden, cl::init(false), cl::ZeroOrMore, 75 cl::cat(PollyCategory)); 76 77 static cl::opt<bool> FastMath("polly-acc-fastmath", 78 cl::desc("Allow unsafe math optimizations"), 79 cl::Hidden, cl::init(false), cl::ZeroOrMore, 80 cl::cat(PollyCategory)); 81 static cl::opt<bool> SharedMemory("polly-acc-use-shared", 82 cl::desc("Use shared memory"), cl::Hidden, 83 cl::init(false), cl::ZeroOrMore, 84 cl::cat(PollyCategory)); 85 static cl::opt<bool> PrivateMemory("polly-acc-use-private", 86 cl::desc("Use private memory"), cl::Hidden, 87 cl::init(false), cl::ZeroOrMore, 88 cl::cat(PollyCategory)); 89 90 static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory", 91 cl::desc("Generate Host kernel code assuming" 92 " that all memory has been" 93 " declared as managed memory"), 94 cl::Hidden, cl::init(false), cl::ZeroOrMore, 95 cl::cat(PollyCategory)); 96 97 static cl::opt<std::string> 98 CudaVersion("polly-acc-cuda-version", 99 cl::desc("The CUDA 
version to compile for"), cl::Hidden, 100 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); 101 102 static cl::opt<int> 103 MinCompute("polly-acc-mincompute", 104 cl::desc("Minimal number of compute statements to run on GPU."), 105 cl::Hidden, cl::init(10 * 512 * 512)); 106 107 /// Create the ast expressions for a ScopStmt. 108 /// 109 /// This function is a callback for to generate the ast expressions for each 110 /// of the scheduled ScopStmts. 111 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 112 void *StmtT, isl_ast_build *Build, 113 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 114 isl_id *Id, void *User), 115 void *UserIndex, 116 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 117 void *UserExpr) { 118 119 ScopStmt *Stmt = (ScopStmt *)StmtT; 120 121 isl_ctx *Ctx; 122 123 if (!Stmt || !Build) 124 return NULL; 125 126 Ctx = isl_ast_build_get_ctx(Build); 127 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 128 129 for (MemoryAccess *Acc : *Stmt) { 130 isl_map *AddrFunc = Acc->getAddressFunction(); 131 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 132 isl_id *RefId = Acc->getId(); 133 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 134 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 135 MPA = isl_multi_pw_aff_coalesce(MPA); 136 MPA = FunctionIndex(MPA, RefId, UserIndex); 137 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 138 Access = FunctionExpr(Access, RefId, UserExpr); 139 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 140 } 141 142 return RefToExpr; 143 } 144 145 /// Given a LLVM Type, compute its size in bytes, 146 static int computeSizeInBytes(const Type *T) { 147 int bytes = T->getPrimitiveSizeInBits() / 8; 148 if (bytes == 0) 149 bytes = T->getScalarSizeInBits() / 8; 150 return bytes; 151 } 152 153 /// Generate code for a GPU specific isl AST. 154 /// 155 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 156 /// generates code for general-purpose AST nodes, with special functionality 157 /// for generating GPU specific user nodes. 158 /// 159 /// @see GPUNodeBuilder::createUser 160 class GPUNodeBuilder : public IslNodeBuilder { 161 public: 162 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, 163 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 164 DominatorTree &DT, Scop &S, BasicBlock *StartBlock, 165 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) 166 : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), 167 Prog(Prog), Runtime(Runtime), Arch(Arch) { 168 getExprBuilder().setIDToSAI(&IDToSAI); 169 } 170 171 /// Create after-run-time-check initialization code. 172 void initializeAfterRTH(); 173 174 /// Finalize the generated scop. 175 virtual void finalize(); 176 177 /// Track if the full build process was successful. 178 /// 179 /// This value is set to false, if throughout the build process an error 180 /// occurred which prevents us from generating valid GPU code. 181 bool BuildSuccessful = true; 182 183 /// The maximal number of loops surrounding a sequential kernel. 184 unsigned DeepestSequential = 0; 185 186 /// The maximal number of loops surrounding a parallel kernel. 187 unsigned DeepestParallel = 0; 188 189 private: 190 /// A vector of array base pointers for which a new ScopArrayInfo was created. 191 /// 192 /// This vector is used to delete the ScopArrayInfo when it is not needed any 193 /// more. 
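  /// See the end of createKernel(), which calls Scop::invalidateScopArrayInfo
  /// for each recorded base pointer and then clears this vector.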
  std::vector<Value *> LocalArrays;

  /// A map from ScopArrays to their corresponding device allocations.
  std::map<ScopArrayInfo *, Value *> DeviceAllocations;

  /// The current GPU context.
  Value *GPUContext;

  /// The set of isl_ids allocated in the kernel.
  std::vector<isl_id *> KernelIds;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// The GPU Runtime implementation to use (OpenCL or CUDA).
  GPURuntime Runtime;

  /// The GPU Architecture to target.
  GPUArch Arch;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  ///   - ScopStmt: A computational statement (TODO)
  ///   - Kernel: A GPU kernel call (TODO)
  ///   - Data-Transfer: A GPU <-> CPU data-transfer
  ///   - In-kernel synchronization
  ///   - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };

  /// Create code for a data transfer statement.
  ///
  /// @param TransferStmt The data transfer statement.
  /// @param Direction The direction in which to transfer data.
  void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                          enum DataDirection Direction);

  /// Find llvm::Values referenced in a GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values.
  ///
  /// @returns A set of values referenced by the kernel.
  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);

  /// Compute the sizes of the execution grid for a given kernel.
  ///
  /// @param Kernel The kernel to compute grid sizes for.
  ///
  /// @returns A tuple with grid sizes for the X and Y dimensions.
  std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);

  /// Create an array that can be sent to the kernel on the device using a
  /// host pointer. This is required for managed memory, when we directly send
  /// host pointers to the device.
  /// \note
  /// This is to be used only with managed memory.
  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
                                       ScopArrayInfo *ArrayInfo);

  /// Compute the sizes of the thread blocks for a given kernel.
  ///
  /// @param Kernel The kernel to compute thread block sizes for.
  ///
  /// @returns A tuple with thread block sizes for the X, Y, and Z dimensions.
  std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);

  /// Store a specific kernel launch parameter in the array of kernel launch
  /// parameters.
  ///
  /// @param Parameters The list of parameters in which to store.
  /// @param Param The kernel launch parameter to store.
  /// @param Index The index in the parameter list at which to store the
  ///        parameter.
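  ///
  /// As laid out by createLaunchParameters(), @p Parameters is an array with
  /// 2 * NumArgs slots of type i8*: the first NumArgs slots point to the
  /// argument values, the following NumArgs slots point to i32 values that
  /// hold the size in bytes of each argument.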
290 void insertStoreParameter(Instruction *Parameters, Instruction *Param, 291 int Index); 292 293 /// Create kernel launch parameters. 294 /// 295 /// @param Kernel The kernel to create parameters for. 296 /// @param F The kernel function that has been created. 297 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 298 /// 299 /// @returns A stack allocated array with pointers to the parameter 300 /// values that are passed to the kernel. 301 Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, 302 SetVector<Value *> SubtreeValues); 303 304 /// Create declarations for kernel variable. 305 /// 306 /// This includes shared memory declarations. 307 /// 308 /// @param Kernel The kernel definition to create variables for. 309 /// @param FN The function into which to generate the variables. 310 void createKernelVariables(ppcg_kernel *Kernel, Function *FN); 311 312 /// Add CUDA annotations to module. 313 /// 314 /// Add a set of CUDA annotations that declares the maximal block dimensions 315 /// that will be used to execute the CUDA kernel. This allows the NVIDIA 316 /// PTX compiler to bound the number of allocated registers to ensure the 317 /// resulting kernel is known to run with up to as many block dimensions 318 /// as specified here. 319 /// 320 /// @param M The module to add the annotations to. 321 /// @param BlockDimX The size of block dimension X. 322 /// @param BlockDimY The size of block dimension Y. 323 /// @param BlockDimZ The size of block dimension Z. 324 void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, 325 Value *BlockDimZ); 326 327 /// Create GPU kernel. 328 /// 329 /// Code generate the kernel described by @p KernelStmt. 330 /// 331 /// @param KernelStmt The ast node to generate kernel code for. 332 void createKernel(__isl_take isl_ast_node *KernelStmt); 333 334 /// Generate code that computes the size of an array. 335 /// 336 /// @param Array The array for which to compute a size. 337 Value *getArraySize(gpu_array_info *Array); 338 339 /// Generate code to compute the minimal offset at which an array is accessed. 340 /// 341 /// The offset of an array is the minimal array location accessed in a scop. 342 /// 343 /// Example: 344 /// 345 /// for (long i = 0; i < 100; i++) 346 /// A[i + 42] += ... 347 /// 348 /// getArrayOffset(A) results in 42. 349 /// 350 /// @param Array The array for which to compute the offset. 351 /// @returns An llvm::Value that contains the offset of the array. 352 Value *getArrayOffset(gpu_array_info *Array); 353 354 /// Prepare the kernel arguments for kernel code generation 355 /// 356 /// @param Kernel The kernel to generate code for. 357 /// @param FN The function created for the kernel. 358 void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); 359 360 /// Create kernel function. 361 /// 362 /// Create a kernel function located in a newly created module that can serve 363 /// as target for device code generation. Set the Builder to point to the 364 /// start block of this newly created function. 365 /// 366 /// @param Kernel The kernel to generate code for. 367 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 368 void createKernelFunction(ppcg_kernel *Kernel, 369 SetVector<Value *> &SubtreeValues); 370 371 /// Create the declaration of a kernel function. 372 /// 373 /// The kernel function takes as arguments: 374 /// 375 /// - One i8 pointer for each external array reference used in the kernel. 
376 /// - Host iterators 377 /// - Parameters 378 /// - Other LLVM Value references (TODO) 379 /// 380 /// @param Kernel The kernel to generate the function declaration for. 381 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 382 /// 383 /// @returns The newly declared function. 384 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 385 SetVector<Value *> &SubtreeValues); 386 387 /// Insert intrinsic functions to obtain thread and block ids. 388 /// 389 /// @param The kernel to generate the intrinsic functions for. 390 void insertKernelIntrinsics(ppcg_kernel *Kernel); 391 392 /// Create a global-to-shared or shared-to-global copy statement. 393 /// 394 /// @param CopyStmt The copy statement to generate code for 395 void createKernelCopy(ppcg_kernel_stmt *CopyStmt); 396 397 /// Create code for a ScopStmt called in @p Expr. 398 /// 399 /// @param Expr The expression containing the call. 400 /// @param KernelStmt The kernel statement referenced in the call. 401 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 402 403 /// Create an in-kernel synchronization call. 404 void createKernelSync(); 405 406 /// Create a PTX assembly string for the current GPU kernel. 407 /// 408 /// @returns A string containing the corresponding PTX assembly code. 409 std::string createKernelASM(); 410 411 /// Remove references from the dominator tree to the kernel function @p F. 412 /// 413 /// @param F The function to remove references to. 414 void clearDominators(Function *F); 415 416 /// Remove references from scalar evolution to the kernel function @p F. 417 /// 418 /// @param F The function to remove references to. 419 void clearScalarEvolution(Function *F); 420 421 /// Remove references from loop info to the kernel function @p F. 422 /// 423 /// @param F The function to remove references to. 424 void clearLoops(Function *F); 425 426 /// Finalize the generation of the kernel function. 427 /// 428 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 429 /// dump its IR to stderr. 430 /// 431 /// @returns The Assembly string of the kernel. 432 std::string finalizeKernelFunction(); 433 434 /// Finalize the generation of the kernel arguments. 435 /// 436 /// This function ensures that not-read-only scalars used in a kernel are 437 /// stored back to the global memory location they are backed with before 438 /// the kernel terminates. 439 /// 440 /// @params Kernel The kernel to finalize kernel arguments for. 441 void finalizeKernelArguments(ppcg_kernel *Kernel); 442 443 /// Create code that allocates memory to store arrays on device. 444 void allocateDeviceArrays(); 445 446 /// Free all allocated device arrays. 447 void freeDeviceArrays(); 448 449 /// Create a call to initialize the GPU context. 450 /// 451 /// @returns A pointer to the newly initialized context. 452 Value *createCallInitContext(); 453 454 /// Create a call to get the device pointer for a kernel allocation. 455 /// 456 /// @param Allocation The Polly GPU allocation 457 /// 458 /// @returns The device parameter corresponding to this allocation. 459 Value *createCallGetDevicePtr(Value *Allocation); 460 461 /// Create a call to free the GPU context. 462 /// 463 /// @param Context A pointer to an initialized GPU context. 464 void createCallFreeContext(Value *Context); 465 466 /// Create a call to allocate memory on the device. 467 /// 468 /// @param Size The size of memory to allocate 469 /// 470 /// @returns A pointer that identifies this allocation. 
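  ///
  /// The returned value is the opaque i8* handle produced by the generated
  /// call to the "polly_allocateMemoryForDevice" runtime function; it is
  /// remembered in DeviceAllocations and later used for the data-transfer
  /// and free calls.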
  Value *createCallAllocateMemoryForDevice(Value *Size);

  /// Create a call to free a device array.
  ///
  /// @param Array The device array to free.
  void createCallFreeDeviceMemory(Value *Array);

  /// Create a call to copy data from host to device.
  ///
  /// @param HostPtr A pointer to the host data that should be copied.
  /// @param DevicePtr A device pointer specifying the location to copy to.
  /// @param Size The number of bytes to copy.
  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
                                      Value *Size);

  /// Create a call to copy data from device to host.
  ///
  /// @param DevicePtr A pointer to the device data that should be copied.
  /// @param HostPtr A host pointer specifying the location to copy to.
  /// @param Size The number of bytes to copy.
  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                      Value *Size);

  /// Create a call to synchronize Host & Device.
  /// \note
  /// This is to be used only with managed memory.
  void createCallSynchronizeDevice();

  /// Create a call to get a kernel from an assembly string.
  ///
  /// @param Buffer The string describing the kernel.
  /// @param Entry The name of the kernel function to call.
  ///
  /// @returns A pointer to a kernel object.
  Value *createCallGetKernel(Value *Buffer, Value *Entry);

  /// Create a call to free a GPU kernel.
  ///
  /// @param GPUKernel The kernel to free.
  void createCallFreeKernel(Value *GPUKernel);

  /// Create a call to launch a GPU kernel.
  ///
  /// @param GPUKernel The kernel to launch.
  /// @param GridDimX The size of the first grid dimension.
  /// @param GridDimY The size of the second grid dimension.
  /// @param BlockDimX The size of the first block dimension.
  /// @param BlockDimY The size of the second block dimension.
  /// @param BlockDimZ The size of the third block dimension.
  /// @param Parameters A pointer to an array that itself contains pointers to
  ///        the parameter values passed for each kernel argument.
  void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                              Value *GridDimY, Value *BlockDimX,
                              Value *BlockDimY, Value *BlockDimZ,
                              Value *Parameters);
};

void GPUNodeBuilder::initializeAfterRTH() {
  BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
                                 &*Builder.GetInsertPoint(), &DT, &LI);
  NewBB->setName("polly.acc.initialize");
  Builder.SetInsertPoint(&NewBB->front());

  GPUContext = createCallInitContext();

  if (!ManagedMemory)
    allocateDeviceArrays();
}

void GPUNodeBuilder::finalize() {
  if (!ManagedMemory)
    freeDeviceArrays();

  createCallFreeContext(GPUContext);
  IslNodeBuilder::finalize();
}

void GPUNodeBuilder::allocateDeviceArrays() {
  assert(!ManagedMemory && "Managed memory will directly send host pointers "
                           "to the kernel.
There is no need for device arrays"); 549 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 550 551 for (int i = 0; i < Prog->n_array; ++i) { 552 gpu_array_info *Array = &Prog->array[i]; 553 auto *ScopArray = (ScopArrayInfo *)Array->user; 554 std::string DevArrayName("p_dev_array_"); 555 DevArrayName.append(Array->name); 556 557 Value *ArraySize = getArraySize(Array); 558 Value *Offset = getArrayOffset(Array); 559 if (Offset) 560 ArraySize = Builder.CreateSub( 561 ArraySize, 562 Builder.CreateMul(Offset, 563 Builder.getInt64(ScopArray->getElemSizeInBytes()))); 564 Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 565 DevArray->setName(DevArrayName); 566 DeviceAllocations[ScopArray] = DevArray; 567 } 568 569 isl_ast_build_free(Build); 570 } 571 572 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, 573 Value *BlockDimY, Value *BlockDimZ) { 574 auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); 575 576 for (auto &F : *M) { 577 if (F.getCallingConv() != CallingConv::PTX_Kernel) 578 continue; 579 580 Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; 581 582 Metadata *Elements[] = { 583 ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), 584 ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), 585 ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), 586 ValueAsMetadata::get(V[2]), 587 }; 588 MDNode *Node = MDNode::get(M->getContext(), Elements); 589 AnnotationNode->addOperand(Node); 590 } 591 } 592 593 void GPUNodeBuilder::freeDeviceArrays() { 594 assert(!ManagedMemory && "Managed memory does not use device arrays"); 595 for (auto &Array : DeviceAllocations) 596 createCallFreeDeviceMemory(Array.second); 597 } 598 599 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { 600 const char *Name = "polly_getKernel"; 601 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 602 Function *F = M->getFunction(Name); 603 604 // If F is not available, declare it. 605 if (!F) { 606 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 607 std::vector<Type *> Args; 608 Args.push_back(Builder.getInt8PtrTy()); 609 Args.push_back(Builder.getInt8PtrTy()); 610 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 611 F = Function::Create(Ty, Linkage, Name, M); 612 } 613 614 return Builder.CreateCall(F, {Buffer, Entry}); 615 } 616 617 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { 618 const char *Name = "polly_getDevicePtr"; 619 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 620 Function *F = M->getFunction(Name); 621 622 // If F is not available, declare it. 623 if (!F) { 624 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 625 std::vector<Type *> Args; 626 Args.push_back(Builder.getInt8PtrTy()); 627 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 628 F = Function::Create(Ty, Linkage, Name, M); 629 } 630 631 return Builder.CreateCall(F, {Allocation}); 632 } 633 634 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 635 Value *GridDimY, Value *BlockDimX, 636 Value *BlockDimY, Value *BlockDimZ, 637 Value *Parameters) { 638 const char *Name = "polly_launchKernel"; 639 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 640 Function *F = M->getFunction(Name); 641 642 // If F is not available, declare it. 
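  // Note: judging from the argument types pushed below, the runtime entry
  // point declared here has roughly the following C signature:
  //
  //   void polly_launchKernel(void *Kernel, int GridDimX, int GridDimY,
  //                           int BlockDimX, int BlockDimY, int BlockDimZ,
  //                           void **Parameters);
  //
  // (Parameters is passed as a plain i8* at the IR level.)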
643 if (!F) { 644 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 645 std::vector<Type *> Args; 646 Args.push_back(Builder.getInt8PtrTy()); 647 Args.push_back(Builder.getInt32Ty()); 648 Args.push_back(Builder.getInt32Ty()); 649 Args.push_back(Builder.getInt32Ty()); 650 Args.push_back(Builder.getInt32Ty()); 651 Args.push_back(Builder.getInt32Ty()); 652 Args.push_back(Builder.getInt8PtrTy()); 653 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 654 F = Function::Create(Ty, Linkage, Name, M); 655 } 656 657 Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 658 BlockDimZ, Parameters}); 659 } 660 661 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { 662 const char *Name = "polly_freeKernel"; 663 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 664 Function *F = M->getFunction(Name); 665 666 // If F is not available, declare it. 667 if (!F) { 668 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 669 std::vector<Type *> Args; 670 Args.push_back(Builder.getInt8PtrTy()); 671 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 672 F = Function::Create(Ty, Linkage, Name, M); 673 } 674 675 Builder.CreateCall(F, {GPUKernel}); 676 } 677 678 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 679 assert(!ManagedMemory && "Managed memory does not allocate or free memory " 680 "for device"); 681 const char *Name = "polly_freeDeviceMemory"; 682 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 683 Function *F = M->getFunction(Name); 684 685 // If F is not available, declare it. 686 if (!F) { 687 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 688 std::vector<Type *> Args; 689 Args.push_back(Builder.getInt8PtrTy()); 690 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 691 F = Function::Create(Ty, Linkage, Name, M); 692 } 693 694 Builder.CreateCall(F, {Array}); 695 } 696 697 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 698 assert(!ManagedMemory && "Managed memory does not allocate or free memory " 699 "for device"); 700 const char *Name = "polly_allocateMemoryForDevice"; 701 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 702 Function *F = M->getFunction(Name); 703 704 // If F is not available, declare it. 705 if (!F) { 706 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 707 std::vector<Type *> Args; 708 Args.push_back(Builder.getInt64Ty()); 709 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 710 F = Function::Create(Ty, Linkage, Name, M); 711 } 712 713 return Builder.CreateCall(F, {Size}); 714 } 715 716 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, 717 Value *DeviceData, 718 Value *Size) { 719 assert(!ManagedMemory && "Managed memory does not transfer memory between " 720 "device and host"); 721 const char *Name = "polly_copyFromHostToDevice"; 722 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 723 Function *F = M->getFunction(Name); 724 725 // If F is not available, declare it. 
726 if (!F) { 727 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 728 std::vector<Type *> Args; 729 Args.push_back(Builder.getInt8PtrTy()); 730 Args.push_back(Builder.getInt8PtrTy()); 731 Args.push_back(Builder.getInt64Ty()); 732 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 733 F = Function::Create(Ty, Linkage, Name, M); 734 } 735 736 Builder.CreateCall(F, {HostData, DeviceData, Size}); 737 } 738 739 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, 740 Value *HostData, 741 Value *Size) { 742 assert(!ManagedMemory && "Managed memory does not transfer memory between " 743 "device and host"); 744 const char *Name = "polly_copyFromDeviceToHost"; 745 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 746 Function *F = M->getFunction(Name); 747 748 // If F is not available, declare it. 749 if (!F) { 750 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 751 std::vector<Type *> Args; 752 Args.push_back(Builder.getInt8PtrTy()); 753 Args.push_back(Builder.getInt8PtrTy()); 754 Args.push_back(Builder.getInt64Ty()); 755 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 756 F = Function::Create(Ty, Linkage, Name, M); 757 } 758 759 Builder.CreateCall(F, {DeviceData, HostData, Size}); 760 } 761 762 void GPUNodeBuilder::createCallSynchronizeDevice() { 763 assert(ManagedMemory && "explicit synchronization is only necessary for " 764 "managed memory"); 765 const char *Name = "polly_synchronizeDevice"; 766 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 767 Function *F = M->getFunction(Name); 768 769 // If F is not available, declare it. 770 if (!F) { 771 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 772 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); 773 F = Function::Create(Ty, Linkage, Name, M); 774 } 775 776 Builder.CreateCall(F); 777 } 778 779 Value *GPUNodeBuilder::createCallInitContext() { 780 const char *Name; 781 782 switch (Runtime) { 783 case GPURuntime::CUDA: 784 Name = "polly_initContextCUDA"; 785 break; 786 case GPURuntime::OpenCL: 787 Name = "polly_initContextCL"; 788 break; 789 } 790 791 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 792 Function *F = M->getFunction(Name); 793 794 // If F is not available, declare it. 795 if (!F) { 796 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 797 std::vector<Type *> Args; 798 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 799 F = Function::Create(Ty, Linkage, Name, M); 800 } 801 802 return Builder.CreateCall(F, {}); 803 } 804 805 void GPUNodeBuilder::createCallFreeContext(Value *Context) { 806 const char *Name = "polly_freeContext"; 807 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 808 Function *F = M->getFunction(Name); 809 810 // If F is not available, declare it. 811 if (!F) { 812 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 813 std::vector<Type *> Args; 814 Args.push_back(Builder.getInt8PtrTy()); 815 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 816 F = Function::Create(Ty, Linkage, Name, M); 817 } 818 819 Builder.CreateCall(F, {Context}); 820 } 821 822 /// Check if one string is a prefix of another. 823 /// 824 /// @param String The string in which to look for the prefix. 825 /// @param Prefix The prefix to look for. 
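///
/// @returns True if @p String starts with @p Prefix.
///
/// Used by createUser() to recognize, e.g., the "to_device" and "from_device"
/// data-transfer statements emitted by PPCG.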
826 static bool isPrefix(std::string String, std::string Prefix) { 827 return String.find(Prefix) == 0; 828 } 829 830 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 831 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 832 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 833 834 if (!gpu_array_is_scalar(Array)) { 835 auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 836 isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 837 838 for (unsigned int i = 1; i < Array->n_index; i++) { 839 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 840 isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 841 Res = isl_ast_expr_mul(Res, Expr); 842 } 843 844 Value *NumElements = ExprBuilder.create(Res); 845 if (NumElements->getType() != ArraySize->getType()) 846 NumElements = Builder.CreateSExt(NumElements, ArraySize->getType()); 847 ArraySize = Builder.CreateMul(ArraySize, NumElements); 848 } 849 isl_ast_build_free(Build); 850 return ArraySize; 851 } 852 853 Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { 854 if (gpu_array_is_scalar(Array)) 855 return nullptr; 856 857 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 858 859 isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent)); 860 861 isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min)); 862 863 for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) 864 ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0); 865 866 if (isl_set_is_subset(Min, ZeroSet)) { 867 isl_set_free(Min); 868 isl_set_free(ZeroSet); 869 isl_ast_build_free(Build); 870 return nullptr; 871 } 872 isl_set_free(ZeroSet); 873 874 isl_ast_expr *Result = 875 isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0)); 876 877 for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) { 878 if (i > 0) { 879 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]); 880 isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 881 Result = isl_ast_expr_mul(Result, BExpr); 882 } 883 isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i); 884 isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin); 885 Result = isl_ast_expr_add(Result, MExpr); 886 } 887 888 Value *ResultValue = ExprBuilder.create(Result); 889 isl_set_free(Min); 890 isl_ast_build_free(Build); 891 892 return ResultValue; 893 } 894 895 Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array, 896 ScopArrayInfo *ArrayInfo) { 897 898 assert(ManagedMemory && "Only used when you wish to get a host " 899 "pointer for sending data to the kernel, " 900 "with managed memory"); 901 std::map<ScopArrayInfo *, Value *>::iterator it; 902 if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) { 903 return it->second; 904 } else { 905 Value *HostPtr; 906 907 if (gpu_array_is_scalar(Array)) 908 HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo); 909 else 910 HostPtr = ArrayInfo->getBasePtr(); 911 912 Value *Offset = getArrayOffset(Array); 913 if (Offset) { 914 HostPtr = Builder.CreatePointerCast( 915 HostPtr, ArrayInfo->getElementType()->getPointerTo()); 916 HostPtr = Builder.CreateGEP(HostPtr, Offset); 917 } 918 919 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 920 DeviceAllocations[ArrayInfo] = HostPtr; 921 return HostPtr; 922 } 923 } 924 925 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 926 enum DataDirection Direction) { 927 assert(!ManagedMemory && "Managed memory needs no 
data transfers"); 928 isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); 929 isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 930 isl_id *Id = isl_ast_expr_get_id(Arg); 931 auto Array = (gpu_array_info *)isl_id_get_user(Id); 932 auto ScopArray = (ScopArrayInfo *)(Array->user); 933 934 Value *Size = getArraySize(Array); 935 Value *Offset = getArrayOffset(Array); 936 Value *DevPtr = DeviceAllocations[ScopArray]; 937 938 Value *HostPtr; 939 940 if (gpu_array_is_scalar(Array)) 941 HostPtr = BlockGen.getOrCreateAlloca(ScopArray); 942 else 943 HostPtr = ScopArray->getBasePtr(); 944 945 if (Offset) { 946 HostPtr = Builder.CreatePointerCast( 947 HostPtr, ScopArray->getElementType()->getPointerTo()); 948 HostPtr = Builder.CreateGEP(HostPtr, Offset); 949 } 950 951 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 952 953 if (Offset) { 954 Size = Builder.CreateSub( 955 Size, Builder.CreateMul( 956 Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); 957 } 958 959 if (Direction == HOST_TO_DEVICE) 960 createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 961 else 962 createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 963 964 isl_id_free(Id); 965 isl_ast_expr_free(Arg); 966 isl_ast_expr_free(Expr); 967 isl_ast_node_free(TransferStmt); 968 } 969 970 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 971 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 972 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 973 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 974 isl_id_free(Id); 975 isl_ast_expr_free(StmtExpr); 976 977 const char *Str = isl_id_get_name(Id); 978 if (!strcmp(Str, "kernel")) { 979 createKernel(UserStmt); 980 isl_ast_expr_free(Expr); 981 return; 982 } 983 984 if (isPrefix(Str, "to_device")) { 985 if (!ManagedMemory) 986 createDataTransfer(UserStmt, HOST_TO_DEVICE); 987 else 988 isl_ast_node_free(UserStmt); 989 990 isl_ast_expr_free(Expr); 991 return; 992 } 993 994 if (isPrefix(Str, "from_device")) { 995 if (!ManagedMemory) { 996 createDataTransfer(UserStmt, DEVICE_TO_HOST); 997 } else { 998 createCallSynchronizeDevice(); 999 isl_ast_node_free(UserStmt); 1000 } 1001 isl_ast_expr_free(Expr); 1002 return; 1003 } 1004 1005 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 1006 struct ppcg_kernel_stmt *KernelStmt = 1007 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 1008 isl_id_free(Anno); 1009 1010 switch (KernelStmt->type) { 1011 case ppcg_kernel_domain: 1012 createScopStmt(Expr, KernelStmt); 1013 isl_ast_node_free(UserStmt); 1014 return; 1015 case ppcg_kernel_copy: 1016 createKernelCopy(KernelStmt); 1017 isl_ast_expr_free(Expr); 1018 isl_ast_node_free(UserStmt); 1019 return; 1020 case ppcg_kernel_sync: 1021 createKernelSync(); 1022 isl_ast_expr_free(Expr); 1023 isl_ast_node_free(UserStmt); 1024 return; 1025 } 1026 1027 isl_ast_expr_free(Expr); 1028 isl_ast_node_free(UserStmt); 1029 return; 1030 } 1031 void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { 1032 isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); 1033 LocalIndex = isl_ast_expr_address_of(LocalIndex); 1034 Value *LocalAddr = ExprBuilder.create(LocalIndex); 1035 isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index); 1036 Index = isl_ast_expr_address_of(Index); 1037 Value *GlobalAddr = ExprBuilder.create(Index); 1038 1039 if (KernelStmt->u.c.read) { 1040 LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read"); 1041 Builder.CreateStore(Load, LocalAddr); 1042 } else { 1043 LoadInst *Load = 
Builder.CreateLoad(LocalAddr, "shared.write"); 1044 Builder.CreateStore(Load, GlobalAddr); 1045 } 1046 } 1047 1048 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 1049 ppcg_kernel_stmt *KernelStmt) { 1050 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1051 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 1052 1053 LoopToScevMapT LTS; 1054 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 1055 1056 createSubstitutions(Expr, Stmt, LTS); 1057 1058 if (Stmt->isBlockStmt()) 1059 BlockGen.copyStmt(*Stmt, LTS, Indexes); 1060 else 1061 RegionGen.copyStmt(*Stmt, LTS, Indexes); 1062 } 1063 1064 void GPUNodeBuilder::createKernelSync() { 1065 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1066 1067 Function *Sync; 1068 1069 switch (Arch) { 1070 case GPUArch::NVPTX64: 1071 Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 1072 break; 1073 } 1074 1075 Builder.CreateCall(Sync, {}); 1076 } 1077 1078 /// Collect llvm::Values referenced from @p Node 1079 /// 1080 /// This function only applies to isl_ast_nodes that are user_nodes referring 1081 /// to a ScopStmt. All other node types are ignore. 1082 /// 1083 /// @param Node The node to collect references for. 1084 /// @param User A user pointer used as storage for the data that is collected. 1085 /// 1086 /// @returns isl_bool_true if data could be collected successfully. 1087 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 1088 if (isl_ast_node_get_type(Node) != isl_ast_node_user) 1089 return isl_bool_true; 1090 1091 isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 1092 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 1093 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 1094 const char *Str = isl_id_get_name(Id); 1095 isl_id_free(Id); 1096 isl_ast_expr_free(StmtExpr); 1097 isl_ast_expr_free(Expr); 1098 1099 if (!isPrefix(Str, "Stmt")) 1100 return isl_bool_true; 1101 1102 Id = isl_ast_node_get_annotation(Node); 1103 auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 1104 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1105 isl_id_free(Id); 1106 1107 addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); 1108 1109 return isl_bool_true; 1110 } 1111 1112 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 1113 SetVector<Value *> SubtreeValues; 1114 SetVector<const SCEV *> SCEVs; 1115 SetVector<const Loop *> Loops; 1116 SubtreeReferences References = { 1117 LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 1118 1119 for (const auto &I : IDToValue) 1120 SubtreeValues.insert(I.second); 1121 1122 isl_ast_node_foreach_descendant_top_down( 1123 Kernel->tree, collectReferencesInGPUStmt, &References); 1124 1125 for (const SCEV *Expr : SCEVs) 1126 findValues(Expr, SE, SubtreeValues); 1127 1128 for (auto &SAI : S.arrays()) 1129 SubtreeValues.remove(SAI->getBasePtr()); 1130 1131 isl_space *Space = S.getParamSpace(); 1132 for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 1133 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 1134 assert(IDToValue.count(Id)); 1135 Value *Val = IDToValue[Id]; 1136 SubtreeValues.remove(Val); 1137 isl_id_free(Id); 1138 } 1139 isl_space_free(Space); 1140 1141 for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 1142 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1143 assert(IDToValue.count(Id)); 1144 Value *Val = IDToValue[Id]; 1145 SubtreeValues.remove(Val); 1146 isl_id_free(Id); 1147 } 1148 1149 
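  // At this point SubtreeValues contains only the values that must be passed
  // to the kernel explicitly: everything referenced from the kernel body
  // except array base pointers, scop parameters, and the host iterators of
  // the kernel, which are all passed as separate kernel arguments.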
return SubtreeValues; 1150 } 1151 1152 void GPUNodeBuilder::clearDominators(Function *F) { 1153 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 1154 std::vector<BasicBlock *> Nodes; 1155 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 1156 Nodes.push_back(I->getBlock()); 1157 1158 for (BasicBlock *BB : Nodes) 1159 DT.eraseNode(BB); 1160 } 1161 1162 void GPUNodeBuilder::clearScalarEvolution(Function *F) { 1163 for (BasicBlock &BB : *F) { 1164 Loop *L = LI.getLoopFor(&BB); 1165 if (L) 1166 SE.forgetLoop(L); 1167 } 1168 } 1169 1170 void GPUNodeBuilder::clearLoops(Function *F) { 1171 for (BasicBlock &BB : *F) { 1172 Loop *L = LI.getLoopFor(&BB); 1173 if (L) 1174 SE.forgetLoop(L); 1175 LI.removeBlock(&BB); 1176 } 1177 } 1178 1179 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { 1180 std::vector<Value *> Sizes; 1181 isl_ast_build *Context = isl_ast_build_from_context(S.getContext()); 1182 1183 for (long i = 0; i < Kernel->n_grid; i++) { 1184 isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i); 1185 isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size); 1186 Value *Res = ExprBuilder.create(GridSize); 1187 Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 1188 Sizes.push_back(Res); 1189 } 1190 isl_ast_build_free(Context); 1191 1192 for (long i = Kernel->n_grid; i < 3; i++) 1193 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 1194 1195 return std::make_tuple(Sizes[0], Sizes[1]); 1196 } 1197 1198 std::tuple<Value *, Value *, Value *> 1199 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { 1200 std::vector<Value *> Sizes; 1201 1202 for (long i = 0; i < Kernel->n_block; i++) { 1203 Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); 1204 Sizes.push_back(Res); 1205 } 1206 1207 for (long i = Kernel->n_block; i < 3; i++) 1208 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 1209 1210 return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); 1211 } 1212 1213 void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters, 1214 Instruction *Param, int Index) { 1215 Value *Slot = Builder.CreateGEP( 1216 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1217 Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1218 Builder.CreateStore(ParamTyped, Slot); 1219 } 1220 1221 Value * 1222 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, 1223 SetVector<Value *> SubtreeValues) { 1224 const int NumArgs = F->arg_size(); 1225 std::vector<int> ArgSizes(NumArgs); 1226 1227 Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); 1228 1229 BasicBlock *EntryBlock = 1230 &Builder.GetInsertBlock()->getParent()->getEntryBlock(); 1231 auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); 1232 std::string Launch = "polly_launch_" + std::to_string(Kernel->id); 1233 Instruction *Parameters = new AllocaInst( 1234 ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator()); 1235 1236 int Index = 0; 1237 for (long i = 0; i < Prog->n_array; i++) { 1238 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1239 continue; 1240 1241 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1242 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 1243 1244 ArgSizes[Index] = SAI->getElemSizeInBytes(); 1245 1246 Value *DevArray = nullptr; 1247 if (ManagedMemory) { 1248 DevArray = getOrCreateManagedDeviceArray( 1249 &Prog->array[i], const_cast<ScopArrayInfo *>(SAI)); 1250 } 
else { 1251 DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)]; 1252 DevArray = createCallGetDevicePtr(DevArray); 1253 } 1254 assert(DevArray != nullptr && "Array to be offloaded to device not " 1255 "initialized"); 1256 Value *Offset = getArrayOffset(&Prog->array[i]); 1257 1258 if (Offset) { 1259 DevArray = Builder.CreatePointerCast( 1260 DevArray, SAI->getElementType()->getPointerTo()); 1261 DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset)); 1262 DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); 1263 } 1264 Value *Slot = Builder.CreateGEP( 1265 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1266 1267 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1268 Value *ValPtr = nullptr; 1269 if (ManagedMemory) 1270 ValPtr = DevArray; 1271 else 1272 ValPtr = BlockGen.getOrCreateAlloca(SAI); 1273 1274 assert(ValPtr != nullptr && "ValPtr that should point to a valid object" 1275 " to be stored into Parameters"); 1276 Value *ValPtrCast = 1277 Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); 1278 Builder.CreateStore(ValPtrCast, Slot); 1279 } else { 1280 Instruction *Param = 1281 new AllocaInst(Builder.getInt8PtrTy(), AddressSpace, 1282 Launch + "_param_" + std::to_string(Index), 1283 EntryBlock->getTerminator()); 1284 Builder.CreateStore(DevArray, Param); 1285 Value *ParamTyped = 1286 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1287 Builder.CreateStore(ParamTyped, Slot); 1288 } 1289 Index++; 1290 } 1291 1292 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1293 1294 for (long i = 0; i < NumHostIters; i++) { 1295 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1296 Value *Val = IDToValue[Id]; 1297 isl_id_free(Id); 1298 1299 ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1300 1301 Instruction *Param = 1302 new AllocaInst(Val->getType(), AddressSpace, 1303 Launch + "_param_" + std::to_string(Index), 1304 EntryBlock->getTerminator()); 1305 Builder.CreateStore(Val, Param); 1306 insertStoreParameter(Parameters, Param, Index); 1307 Index++; 1308 } 1309 1310 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1311 1312 for (long i = 0; i < NumVars; i++) { 1313 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1314 Value *Val = IDToValue[Id]; 1315 isl_id_free(Id); 1316 1317 ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1318 1319 Instruction *Param = 1320 new AllocaInst(Val->getType(), AddressSpace, 1321 Launch + "_param_" + std::to_string(Index), 1322 EntryBlock->getTerminator()); 1323 Builder.CreateStore(Val, Param); 1324 insertStoreParameter(Parameters, Param, Index); 1325 Index++; 1326 } 1327 1328 for (auto Val : SubtreeValues) { 1329 ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1330 1331 Instruction *Param = 1332 new AllocaInst(Val->getType(), AddressSpace, 1333 Launch + "_param_" + std::to_string(Index), 1334 EntryBlock->getTerminator()); 1335 Builder.CreateStore(Val, Param); 1336 insertStoreParameter(Parameters, Param, Index); 1337 Index++; 1338 } 1339 1340 for (int i = 0; i < NumArgs; i++) { 1341 Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); 1342 Instruction *Param = 1343 new AllocaInst(Builder.getInt32Ty(), AddressSpace, 1344 Launch + "_param_size_" + std::to_string(i), 1345 EntryBlock->getTerminator()); 1346 Builder.CreateStore(Val, Param); 1347 insertStoreParameter(Parameters, Param, Index); 1348 Index++; 1349 } 1350 1351 auto Location = EntryBlock->getTerminator(); 1352 return new BitCastInst(Parameters, 
Builder.getInt8PtrTy(), 1353 Launch + "_params_i8ptr", Location); 1354 } 1355 1356 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 1357 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 1358 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 1359 isl_id_free(Id); 1360 isl_ast_node_free(KernelStmt); 1361 1362 if (Kernel->n_grid > 1) 1363 DeepestParallel = 1364 std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set)); 1365 else 1366 DeepestSequential = 1367 std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set)); 1368 1369 Value *BlockDimX, *BlockDimY, *BlockDimZ; 1370 std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); 1371 1372 SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); 1373 1374 assert(Kernel->tree && "Device AST of kernel node is empty"); 1375 1376 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 1377 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 1378 ValueMapT HostValueMap = ValueMap; 1379 BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; 1380 ScalarMap.clear(); 1381 1382 SetVector<const Loop *> Loops; 1383 1384 // Create for all loops we depend on values that contain the current loop 1385 // iteration. These values are necessary to generate code for SCEVs that 1386 // depend on such loops. As a result we need to pass them to the subfunction. 1387 for (const Loop *L : Loops) { 1388 const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 1389 SE.getUnknown(Builder.getInt64(1)), 1390 L, SCEV::FlagAnyWrap); 1391 Value *V = generateSCEV(OuterLIV); 1392 OutsideLoopIterations[L] = SE.getUnknown(V); 1393 SubtreeValues.insert(V); 1394 } 1395 1396 createKernelFunction(Kernel, SubtreeValues); 1397 1398 create(isl_ast_node_copy(Kernel->tree)); 1399 1400 finalizeKernelArguments(Kernel); 1401 Function *F = Builder.GetInsertBlock()->getParent(); 1402 addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); 1403 clearDominators(F); 1404 clearScalarEvolution(F); 1405 clearLoops(F); 1406 1407 IDToValue = HostIDs; 1408 1409 ValueMap = std::move(HostValueMap); 1410 ScalarMap = std::move(HostScalarMap); 1411 EscapeMap.clear(); 1412 IDToSAI.clear(); 1413 Annotator.resetAlternativeAliasBases(); 1414 for (auto &BasePtr : LocalArrays) 1415 S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array); 1416 LocalArrays.clear(); 1417 1418 std::string ASMString = finalizeKernelFunction(); 1419 Builder.SetInsertPoint(&HostInsertPoint); 1420 Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); 1421 1422 std::string Name = "kernel_" + std::to_string(Kernel->id); 1423 Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); 1424 Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); 1425 Value *GPUKernel = createCallGetKernel(KernelString, NameString); 1426 1427 Value *GridDimX, *GridDimY; 1428 std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); 1429 1430 createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 1431 BlockDimZ, Parameters); 1432 createCallFreeKernel(GPUKernel); 1433 1434 for (auto Id : KernelIds) 1435 isl_id_free(Id); 1436 1437 KernelIds.clear(); 1438 } 1439 1440 /// Compute the DataLayout string for the NVPTX backend. 1441 /// 1442 /// @param is64Bit Are we looking for a 64 bit architecture? 
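///
/// @returns A data-layout string for the NVPTX backend that can be passed to
///          Module::setDataLayout() (see createKernelFunction()).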
1443 static std::string computeNVPTXDataLayout(bool is64Bit) { 1444 std::string Ret = ""; 1445 1446 if (!is64Bit) { 1447 Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1448 "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1449 "64-v128:128:128-n16:32:64"; 1450 } else { 1451 Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1452 "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1453 "64-v128:128:128-n16:32:64"; 1454 } 1455 1456 return Ret; 1457 } 1458 1459 Function * 1460 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 1461 SetVector<Value *> &SubtreeValues) { 1462 std::vector<Type *> Args; 1463 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 1464 1465 for (long i = 0; i < Prog->n_array; i++) { 1466 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1467 continue; 1468 1469 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1470 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1471 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 1472 Args.push_back(SAI->getElementType()); 1473 } else { 1474 static const int UseGlobalMemory = 1; 1475 Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); 1476 } 1477 } 1478 1479 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1480 1481 for (long i = 0; i < NumHostIters; i++) 1482 Args.push_back(Builder.getInt64Ty()); 1483 1484 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1485 1486 for (long i = 0; i < NumVars; i++) { 1487 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1488 Value *Val = IDToValue[Id]; 1489 isl_id_free(Id); 1490 Args.push_back(Val->getType()); 1491 } 1492 1493 for (auto *V : SubtreeValues) 1494 Args.push_back(V->getType()); 1495 1496 auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 1497 auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 1498 GPUModule.get()); 1499 1500 switch (Arch) { 1501 case GPUArch::NVPTX64: 1502 FN->setCallingConv(CallingConv::PTX_Kernel); 1503 break; 1504 } 1505 1506 auto Arg = FN->arg_begin(); 1507 for (long i = 0; i < Kernel->n_array; i++) { 1508 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1509 continue; 1510 1511 Arg->setName(Kernel->array[i].array->name); 1512 1513 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1514 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1515 Type *EleTy = SAI->getElementType(); 1516 Value *Val = &*Arg; 1517 SmallVector<const SCEV *, 4> Sizes; 1518 isl_ast_build *Build = 1519 isl_ast_build_from_context(isl_set_copy(Prog->context)); 1520 Sizes.push_back(nullptr); 1521 for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 1522 isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 1523 Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); 1524 auto V = ExprBuilder.create(DimSize); 1525 Sizes.push_back(SE.getSCEV(V)); 1526 } 1527 const ScopArrayInfo *SAIRep = 1528 S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array); 1529 LocalArrays.push_back(Val); 1530 1531 isl_ast_build_free(Build); 1532 KernelIds.push_back(Id); 1533 IDToSAI[Id] = SAIRep; 1534 Arg++; 1535 } 1536 1537 for (long i = 0; i < NumHostIters; i++) { 1538 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1539 Arg->setName(isl_id_get_name(Id)); 1540 IDToValue[Id] = &*Arg; 1541 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1542 Arg++; 1543 } 1544 1545 for (long i = 0; i < NumVars; i++) { 1546 isl_id *Id = isl_space_get_dim_id(Kernel->space, 
isl_dim_param, i); 1547 Arg->setName(isl_id_get_name(Id)); 1548 Value *Val = IDToValue[Id]; 1549 ValueMap[Val] = &*Arg; 1550 IDToValue[Id] = &*Arg; 1551 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1552 Arg++; 1553 } 1554 1555 for (auto *V : SubtreeValues) { 1556 Arg->setName(V->getName()); 1557 ValueMap[V] = &*Arg; 1558 Arg++; 1559 } 1560 1561 return FN; 1562 } 1563 1564 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 1565 Intrinsic::ID IntrinsicsBID[2]; 1566 Intrinsic::ID IntrinsicsTID[3]; 1567 1568 switch (Arch) { 1569 case GPUArch::NVPTX64: 1570 IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; 1571 IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; 1572 1573 IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; 1574 IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; 1575 IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; 1576 break; 1577 } 1578 1579 auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 1580 std::string Name = isl_id_get_name(Id); 1581 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1582 Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 1583 Value *Val = Builder.CreateCall(IntrinsicFn, {}); 1584 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 1585 IDToValue[Id] = Val; 1586 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1587 }; 1588 1589 for (int i = 0; i < Kernel->n_grid; ++i) { 1590 isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 1591 addId(Id, IntrinsicsBID[i]); 1592 } 1593 1594 for (int i = 0; i < Kernel->n_block; ++i) { 1595 isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 1596 addId(Id, IntrinsicsTID[i]); 1597 } 1598 } 1599 1600 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { 1601 auto Arg = FN->arg_begin(); 1602 for (long i = 0; i < Kernel->n_array; i++) { 1603 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1604 continue; 1605 1606 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1607 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1608 isl_id_free(Id); 1609 1610 if (SAI->getNumberOfDimensions() > 0) { 1611 Arg++; 1612 continue; 1613 } 1614 1615 Value *Val = &*Arg; 1616 1617 if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { 1618 Type *TypePtr = SAI->getElementType()->getPointerTo(); 1619 Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); 1620 Val = Builder.CreateLoad(TypedArgPtr); 1621 } 1622 1623 Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 1624 Builder.CreateStore(Val, Alloca); 1625 1626 Arg++; 1627 } 1628 } 1629 1630 void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { 1631 auto *FN = Builder.GetInsertBlock()->getParent(); 1632 auto Arg = FN->arg_begin(); 1633 1634 bool StoredScalar = false; 1635 for (long i = 0; i < Kernel->n_array; i++) { 1636 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1637 continue; 1638 1639 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1640 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1641 isl_id_free(Id); 1642 1643 if (SAI->getNumberOfDimensions() > 0) { 1644 Arg++; 1645 continue; 1646 } 1647 1648 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1649 Arg++; 1650 continue; 1651 } 1652 1653 Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 1654 Value *ArgPtr = &*Arg; 1655 Type *TypePtr = SAI->getElementType()->getPointerTo(); 1656 Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); 1657 Value 
void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  auto *FN = Builder.GetInsertBlock()->getParent();
  auto Arg = FN->arg_begin();

  bool StoredScalar = false;
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Arg++;
      continue;
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Value *ArgPtr = &*Arg;
    Type *TypePtr = SAI->getElementType()->getPointerTo();
    Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
    Value *Val = Builder.CreateLoad(Alloca);
    Builder.CreateStore(Val, TypedArgPtr);
    StoredScalar = true;

    Arg++;
  }

  if (StoredScalar)
    // In case more than one thread contains scalar stores, the generated
    // code might be incorrect, if we only store at the end of the kernel.
    // To support this case we need to store these scalars back at each
    // memory store or at least before each kernel barrier.
    if (Kernel->n_block != 0 || Kernel->n_grid != 0)
      BuildSuccessful = false;
}

void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  for (int i = 0; i < Kernel->n_var; ++i) {
    struct ppcg_kernel_var &Var = Kernel->var[i];
    isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
    Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType();

    Type *ArrayTy = EleTy;
    SmallVector<const SCEV *, 4> Sizes;

    Sizes.push_back(nullptr);
    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
    }

    for (int j = Var.array->n_index - 1; j >= 0; --j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      ArrayTy = ArrayType::get(ArrayTy, Bound);
    }

    const ScopArrayInfo *SAI;
    Value *Allocation;
    if (Var.type == ppcg_access_shared) {
      auto GlobalVar = new GlobalVariable(
          *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
          nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
      GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
      GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));

      Allocation = GlobalVar;
    } else if (Var.type == ppcg_access_private) {
      Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
    } else {
      llvm_unreachable("unknown variable type");
    }
    SAI =
        S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
    Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
    IDToValue[Id] = Allocation;
    LocalArrays.push_back(Allocation);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAI;
  }
}

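// Illustrative sketch (assumed names and tile size, not from the original
// sources): for a 32x32 tile of floats, a shared-memory kernel variable
// created above is expected to look roughly like
//
//   @shared_MemRef_A = internal addrspace(3)
//       global [32 x [32 x float]] zeroinitializer, align 4
//
// while a private-memory variable becomes a plain alloca of the same array
// type in the kernel entry block.
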
void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));

  switch (Arch) {
  case GPUArch::NVPTX64:
    if (Runtime == GPURuntime::CUDA)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
    else if (Runtime == GPURuntime::OpenCL)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
    break;
  }

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  ScopDetection::markFunctionAsInvalid(FN);

  prepareKernelArguments(Kernel, FN);
  createKernelVariables(Kernel, FN);
  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple;

  switch (Arch) {
  case GPUArch::NVPTX64:
    switch (Runtime) {
    case GPURuntime::CUDA:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
      break;
    case GPURuntime::OpenCL:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
      break;
    }
    break;
  }

  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;

  std::string subtarget;

  switch (Arch) {
  case GPUArch::NVPTX64:
    subtarget = CudaVersion;
    break;
  }

  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str();
}

std::string GPUNodeBuilder::finalizeKernelFunction() {
  if (verifyModule(*GPUModule)) {
    BuildSuccessful = false;
    return "";
  }

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  // Optimize module.
  llvm::legacy::PassManager OptPasses;
  PassManagerBuilder PassBuilder;
  PassBuilder.OptLevel = 3;
  PassBuilder.SizeLevel = 0;
  PassBuilder.populateModulePassManager(OptPasses);
  OptPasses.run(*GPUModule);

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  GPUModule.release();
  KernelIDs.clear();

  return Assembly;
}

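// For orientation (a sketch, not authoritative): a kernel module that passes
// the verifier above is optimized at -O3 and then lowered by the NVPTX
// backend selected in createKernelASM. For a CUDA target the resulting
// string is PTX assembly, which typically starts with a header of the form
//
//   .version <ptx-version>
//   .target <subtarget>
//   .address_size 64
//
// followed by a '.visible .entry kernel_<id>(...)' definition for the kernel.
// The concrete PTX version depends on the LLVM version in use.
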
namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  GPURuntime Runtime = GPURuntime::CUDA;

  GPUArch Architecture = GPUArch::NVPTX64;

  /// The scop that is currently processed.
  Scop *S;

  LoopInfo *LI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  const DataLayout *DL;
  RegionInfo *RI;

  PPCGCodeGeneration() : ScopPass(ID) {}

  /// Construct compilation options for PPCG.
  ///
  /// @returns The compilation options.
  ppcg_options *createPPCGOptions() {
    auto DebugOptions =
        (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
    auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));

    DebugOptions->dump_schedule_constraints = false;
    DebugOptions->dump_schedule = false;
    DebugOptions->dump_final_schedule = false;
    DebugOptions->dump_sizes = false;
    DebugOptions->verbose = false;

    Options->debug = DebugOptions;

    Options->reschedule = true;
    Options->scale_tile_loops = false;
    Options->wrap = false;

    Options->non_negative_parameters = false;
    Options->ctx = nullptr;
    Options->sizes = nullptr;

    Options->tile_size = 32;

    Options->use_private_memory = PrivateMemory;
    Options->use_shared_memory = SharedMemory;
    Options->max_shared_memory = 48 * 1024;

    Options->target = PPCG_TARGET_CUDA;
    Options->openmp = false;
    Options->linearize_device_arrays = true;
    Options->live_range_reordering = false;

    Options->opencl_compiler_options = nullptr;
    Options->opencl_use_gpu = false;
    Options->opencl_n_include_file = 0;
    Options->opencl_include_files = nullptr;
    Options->opencl_print_kernel_types = false;
    Options->opencl_embed_kernel_code = false;

    Options->save_schedule_file = nullptr;
    Options->load_schedule_file = nullptr;

    return Options;
  }

  /// Get a tagged access relation containing all accesses of type @p AccessTy.
  ///
  /// Instead of a normal access of the form:
  ///
  ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// a tagged access has the form
  ///
  ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// where 'id' is an additional space that references the memory access that
  /// triggered the access.
  ///
  /// @param AccessTy The type of the memory accesses to collect.
  ///
  /// @return The relation describing all tagged memory accesses.
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }

  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) write accesses, tagged with the
  /// access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }

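  // Worked example (illustrative names only): given a statement Stmt[i] that
  // writes A[i] through a memory access with id __polly_ref_0, the plain
  // access relation
  //
  //   { Stmt[i] -> A[i] }
  //
  // is turned by getTaggedAccesses into the tagged form
  //
  //   { [Stmt[i] -> __polly_ref_0[]] -> A[i] }
  //
  // so that ppcg can refer back to the individual access that produced each
  // element of the relation.
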
  /// Get the set of all must-write accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation, each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns A map from collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop. From
  /// this initial data, the data-dependences in the PPCG scop are initialized.
  /// We do not use Polly's dependence analysis for now, to ensure we match
  /// the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = !Acc->isMayWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// @returns A linked-list of statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

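  // Illustrative example (assumed shapes, not taken from a real run): for a
  // statement Stmt[i] containing a single read of A[i + 1], getStmtAccesses
  // builds one gpu_stmt_access roughly equivalent to
  //
  //   read          = 1
  //   write         = 0
  //   access        = { Stmt[i] -> A[i + 1] }
  //   tagged_access = { [Stmt[i] -> __polly_ref_0[]] -> A[i + 1] }
  //   exact_write   = 1   (the access is not a may-write)
  //   n_index       = 1
  //
  // and getStatements links such entries into the list hanging off each
  // gpu_stmt.
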
  /// Derive the extent of an array.
  ///
  /// The extent of an array is the set of elements that are within the
  /// accessed array. For the inner dimensions, the extent constraints are
  /// 0 and the size of the corresponding array dimension. For the first
  /// (outermost) dimension, the extent constraints are the minimal and maximal
  /// subscript value for the first dimension.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    unsigned NumDims = Array->getNumberOfDimensions();
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    Accesses = isl_union_map_detect_equalities(Accesses);
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    AccessUSet = isl_union_set_coalesce(AccessUSet);
    AccessUSet = isl_union_set_detect_equalities(AccessUSet);
    AccessUSet = isl_union_set_coalesce(AccessUSet);

    if (isl_union_set_is_empty(AccessUSet)) {
      isl_union_set_free(AccessUSet);
      return isl_set_empty(Array->getSpace());
    }

    if (Array->getNumberOfDimensions() == 0) {
      isl_union_set_free(AccessUSet);
      return isl_set_universe(Array->getSpace());
    }

    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());

    isl_union_set_free(AccessUSet);
    isl_local_space *LS = isl_local_space_from_space(Array->getSpace());

    isl_pw_aff *Val =
        isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0));

    isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0);
    isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0);
    OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMin =
        isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId());
    OuterMax =
        isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId());

    isl_set *Extent = isl_set_universe(Array->getSpace());

    Extent = isl_set_intersect(
        Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val)));
    Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val));

    for (unsigned i = 1; i < NumDims; ++i)
      Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0);

    for (unsigned i = 0; i < NumDims; ++i) {
      isl_pw_aff *PwAff =
          const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i));

      // getDimensionSizePw may return NULL for the outermost dimension, as
      // its size is generally not known; only Fortran arrays provide an
      // explicit size for dimension zero.
      if (!PwAff) {
        assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension");
        continue;
      }

      isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(
          isl_local_space_from_space(Array->getSpace()), isl_dim_set, i));
      PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in,
                                  isl_pw_aff_dim(Val, isl_dim_in));
      PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in,
                                      isl_pw_aff_get_tuple_id(Val, isl_dim_in));
      auto *Set = isl_pw_aff_gt_set(PwAff, Val);
      Extent = isl_set_intersect(Set, Extent);
    }

    return Extent;
  }

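  // Worked example (hypothetical array, for illustration): for a 2D array
  // float A[*][64] that is only accessed as A[i][j] with 5 <= i <= N - 1 and
  // 0 <= j < 64, the extent computed above is expected to be of the form
  //
  //   [N] -> { A[o0, o1] : 5 <= o0 <= N - 1 and 0 <= o1 <= 63 }
  //
  // i.e. the outermost dimension is bounded by the minimal and maximal
  // subscript actually used, while inner dimensions get a lower bound of zero
  // and an upper bound derived from the known dimension size.
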
  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly
  /// from ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array     The polly array from which to take the information.
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      if (isl_set_is_empty(PPCGArray.extent)) {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        isl_local_space *LS = isl_local_space_from_space(
            isl_space_params(isl_set_get_space(Dom)));
        isl_set_free(Dom);
        isl_aff *Zero = isl_aff_zero_on_domain(LS);
        PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
      } else {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
        isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
        isl_set_free(Dom);
        Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
        isl_local_space *LS =
            isl_local_space_from_space(isl_set_get_space(Dom));
        isl_aff *One = isl_aff_zero_on_domain(LS);
        One = isl_aff_add_constant_si(One, 1);
        Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
        Bound = isl_pw_aff_gist(Bound, S->getContext());
        PPCGArray.bound[0] = Bound;
      }
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Array : S->arrays()) {
      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar =
          Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      PPCGArray.user = Array;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Array : S->arrays()) {
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

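  // Small example (array names invented): for a scop that uses a 2D array A
  // and a 1D array B, getArrayIdentity returns the union map
  //
  //   { A[i0, i1] -> A[i0, i1]; B[i0] -> B[i0] }
  //
  // which is handed to ppcg below as both the to_inner and the to_outer
  // mapping, as Polly does not model compound (struct-like) array elements.
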
  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new gpu program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    PPCGProg->may_persist = compute_may_persist(PPCGProg);

    return PPCGProg;
  }

  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P       The printer to print to.
  /// @param Options The printing options to use.
  /// @param Node    The node to print.
  /// @param User    A user pointer to carry additional data. This pointer is
  ///                expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print.
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

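  // For reference, a rough sketch of the code dump produced by printGPUTree
  // below when the DumpCode option is enabled (contents abbreviated and
  // invented; the actual text is whatever ppcg's C printer emits for the host
  // and kernel ASTs):
  //
  //   # host
  //   for (int c0 = 0; c0 <= 1023; c0 += 32)
  //     kernel0(...);
  //   # kernel0
  //   Stmt(32 * b0 + t0, c1);
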
  /// Print C code corresponding to the GPU code described by @p Tree.
  ///
  /// @param Tree     An AST describing GPU code.
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  //  1) Compute a new schedule for the program.
  //  2) Map the schedule to the GPU.
  //  3) Generate code for the new schedule.
  //
  // We do not use the Polly ScheduleOptimizer here, as the schedule optimizer
  // is mostly CPU specific. Instead, we use PPCG's GPU code generation
  // strategy directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set the scheduling strategy to the same strategy PPCG is using.
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);

    isl_schedule *Schedule = get_schedule(PPCGGen);

    int has_permutable = has_any_permutable_node(Schedule);

    if (!has_permutable || has_permutable < 0) {
      Schedule = isl_schedule_free(Schedule);
    } else {
      Schedule = map_to_device(PPCGGen, Schedule);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }

    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");

      printf("%s\n", isl_printer_get_str(P));
      isl_printer_free(P);
    }

    if (DumpCode) {
      printf("Code\n");
      printf("====\n");
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        printf("No code generated\n");
    }

    isl_schedule_free(Schedule);

    return PPCGGen;
  }

  /// Free a gpu_gen structure.
  ///
  /// @param PPCGGen The gpu_gen object to free.
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }

  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg is not freeing these options for us. To avoid leaks we do this
  /// ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Approximate the number of points in the set.
  ///
  /// This function returns an ast expression that overapproximates the number
  /// of points in an isl set through the rectangular hull surrounding this
  /// set.
  ///
  /// @param Set   The set to count.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  ///
  /// @returns An approximation of the number of points in the set.
  __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
                                             __isl_keep isl_ast_build *Build) {

    isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
    auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));

    isl_space *Space = isl_set_get_space(Set);
    Space = isl_space_params(Space);
    auto *Univ = isl_set_universe(Space);
    isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);

    for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
      isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
      isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
      isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
      DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
      auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
      Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
    }

    isl_set_free(Set);
    isl_pw_aff_free(OneAff);

    return Expr;
  }

  /// Approximate the number of dynamic instructions executed by a given
  /// statement.
  ///
  /// @param Stmt  The statement for which to compute the number of dynamic
  ///              instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  ///
  /// @returns An approximation of the number of dynamic instructions executed
  ///          by @p Stmt.
  __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
                                             __isl_keep isl_ast_build *Build) {
    auto Iterations = approxPointsInSet(Stmt.getDomain(), Build);

    long InstCount = 0;

    if (Stmt.isBlockStmt()) {
      auto *BB = Stmt.getBasicBlock();
      InstCount = std::distance(BB->begin(), BB->end());
    } else {
      auto *R = Stmt.getRegion();

      for (auto *BB : R->blocks()) {
        InstCount += std::distance(BB->begin(), BB->end());
      }
    }

    isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
    auto *InstExpr = isl_ast_expr_from_val(InstVal);
    return isl_ast_expr_mul(InstExpr, Iterations);
  }

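  // Worked example (numbers invented): for a statement with domain
  //
  //   [N] -> { Stmt[i, j] : 0 <= i < N and 0 <= j <= i }
  //
  // approxPointsInSet builds the rectangular overapproximation
  // ((N - 1) - 0 + 1) * ((N - 1) - 0 + 1) = N * N, even though the true point
  // count is N * (N + 1) / 2. If the statement's basic block contains, say,
  // 8 LLVM instructions, approxDynamicInst returns the expression 8 * N * N.
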
  /// Approximate the number of dynamic instructions executed in a scop.
  ///
  /// @param S     The scop for which to approximate dynamic instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  ///
  /// @returns An approximation of the number of dynamic instructions executed
  ///          in @p S.
  __isl_give isl_ast_expr *
  getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
    isl_ast_expr *Instructions;

    isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
    Instructions = isl_ast_expr_from_val(Zero);

    for (ScopStmt &Stmt : S) {
      isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
      Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
    }
    return Instructions;
  }

  /// Create a check that ensures sufficient compute in the scop.
  ///
  /// @param S     The scop for which to ensure sufficient compute.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  ///
  /// @returns An expression that evaluates to TRUE in case of sufficient
  ///          compute and to FALSE otherwise.
  __isl_give isl_ast_expr *
  createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
    auto Iterations = getNumberOfIterations(S, Build);
    auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
    auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
    return isl_ast_expr_ge(Iterations, MinComputeExpr);
  }

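  // Putting the two checks together (a sketch): generateCode below combines
  // ppcg's run-time condition with this heuristic, so the branch that selects
  // the GPU version is guarded by an expression of roughly the form
  //
  //   (<ppcg run-time condition>) &&
  //       (<approximated dynamic instructions> >= MinCompute)
  //
  // where MinCompute is the configurable threshold used above. Scops whose
  // approximated work falls below the threshold keep executing the original
  // host code.
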
  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU Program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important as the conditional
    // branch will guard the original scop from new induction variables that
    // the SCEVExpander may introduce while code generating the parameters and
    // which may introduce scalar dependences that prevent us from correctly
    // code generating this scop.
    BasicBlock *StartBlock =
        executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
                               StartBlock, Prog, Runtime, Architecture);

    // TODO: Handle LICM
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());

    isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
    isl_ast_expr *Condition = IslAst::buildRunCondition(*S, Build);
    isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
    Condition = isl_ast_expr_and(Condition, SufficientCompute);
    isl_ast_build_free(Build);

    Value *RTC = NodeBuilder.createRTC(Condition);
    Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

    Builder.SetInsertPoint(&*StartBlock->begin());

    NodeBuilder.initializeAfterRTH();
    NodeBuilder.create(Root);
    NodeBuilder.finalize();

    // In case a sequential kernel has more surrounding loops than any
    // parallel kernel, the SCoP is probably mostly sequential. Hence, there
    // is no point in running it on a GPU.
    if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
      SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());

    if (!NodeBuilder.BuildSuccessful)
      SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());
  }

  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support scops with invariant loads.
    if (S->hasInvariantAccesses())
      return false;

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    if (PPCGGen->tree)
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetectionWrapperPass>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<ScopDetectionWrapperPass>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
} // namespace

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
  generator->Runtime = Runtime;
  generator->Architecture = Arch;
  return generator;
}

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)