1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg 11 // GPU mapping strategy. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "polly/CodeGen/IslAst.h" 16 #include "polly/CodeGen/IslNodeBuilder.h" 17 #include "polly/CodeGen/Utils.h" 18 #include "polly/DependenceInfo.h" 19 #include "polly/LinkAllPasses.h" 20 #include "polly/Options.h" 21 #include "polly/ScopDetection.h" 22 #include "polly/ScopInfo.h" 23 #include "polly/Support/SCEVValidator.h" 24 #include "llvm/ADT/PostOrderIterator.h" 25 #include "llvm/Analysis/AliasAnalysis.h" 26 #include "llvm/Analysis/BasicAliasAnalysis.h" 27 #include "llvm/Analysis/GlobalsModRef.h" 28 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 29 #include "llvm/Analysis/TargetLibraryInfo.h" 30 #include "llvm/Analysis/TargetTransformInfo.h" 31 #include "llvm/IR/LegacyPassManager.h" 32 #include "llvm/IR/Verifier.h" 33 #include "llvm/Support/TargetRegistry.h" 34 #include "llvm/Support/TargetSelect.h" 35 #include "llvm/Target/TargetMachine.h" 36 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 37 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 38 39 #include "isl/union_map.h" 40 41 extern "C" { 42 #include "ppcg/cuda.h" 43 #include "ppcg/gpu.h" 44 #include "ppcg/gpu_print.h" 45 #include "ppcg/ppcg.h" 46 #include "ppcg/schedule.h" 47 } 48 49 #include "llvm/Support/Debug.h" 50 51 using namespace polly; 52 using namespace llvm; 53 54 #define DEBUG_TYPE "polly-codegen-ppcg" 55 56 static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule", 57 cl::desc("Dump the computed GPU Schedule"), 58 cl::Hidden, cl::init(false), cl::ZeroOrMore, 59 cl::cat(PollyCategory)); 60 61 static cl::opt<bool> 62 DumpCode("polly-acc-dump-code", 63 cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, 64 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 65 66 static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir", 67 cl::desc("Dump the kernel LLVM-IR"), 68 cl::Hidden, cl::init(false), cl::ZeroOrMore, 69 cl::cat(PollyCategory)); 70 71 static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm", 72 cl::desc("Dump the kernel assembly code"), 73 cl::Hidden, cl::init(false), cl::ZeroOrMore, 74 cl::cat(PollyCategory)); 75 76 static cl::opt<bool> FastMath("polly-acc-fastmath", 77 cl::desc("Allow unsafe math optimizations"), 78 cl::Hidden, cl::init(false), cl::ZeroOrMore, 79 cl::cat(PollyCategory)); 80 static cl::opt<bool> SharedMemory("polly-acc-use-shared", 81 cl::desc("Use shared memory"), cl::Hidden, 82 cl::init(false), cl::ZeroOrMore, 83 cl::cat(PollyCategory)); 84 static cl::opt<bool> PrivateMemory("polly-acc-use-private", 85 cl::desc("Use private memory"), cl::Hidden, 86 cl::init(false), cl::ZeroOrMore, 87 cl::cat(PollyCategory)); 88 89 static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory", 90 cl::desc("Generate Host kernel code assuming" 91 " that all memory has been" 92 " declared as managed memory"), 93 cl::Hidden, cl::init(false), cl::ZeroOrMore, 94 cl::cat(PollyCategory)); 95 96 static cl::opt<std::string> 97 CudaVersion("polly-acc-cuda-version", 98 cl::desc("The CUDA version to compile for"), cl::Hidden, 99 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); 100 101 static cl::opt<int> 102 MinCompute("polly-acc-mincompute", 103 cl::desc("Minimal number of compute statements to run on GPU."), 104 cl::Hidden, cl::init(10 * 512 * 512)); 105 106 /// Create the ast expressions for a ScopStmt. 107 /// 108 /// This function is a callback for to generate the ast expressions for each 109 /// of the scheduled ScopStmts. 110 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 111 void *StmtT, isl_ast_build *Build, 112 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 113 isl_id *Id, void *User), 114 void *UserIndex, 115 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 116 void *UserExpr) { 117 118 ScopStmt *Stmt = (ScopStmt *)StmtT; 119 120 isl_ctx *Ctx; 121 122 if (!Stmt || !Build) 123 return NULL; 124 125 Ctx = isl_ast_build_get_ctx(Build); 126 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 127 128 for (MemoryAccess *Acc : *Stmt) { 129 isl_map *AddrFunc = Acc->getAddressFunction(); 130 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 131 isl_id *RefId = Acc->getId(); 132 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 133 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 134 MPA = isl_multi_pw_aff_coalesce(MPA); 135 MPA = FunctionIndex(MPA, RefId, UserIndex); 136 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 137 Access = FunctionExpr(Access, RefId, UserExpr); 138 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 139 } 140 141 return RefToExpr; 142 } 143 144 /// Generate code for a GPU specific isl AST. 145 /// 146 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 147 /// generates code for general-prupose AST nodes, with special functionality 148 /// for generating GPU specific user nodes. 149 /// 150 /// @see GPUNodeBuilder::createUser 151 class GPUNodeBuilder : public IslNodeBuilder { 152 public: 153 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, 154 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 155 DominatorTree &DT, Scop &S, BasicBlock *StartBlock, 156 gpu_prog *Prog) 157 : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), 158 Prog(Prog) { 159 getExprBuilder().setIDToSAI(&IDToSAI); 160 } 161 162 /// Create after-run-time-check initialization code. 163 void initializeAfterRTH(); 164 165 /// Finalize the generated scop. 166 virtual void finalize(); 167 168 /// Track if the full build process was successful. 169 /// 170 /// This value is set to false, if throughout the build process an error 171 /// occurred which prevents us from generating valid GPU code. 172 bool BuildSuccessful = true; 173 174 /// The maximal number of loops surrounding a sequential kernel. 175 unsigned DeepestSequential = 0; 176 177 /// The maximal number of loops surrounding a parallel kernel. 178 unsigned DeepestParallel = 0; 179 180 private: 181 /// A vector of array base pointers for which a new ScopArrayInfo was created. 182 /// 183 /// This vector is used to delete the ScopArrayInfo when it is not needed any 184 /// more. 185 std::vector<Value *> LocalArrays; 186 187 /// A map from ScopArrays to their corresponding device allocations. 188 std::map<ScopArrayInfo *, Value *> DeviceAllocations; 189 190 /// The current GPU context. 191 Value *GPUContext; 192 193 /// The set of isl_ids allocated in the kernel 194 std::vector<isl_id *> KernelIds; 195 196 /// A module containing GPU code. 197 /// 198 /// This pointer is only set in case we are currently generating GPU code. 199 std::unique_ptr<Module> GPUModule; 200 201 /// The GPU program we generate code for. 202 gpu_prog *Prog; 203 204 /// Class to free isl_ids. 205 class IslIdDeleter { 206 public: 207 void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 208 }; 209 210 /// A set containing all isl_ids allocated in a GPU kernel. 211 /// 212 /// By releasing this set all isl_ids will be freed. 213 std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 214 215 IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 216 217 /// Create code for user-defined AST nodes. 218 /// 219 /// These AST nodes can be of type: 220 /// 221 /// - ScopStmt: A computational statement (TODO) 222 /// - Kernel: A GPU kernel call (TODO) 223 /// - Data-Transfer: A GPU <-> CPU data-transfer 224 /// - In-kernel synchronization 225 /// - In-kernel memory copy statement 226 /// 227 /// @param UserStmt The ast node to generate code for. 228 virtual void createUser(__isl_take isl_ast_node *UserStmt); 229 230 enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; 231 232 /// Create code for a data transfer statement 233 /// 234 /// @param TransferStmt The data transfer statement. 235 /// @param Direction The direction in which to transfer data. 236 void createDataTransfer(__isl_take isl_ast_node *TransferStmt, 237 enum DataDirection Direction); 238 239 /// Find llvm::Values referenced in GPU kernel. 240 /// 241 /// @param Kernel The kernel to scan for llvm::Values 242 /// 243 /// @returns A set of values referenced by the kernel. 244 SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); 245 246 /// Compute the sizes of the execution grid for a given kernel. 247 /// 248 /// @param Kernel The kernel to compute grid sizes for. 249 /// 250 /// @returns A tuple with grid sizes for X and Y dimension 251 std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel); 252 253 /// Creates a array that can be sent to the kernel on the device using a 254 /// host pointer. This is required for managed memory, when we directly send 255 /// host pointers to the device. 256 /// \note 257 /// This is to be used only with managed memory 258 Value *getOrCreateManagedDeviceArray(gpu_array_info *Array, 259 ScopArrayInfo *ArrayInfo); 260 261 /// Compute the sizes of the thread blocks for a given kernel. 262 /// 263 /// @param Kernel The kernel to compute thread block sizes for. 264 /// 265 /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. 266 std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel); 267 268 /// Create kernel launch parameters. 269 /// 270 /// @param Kernel The kernel to create parameters for. 271 /// @param F The kernel function that has been created. 272 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 273 /// 274 /// @returns A stack allocated array with pointers to the parameter 275 /// values that are passed to the kernel. 276 Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, 277 SetVector<Value *> SubtreeValues); 278 279 /// Create declarations for kernel variable. 280 /// 281 /// This includes shared memory declarations. 282 /// 283 /// @param Kernel The kernel definition to create variables for. 284 /// @param FN The function into which to generate the variables. 285 void createKernelVariables(ppcg_kernel *Kernel, Function *FN); 286 287 /// Add CUDA annotations to module. 288 /// 289 /// Add a set of CUDA annotations that declares the maximal block dimensions 290 /// that will be used to execute the CUDA kernel. This allows the NVIDIA 291 /// PTX compiler to bound the number of allocated registers to ensure the 292 /// resulting kernel is known to run with up to as many block dimensions 293 /// as specified here. 294 /// 295 /// @param M The module to add the annotations to. 296 /// @param BlockDimX The size of block dimension X. 297 /// @param BlockDimY The size of block dimension Y. 298 /// @param BlockDimZ The size of block dimension Z. 299 void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, 300 Value *BlockDimZ); 301 302 /// Create GPU kernel. 303 /// 304 /// Code generate the kernel described by @p KernelStmt. 305 /// 306 /// @param KernelStmt The ast node to generate kernel code for. 307 void createKernel(__isl_take isl_ast_node *KernelStmt); 308 309 /// Generate code that computes the size of an array. 310 /// 311 /// @param Array The array for which to compute a size. 312 Value *getArraySize(gpu_array_info *Array); 313 314 /// Generate code to compute the minimal offset at which an array is accessed. 315 /// 316 /// The offset of an array is the minimal array location accessed in a scop. 317 /// 318 /// Example: 319 /// 320 /// for (long i = 0; i < 100; i++) 321 /// A[i + 42] += ... 322 /// 323 /// getArrayOffset(A) results in 42. 324 /// 325 /// @param Array The array for which to compute the offset. 326 /// @returns An llvm::Value that contains the offset of the array. 327 Value *getArrayOffset(gpu_array_info *Array); 328 329 /// Prepare the kernel arguments for kernel code generation 330 /// 331 /// @param Kernel The kernel to generate code for. 332 /// @param FN The function created for the kernel. 333 void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); 334 335 /// Create kernel function. 336 /// 337 /// Create a kernel function located in a newly created module that can serve 338 /// as target for device code generation. Set the Builder to point to the 339 /// start block of this newly created function. 340 /// 341 /// @param Kernel The kernel to generate code for. 342 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 343 void createKernelFunction(ppcg_kernel *Kernel, 344 SetVector<Value *> &SubtreeValues); 345 346 /// Create the declaration of a kernel function. 347 /// 348 /// The kernel function takes as arguments: 349 /// 350 /// - One i8 pointer for each external array reference used in the kernel. 351 /// - Host iterators 352 /// - Parameters 353 /// - Other LLVM Value references (TODO) 354 /// 355 /// @param Kernel The kernel to generate the function declaration for. 356 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 357 /// 358 /// @returns The newly declared function. 359 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 360 SetVector<Value *> &SubtreeValues); 361 362 /// Insert intrinsic functions to obtain thread and block ids. 363 /// 364 /// @param The kernel to generate the intrinsic functions for. 365 void insertKernelIntrinsics(ppcg_kernel *Kernel); 366 367 /// Create a global-to-shared or shared-to-global copy statement. 368 /// 369 /// @param CopyStmt The copy statement to generate code for 370 void createKernelCopy(ppcg_kernel_stmt *CopyStmt); 371 372 /// Create code for a ScopStmt called in @p Expr. 373 /// 374 /// @param Expr The expression containing the call. 375 /// @param KernelStmt The kernel statement referenced in the call. 376 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 377 378 /// Create an in-kernel synchronization call. 379 void createKernelSync(); 380 381 /// Create a PTX assembly string for the current GPU kernel. 382 /// 383 /// @returns A string containing the corresponding PTX assembly code. 384 std::string createKernelASM(); 385 386 /// Remove references from the dominator tree to the kernel function @p F. 387 /// 388 /// @param F The function to remove references to. 389 void clearDominators(Function *F); 390 391 /// Remove references from scalar evolution to the kernel function @p F. 392 /// 393 /// @param F The function to remove references to. 394 void clearScalarEvolution(Function *F); 395 396 /// Remove references from loop info to the kernel function @p F. 397 /// 398 /// @param F The function to remove references to. 399 void clearLoops(Function *F); 400 401 /// Finalize the generation of the kernel function. 402 /// 403 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 404 /// dump its IR to stderr. 405 /// 406 /// @returns The Assembly string of the kernel. 407 std::string finalizeKernelFunction(); 408 409 /// Finalize the generation of the kernel arguments. 410 /// 411 /// This function ensures that not-read-only scalars used in a kernel are 412 /// stored back to the global memory location they ared backed up with before 413 /// the kernel terminates. 414 /// 415 /// @params Kernel The kernel to finalize kernel arguments for. 416 void finalizeKernelArguments(ppcg_kernel *Kernel); 417 418 /// Create code that allocates memory to store arrays on device. 419 void allocateDeviceArrays(); 420 421 /// Free all allocated device arrays. 422 void freeDeviceArrays(); 423 424 /// Create a call to initialize the GPU context. 425 /// 426 /// @returns A pointer to the newly initialized context. 427 Value *createCallInitContext(); 428 429 /// Create a call to get the device pointer for a kernel allocation. 430 /// 431 /// @param Allocation The Polly GPU allocation 432 /// 433 /// @returns The device parameter corresponding to this allocation. 434 Value *createCallGetDevicePtr(Value *Allocation); 435 436 /// Create a call to free the GPU context. 437 /// 438 /// @param Context A pointer to an initialized GPU context. 439 void createCallFreeContext(Value *Context); 440 441 /// Create a call to allocate memory on the device. 442 /// 443 /// @param Size The size of memory to allocate 444 /// 445 /// @returns A pointer that identifies this allocation. 446 Value *createCallAllocateMemoryForDevice(Value *Size); 447 448 /// Create a call to free a device array. 449 /// 450 /// @param Array The device array to free. 451 void createCallFreeDeviceMemory(Value *Array); 452 453 /// Create a call to copy data from host to device. 454 /// 455 /// @param HostPtr A pointer to the host data that should be copied. 456 /// @param DevicePtr A device pointer specifying the location to copy to. 457 void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr, 458 Value *Size); 459 460 /// Create a call to copy data from device to host. 461 /// 462 /// @param DevicePtr A pointer to the device data that should be copied. 463 /// @param HostPtr A host pointer specifying the location to copy to. 464 void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, 465 Value *Size); 466 467 /// Create a call to synchronize Host & Device. 468 /// \note 469 /// This is to be used only with managed memory. 470 void createCallSynchronizeDevice(); 471 472 /// Create a call to get a kernel from an assembly string. 473 /// 474 /// @param Buffer The string describing the kernel. 475 /// @param Entry The name of the kernel function to call. 476 /// 477 /// @returns A pointer to a kernel object 478 Value *createCallGetKernel(Value *Buffer, Value *Entry); 479 480 /// Create a call to free a GPU kernel. 481 /// 482 /// @param GPUKernel THe kernel to free. 483 void createCallFreeKernel(Value *GPUKernel); 484 485 /// Create a call to launch a GPU kernel. 486 /// 487 /// @param GPUKernel The kernel to launch. 488 /// @param GridDimX The size of the first grid dimension. 489 /// @param GridDimY The size of the second grid dimension. 490 /// @param GridBlockX The size of the first block dimension. 491 /// @param GridBlockY The size of the second block dimension. 492 /// @param GridBlockZ The size of the third block dimension. 493 /// @param Paramters A pointer to an array that contains itself pointers to 494 /// the parameter values passed for each kernel argument. 495 void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 496 Value *GridDimY, Value *BlockDimX, 497 Value *BlockDimY, Value *BlockDimZ, 498 Value *Parameters); 499 }; 500 501 void GPUNodeBuilder::initializeAfterRTH() { 502 BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(), 503 &*Builder.GetInsertPoint(), &DT, &LI); 504 NewBB->setName("polly.acc.initialize"); 505 Builder.SetInsertPoint(&NewBB->front()); 506 507 GPUContext = createCallInitContext(); 508 509 if (!ManagedMemory) 510 allocateDeviceArrays(); 511 } 512 513 void GPUNodeBuilder::finalize() { 514 if (!ManagedMemory) 515 freeDeviceArrays(); 516 517 createCallFreeContext(GPUContext); 518 IslNodeBuilder::finalize(); 519 } 520 521 void GPUNodeBuilder::allocateDeviceArrays() { 522 assert(!ManagedMemory && "Managed memory will directly send host pointers " 523 "to the kernel. There is no need for device arrays"); 524 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 525 526 for (int i = 0; i < Prog->n_array; ++i) { 527 gpu_array_info *Array = &Prog->array[i]; 528 auto *ScopArray = (ScopArrayInfo *)Array->user; 529 std::string DevArrayName("p_dev_array_"); 530 DevArrayName.append(Array->name); 531 532 Value *ArraySize = getArraySize(Array); 533 Value *Offset = getArrayOffset(Array); 534 if (Offset) 535 ArraySize = Builder.CreateSub( 536 ArraySize, 537 Builder.CreateMul(Offset, 538 Builder.getInt64(ScopArray->getElemSizeInBytes()))); 539 Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 540 DevArray->setName(DevArrayName); 541 DeviceAllocations[ScopArray] = DevArray; 542 } 543 544 isl_ast_build_free(Build); 545 } 546 547 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, 548 Value *BlockDimY, Value *BlockDimZ) { 549 auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); 550 551 for (auto &F : *M) { 552 if (F.getCallingConv() != CallingConv::PTX_Kernel) 553 continue; 554 555 Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; 556 557 Metadata *Elements[] = { 558 ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), 559 ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), 560 ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), 561 ValueAsMetadata::get(V[2]), 562 }; 563 MDNode *Node = MDNode::get(M->getContext(), Elements); 564 AnnotationNode->addOperand(Node); 565 } 566 } 567 568 void GPUNodeBuilder::freeDeviceArrays() { 569 assert(!ManagedMemory && "Managed memory does not use device arrays"); 570 for (auto &Array : DeviceAllocations) 571 createCallFreeDeviceMemory(Array.second); 572 } 573 574 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { 575 const char *Name = "polly_getKernel"; 576 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 577 Function *F = M->getFunction(Name); 578 579 // If F is not available, declare it. 580 if (!F) { 581 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 582 std::vector<Type *> Args; 583 Args.push_back(Builder.getInt8PtrTy()); 584 Args.push_back(Builder.getInt8PtrTy()); 585 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 586 F = Function::Create(Ty, Linkage, Name, M); 587 } 588 589 return Builder.CreateCall(F, {Buffer, Entry}); 590 } 591 592 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { 593 const char *Name = "polly_getDevicePtr"; 594 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 595 Function *F = M->getFunction(Name); 596 597 // If F is not available, declare it. 598 if (!F) { 599 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 600 std::vector<Type *> Args; 601 Args.push_back(Builder.getInt8PtrTy()); 602 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 603 F = Function::Create(Ty, Linkage, Name, M); 604 } 605 606 return Builder.CreateCall(F, {Allocation}); 607 } 608 609 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 610 Value *GridDimY, Value *BlockDimX, 611 Value *BlockDimY, Value *BlockDimZ, 612 Value *Parameters) { 613 const char *Name = "polly_launchKernel"; 614 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 615 Function *F = M->getFunction(Name); 616 617 // If F is not available, declare it. 618 if (!F) { 619 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 620 std::vector<Type *> Args; 621 Args.push_back(Builder.getInt8PtrTy()); 622 Args.push_back(Builder.getInt32Ty()); 623 Args.push_back(Builder.getInt32Ty()); 624 Args.push_back(Builder.getInt32Ty()); 625 Args.push_back(Builder.getInt32Ty()); 626 Args.push_back(Builder.getInt32Ty()); 627 Args.push_back(Builder.getInt8PtrTy()); 628 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 629 F = Function::Create(Ty, Linkage, Name, M); 630 } 631 632 Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 633 BlockDimZ, Parameters}); 634 } 635 636 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { 637 const char *Name = "polly_freeKernel"; 638 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 639 Function *F = M->getFunction(Name); 640 641 // If F is not available, declare it. 642 if (!F) { 643 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 644 std::vector<Type *> Args; 645 Args.push_back(Builder.getInt8PtrTy()); 646 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 647 F = Function::Create(Ty, Linkage, Name, M); 648 } 649 650 Builder.CreateCall(F, {GPUKernel}); 651 } 652 653 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 654 assert(!ManagedMemory && "Managed memory does not allocate or free memory " 655 "for device"); 656 const char *Name = "polly_freeDeviceMemory"; 657 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 658 Function *F = M->getFunction(Name); 659 660 // If F is not available, declare it. 661 if (!F) { 662 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 663 std::vector<Type *> Args; 664 Args.push_back(Builder.getInt8PtrTy()); 665 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 666 F = Function::Create(Ty, Linkage, Name, M); 667 } 668 669 Builder.CreateCall(F, {Array}); 670 } 671 672 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 673 assert(!ManagedMemory && "Managed memory does not allocate or free memory " 674 "for device"); 675 const char *Name = "polly_allocateMemoryForDevice"; 676 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 677 Function *F = M->getFunction(Name); 678 679 // If F is not available, declare it. 680 if (!F) { 681 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 682 std::vector<Type *> Args; 683 Args.push_back(Builder.getInt64Ty()); 684 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 685 F = Function::Create(Ty, Linkage, Name, M); 686 } 687 688 return Builder.CreateCall(F, {Size}); 689 } 690 691 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, 692 Value *DeviceData, 693 Value *Size) { 694 assert(!ManagedMemory && "Managed memory does not transfer memory between " 695 "device and host"); 696 const char *Name = "polly_copyFromHostToDevice"; 697 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 698 Function *F = M->getFunction(Name); 699 700 // If F is not available, declare it. 701 if (!F) { 702 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 703 std::vector<Type *> Args; 704 Args.push_back(Builder.getInt8PtrTy()); 705 Args.push_back(Builder.getInt8PtrTy()); 706 Args.push_back(Builder.getInt64Ty()); 707 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 708 F = Function::Create(Ty, Linkage, Name, M); 709 } 710 711 Builder.CreateCall(F, {HostData, DeviceData, Size}); 712 } 713 714 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, 715 Value *HostData, 716 Value *Size) { 717 assert(!ManagedMemory && "Managed memory does not transfer memory between " 718 "device and host"); 719 const char *Name = "polly_copyFromDeviceToHost"; 720 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 721 Function *F = M->getFunction(Name); 722 723 // If F is not available, declare it. 724 if (!F) { 725 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 726 std::vector<Type *> Args; 727 Args.push_back(Builder.getInt8PtrTy()); 728 Args.push_back(Builder.getInt8PtrTy()); 729 Args.push_back(Builder.getInt64Ty()); 730 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 731 F = Function::Create(Ty, Linkage, Name, M); 732 } 733 734 Builder.CreateCall(F, {DeviceData, HostData, Size}); 735 } 736 737 void GPUNodeBuilder::createCallSynchronizeDevice() { 738 assert(ManagedMemory && "explicit synchronization is only necessary for " 739 "managed memory"); 740 const char *Name = "polly_synchronizeDevice"; 741 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 742 Function *F = M->getFunction(Name); 743 744 // If F is not available, declare it. 745 if (!F) { 746 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 747 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); 748 F = Function::Create(Ty, Linkage, Name, M); 749 } 750 751 Builder.CreateCall(F); 752 } 753 754 Value *GPUNodeBuilder::createCallInitContext() { 755 const char *Name = "polly_initContext"; 756 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 757 Function *F = M->getFunction(Name); 758 759 // If F is not available, declare it. 760 if (!F) { 761 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 762 std::vector<Type *> Args; 763 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 764 F = Function::Create(Ty, Linkage, Name, M); 765 } 766 767 return Builder.CreateCall(F, {}); 768 } 769 770 void GPUNodeBuilder::createCallFreeContext(Value *Context) { 771 const char *Name = "polly_freeContext"; 772 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 773 Function *F = M->getFunction(Name); 774 775 // If F is not available, declare it. 776 if (!F) { 777 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 778 std::vector<Type *> Args; 779 Args.push_back(Builder.getInt8PtrTy()); 780 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 781 F = Function::Create(Ty, Linkage, Name, M); 782 } 783 784 Builder.CreateCall(F, {Context}); 785 } 786 787 /// Check if one string is a prefix of another. 788 /// 789 /// @param String The string in which to look for the prefix. 790 /// @param Prefix The prefix to look for. 791 static bool isPrefix(std::string String, std::string Prefix) { 792 return String.find(Prefix) == 0; 793 } 794 795 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 796 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 797 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 798 799 if (!gpu_array_is_scalar(Array)) { 800 auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 801 isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 802 803 for (unsigned int i = 1; i < Array->n_index; i++) { 804 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 805 isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 806 Res = isl_ast_expr_mul(Res, Expr); 807 } 808 809 Value *NumElements = ExprBuilder.create(Res); 810 if (NumElements->getType() != ArraySize->getType()) 811 NumElements = Builder.CreateSExt(NumElements, ArraySize->getType()); 812 ArraySize = Builder.CreateMul(ArraySize, NumElements); 813 } 814 isl_ast_build_free(Build); 815 return ArraySize; 816 } 817 818 Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { 819 if (gpu_array_is_scalar(Array)) 820 return nullptr; 821 822 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 823 824 isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent)); 825 826 isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min)); 827 828 for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) 829 ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0); 830 831 if (isl_set_is_subset(Min, ZeroSet)) { 832 isl_set_free(Min); 833 isl_set_free(ZeroSet); 834 isl_ast_build_free(Build); 835 return nullptr; 836 } 837 isl_set_free(ZeroSet); 838 839 isl_ast_expr *Result = 840 isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0)); 841 842 for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) { 843 if (i > 0) { 844 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]); 845 isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 846 Result = isl_ast_expr_mul(Result, BExpr); 847 } 848 isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i); 849 isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin); 850 Result = isl_ast_expr_add(Result, MExpr); 851 } 852 853 Value *ResultValue = ExprBuilder.create(Result); 854 isl_set_free(Min); 855 isl_ast_build_free(Build); 856 857 return ResultValue; 858 } 859 860 Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array, 861 ScopArrayInfo *ArrayInfo) { 862 863 assert(ManagedMemory && "Only used when you wish to get a host " 864 "pointer for sending data to the kernel, " 865 "with managed memory"); 866 std::map<ScopArrayInfo *, Value *>::iterator it; 867 if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) { 868 return it->second; 869 } else { 870 Value *HostPtr; 871 872 if (gpu_array_is_scalar(Array)) 873 HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo); 874 else 875 HostPtr = ArrayInfo->getBasePtr(); 876 877 Value *Offset = getArrayOffset(Array); 878 if (Offset) { 879 HostPtr = Builder.CreatePointerCast( 880 HostPtr, ArrayInfo->getElementType()->getPointerTo()); 881 HostPtr = Builder.CreateGEP(HostPtr, Offset); 882 } 883 884 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 885 DeviceAllocations[ArrayInfo] = HostPtr; 886 return HostPtr; 887 } 888 } 889 890 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 891 enum DataDirection Direction) { 892 assert(!ManagedMemory && "Managed memory needs no data transfers"); 893 isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); 894 isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 895 isl_id *Id = isl_ast_expr_get_id(Arg); 896 auto Array = (gpu_array_info *)isl_id_get_user(Id); 897 auto ScopArray = (ScopArrayInfo *)(Array->user); 898 899 Value *Size = getArraySize(Array); 900 Value *Offset = getArrayOffset(Array); 901 Value *DevPtr = DeviceAllocations[ScopArray]; 902 903 Value *HostPtr; 904 905 if (gpu_array_is_scalar(Array)) 906 HostPtr = BlockGen.getOrCreateAlloca(ScopArray); 907 else 908 HostPtr = ScopArray->getBasePtr(); 909 910 if (Offset) { 911 HostPtr = Builder.CreatePointerCast( 912 HostPtr, ScopArray->getElementType()->getPointerTo()); 913 HostPtr = Builder.CreateGEP(HostPtr, Offset); 914 } 915 916 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 917 918 if (Offset) { 919 Size = Builder.CreateSub( 920 Size, Builder.CreateMul( 921 Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); 922 } 923 924 if (Direction == HOST_TO_DEVICE) 925 createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 926 else 927 createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 928 929 isl_id_free(Id); 930 isl_ast_expr_free(Arg); 931 isl_ast_expr_free(Expr); 932 isl_ast_node_free(TransferStmt); 933 } 934 935 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 936 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 937 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 938 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 939 isl_id_free(Id); 940 isl_ast_expr_free(StmtExpr); 941 942 const char *Str = isl_id_get_name(Id); 943 if (!strcmp(Str, "kernel")) { 944 createKernel(UserStmt); 945 isl_ast_expr_free(Expr); 946 return; 947 } 948 949 if (isPrefix(Str, "to_device")) { 950 if (!ManagedMemory) 951 createDataTransfer(UserStmt, HOST_TO_DEVICE); 952 else 953 isl_ast_node_free(UserStmt); 954 955 isl_ast_expr_free(Expr); 956 return; 957 } 958 959 if (isPrefix(Str, "from_device")) { 960 if (!ManagedMemory) { 961 createDataTransfer(UserStmt, DEVICE_TO_HOST); 962 } else { 963 createCallSynchronizeDevice(); 964 isl_ast_node_free(UserStmt); 965 } 966 isl_ast_expr_free(Expr); 967 return; 968 } 969 970 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 971 struct ppcg_kernel_stmt *KernelStmt = 972 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 973 isl_id_free(Anno); 974 975 switch (KernelStmt->type) { 976 case ppcg_kernel_domain: 977 createScopStmt(Expr, KernelStmt); 978 isl_ast_node_free(UserStmt); 979 return; 980 case ppcg_kernel_copy: 981 createKernelCopy(KernelStmt); 982 isl_ast_expr_free(Expr); 983 isl_ast_node_free(UserStmt); 984 return; 985 case ppcg_kernel_sync: 986 createKernelSync(); 987 isl_ast_expr_free(Expr); 988 isl_ast_node_free(UserStmt); 989 return; 990 } 991 992 isl_ast_expr_free(Expr); 993 isl_ast_node_free(UserStmt); 994 return; 995 } 996 void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { 997 isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); 998 LocalIndex = isl_ast_expr_address_of(LocalIndex); 999 Value *LocalAddr = ExprBuilder.create(LocalIndex); 1000 isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index); 1001 Index = isl_ast_expr_address_of(Index); 1002 Value *GlobalAddr = ExprBuilder.create(Index); 1003 1004 if (KernelStmt->u.c.read) { 1005 LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read"); 1006 Builder.CreateStore(Load, LocalAddr); 1007 } else { 1008 LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write"); 1009 Builder.CreateStore(Load, GlobalAddr); 1010 } 1011 } 1012 1013 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 1014 ppcg_kernel_stmt *KernelStmt) { 1015 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1016 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 1017 1018 LoopToScevMapT LTS; 1019 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 1020 1021 createSubstitutions(Expr, Stmt, LTS); 1022 1023 if (Stmt->isBlockStmt()) 1024 BlockGen.copyStmt(*Stmt, LTS, Indexes); 1025 else 1026 RegionGen.copyStmt(*Stmt, LTS, Indexes); 1027 } 1028 1029 void GPUNodeBuilder::createKernelSync() { 1030 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1031 auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 1032 Builder.CreateCall(Sync, {}); 1033 } 1034 1035 /// Collect llvm::Values referenced from @p Node 1036 /// 1037 /// This function only applies to isl_ast_nodes that are user_nodes referring 1038 /// to a ScopStmt. All other node types are ignore. 1039 /// 1040 /// @param Node The node to collect references for. 1041 /// @param User A user pointer used as storage for the data that is collected. 1042 /// 1043 /// @returns isl_bool_true if data could be collected successfully. 1044 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 1045 if (isl_ast_node_get_type(Node) != isl_ast_node_user) 1046 return isl_bool_true; 1047 1048 isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 1049 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 1050 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 1051 const char *Str = isl_id_get_name(Id); 1052 isl_id_free(Id); 1053 isl_ast_expr_free(StmtExpr); 1054 isl_ast_expr_free(Expr); 1055 1056 if (!isPrefix(Str, "Stmt")) 1057 return isl_bool_true; 1058 1059 Id = isl_ast_node_get_annotation(Node); 1060 auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 1061 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1062 isl_id_free(Id); 1063 1064 addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); 1065 1066 return isl_bool_true; 1067 } 1068 1069 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 1070 SetVector<Value *> SubtreeValues; 1071 SetVector<const SCEV *> SCEVs; 1072 SetVector<const Loop *> Loops; 1073 SubtreeReferences References = { 1074 LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 1075 1076 for (const auto &I : IDToValue) 1077 SubtreeValues.insert(I.second); 1078 1079 isl_ast_node_foreach_descendant_top_down( 1080 Kernel->tree, collectReferencesInGPUStmt, &References); 1081 1082 for (const SCEV *Expr : SCEVs) 1083 findValues(Expr, SE, SubtreeValues); 1084 1085 for (auto &SAI : S.arrays()) 1086 SubtreeValues.remove(SAI->getBasePtr()); 1087 1088 isl_space *Space = S.getParamSpace(); 1089 for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 1090 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 1091 assert(IDToValue.count(Id)); 1092 Value *Val = IDToValue[Id]; 1093 SubtreeValues.remove(Val); 1094 isl_id_free(Id); 1095 } 1096 isl_space_free(Space); 1097 1098 for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 1099 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1100 assert(IDToValue.count(Id)); 1101 Value *Val = IDToValue[Id]; 1102 SubtreeValues.remove(Val); 1103 isl_id_free(Id); 1104 } 1105 1106 return SubtreeValues; 1107 } 1108 1109 void GPUNodeBuilder::clearDominators(Function *F) { 1110 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 1111 std::vector<BasicBlock *> Nodes; 1112 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 1113 Nodes.push_back(I->getBlock()); 1114 1115 for (BasicBlock *BB : Nodes) 1116 DT.eraseNode(BB); 1117 } 1118 1119 void GPUNodeBuilder::clearScalarEvolution(Function *F) { 1120 for (BasicBlock &BB : *F) { 1121 Loop *L = LI.getLoopFor(&BB); 1122 if (L) 1123 SE.forgetLoop(L); 1124 } 1125 } 1126 1127 void GPUNodeBuilder::clearLoops(Function *F) { 1128 for (BasicBlock &BB : *F) { 1129 Loop *L = LI.getLoopFor(&BB); 1130 if (L) 1131 SE.forgetLoop(L); 1132 LI.removeBlock(&BB); 1133 } 1134 } 1135 1136 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { 1137 std::vector<Value *> Sizes; 1138 isl_ast_build *Context = isl_ast_build_from_context(S.getContext()); 1139 1140 for (long i = 0; i < Kernel->n_grid; i++) { 1141 isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i); 1142 isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size); 1143 Value *Res = ExprBuilder.create(GridSize); 1144 Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 1145 Sizes.push_back(Res); 1146 } 1147 isl_ast_build_free(Context); 1148 1149 for (long i = Kernel->n_grid; i < 3; i++) 1150 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 1151 1152 return std::make_tuple(Sizes[0], Sizes[1]); 1153 } 1154 1155 std::tuple<Value *, Value *, Value *> 1156 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { 1157 std::vector<Value *> Sizes; 1158 1159 for (long i = 0; i < Kernel->n_block; i++) { 1160 Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); 1161 Sizes.push_back(Res); 1162 } 1163 1164 for (long i = Kernel->n_block; i < 3; i++) 1165 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 1166 1167 return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); 1168 } 1169 1170 Value * 1171 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, 1172 SetVector<Value *> SubtreeValues) { 1173 Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 1174 std::distance(F->arg_begin(), F->arg_end())); 1175 1176 BasicBlock *EntryBlock = 1177 &Builder.GetInsertBlock()->getParent()->getEntryBlock(); 1178 auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); 1179 std::string Launch = "polly_launch_" + std::to_string(Kernel->id); 1180 Instruction *Parameters = new AllocaInst( 1181 ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator()); 1182 1183 int Index = 0; 1184 for (long i = 0; i < Prog->n_array; i++) { 1185 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1186 continue; 1187 1188 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1189 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 1190 1191 Value *DevArray = nullptr; 1192 if (ManagedMemory) { 1193 DevArray = getOrCreateManagedDeviceArray( 1194 &Prog->array[i], const_cast<ScopArrayInfo *>(SAI)); 1195 } else { 1196 DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)]; 1197 DevArray = createCallGetDevicePtr(DevArray); 1198 } 1199 assert(DevArray != nullptr && "Array to be offloaded to device not " 1200 "initialized"); 1201 Value *Offset = getArrayOffset(&Prog->array[i]); 1202 1203 if (Offset) { 1204 DevArray = Builder.CreatePointerCast( 1205 DevArray, SAI->getElementType()->getPointerTo()); 1206 DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset)); 1207 DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); 1208 } 1209 Value *Slot = Builder.CreateGEP( 1210 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1211 1212 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1213 Value *ValPtr = nullptr; 1214 if (ManagedMemory) 1215 ValPtr = DevArray; 1216 else 1217 ValPtr = BlockGen.getOrCreateAlloca(SAI); 1218 1219 assert(ValPtr != nullptr && "ValPtr that should point to a valid object" 1220 " to be stored into Parameters"); 1221 Value *ValPtrCast = 1222 Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); 1223 Builder.CreateStore(ValPtrCast, Slot); 1224 } else { 1225 Instruction *Param = 1226 new AllocaInst(Builder.getInt8PtrTy(), AddressSpace, 1227 Launch + "_param_" + std::to_string(Index), 1228 EntryBlock->getTerminator()); 1229 Builder.CreateStore(DevArray, Param); 1230 Value *ParamTyped = 1231 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1232 Builder.CreateStore(ParamTyped, Slot); 1233 } 1234 Index++; 1235 } 1236 1237 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1238 1239 for (long i = 0; i < NumHostIters; i++) { 1240 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1241 Value *Val = IDToValue[Id]; 1242 isl_id_free(Id); 1243 Instruction *Param = 1244 new AllocaInst(Val->getType(), AddressSpace, 1245 Launch + "_param_" + std::to_string(Index), 1246 EntryBlock->getTerminator()); 1247 Builder.CreateStore(Val, Param); 1248 Value *Slot = Builder.CreateGEP( 1249 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1250 Value *ParamTyped = 1251 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1252 Builder.CreateStore(ParamTyped, Slot); 1253 Index++; 1254 } 1255 1256 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1257 1258 for (long i = 0; i < NumVars; i++) { 1259 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1260 Value *Val = IDToValue[Id]; 1261 isl_id_free(Id); 1262 Instruction *Param = 1263 new AllocaInst(Val->getType(), AddressSpace, 1264 Launch + "_param_" + std::to_string(Index), 1265 EntryBlock->getTerminator()); 1266 Builder.CreateStore(Val, Param); 1267 Value *Slot = Builder.CreateGEP( 1268 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1269 Value *ParamTyped = 1270 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1271 Builder.CreateStore(ParamTyped, Slot); 1272 Index++; 1273 } 1274 1275 for (auto Val : SubtreeValues) { 1276 Instruction *Param = 1277 new AllocaInst(Val->getType(), AddressSpace, 1278 Launch + "_param_" + std::to_string(Index), 1279 EntryBlock->getTerminator()); 1280 Builder.CreateStore(Val, Param); 1281 Value *Slot = Builder.CreateGEP( 1282 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1283 Value *ParamTyped = 1284 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1285 Builder.CreateStore(ParamTyped, Slot); 1286 Index++; 1287 } 1288 1289 auto Location = EntryBlock->getTerminator(); 1290 return new BitCastInst(Parameters, Builder.getInt8PtrTy(), 1291 Launch + "_params_i8ptr", Location); 1292 } 1293 1294 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 1295 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 1296 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 1297 isl_id_free(Id); 1298 isl_ast_node_free(KernelStmt); 1299 1300 if (Kernel->n_grid > 1) 1301 DeepestParallel = 1302 std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set)); 1303 else 1304 DeepestSequential = 1305 std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set)); 1306 1307 Value *BlockDimX, *BlockDimY, *BlockDimZ; 1308 std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); 1309 1310 SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); 1311 1312 assert(Kernel->tree && "Device AST of kernel node is empty"); 1313 1314 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 1315 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 1316 ValueMapT HostValueMap = ValueMap; 1317 BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; 1318 ScalarMap.clear(); 1319 1320 SetVector<const Loop *> Loops; 1321 1322 // Create for all loops we depend on values that contain the current loop 1323 // iteration. These values are necessary to generate code for SCEVs that 1324 // depend on such loops. As a result we need to pass them to the subfunction. 1325 for (const Loop *L : Loops) { 1326 const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 1327 SE.getUnknown(Builder.getInt64(1)), 1328 L, SCEV::FlagAnyWrap); 1329 Value *V = generateSCEV(OuterLIV); 1330 OutsideLoopIterations[L] = SE.getUnknown(V); 1331 SubtreeValues.insert(V); 1332 } 1333 1334 createKernelFunction(Kernel, SubtreeValues); 1335 1336 create(isl_ast_node_copy(Kernel->tree)); 1337 1338 finalizeKernelArguments(Kernel); 1339 Function *F = Builder.GetInsertBlock()->getParent(); 1340 addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); 1341 clearDominators(F); 1342 clearScalarEvolution(F); 1343 clearLoops(F); 1344 1345 IDToValue = HostIDs; 1346 1347 ValueMap = std::move(HostValueMap); 1348 ScalarMap = std::move(HostScalarMap); 1349 EscapeMap.clear(); 1350 IDToSAI.clear(); 1351 Annotator.resetAlternativeAliasBases(); 1352 for (auto &BasePtr : LocalArrays) 1353 S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array); 1354 LocalArrays.clear(); 1355 1356 std::string ASMString = finalizeKernelFunction(); 1357 Builder.SetInsertPoint(&HostInsertPoint); 1358 Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); 1359 1360 std::string Name = "kernel_" + std::to_string(Kernel->id); 1361 Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); 1362 Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); 1363 Value *GPUKernel = createCallGetKernel(KernelString, NameString); 1364 1365 Value *GridDimX, *GridDimY; 1366 std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); 1367 1368 createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 1369 BlockDimZ, Parameters); 1370 createCallFreeKernel(GPUKernel); 1371 1372 for (auto Id : KernelIds) 1373 isl_id_free(Id); 1374 1375 KernelIds.clear(); 1376 } 1377 1378 /// Compute the DataLayout string for the NVPTX backend. 1379 /// 1380 /// @param is64Bit Are we looking for a 64 bit architecture? 1381 static std::string computeNVPTXDataLayout(bool is64Bit) { 1382 std::string Ret = ""; 1383 1384 if (!is64Bit) { 1385 Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1386 "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1387 "64-v128:128:128-n16:32:64"; 1388 } else { 1389 Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1390 "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1391 "64-v128:128:128-n16:32:64"; 1392 } 1393 1394 return Ret; 1395 } 1396 1397 Function * 1398 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 1399 SetVector<Value *> &SubtreeValues) { 1400 std::vector<Type *> Args; 1401 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 1402 1403 for (long i = 0; i < Prog->n_array; i++) { 1404 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1405 continue; 1406 1407 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1408 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1409 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 1410 Args.push_back(SAI->getElementType()); 1411 } else { 1412 static const int UseGlobalMemory = 1; 1413 Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); 1414 } 1415 } 1416 1417 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1418 1419 for (long i = 0; i < NumHostIters; i++) 1420 Args.push_back(Builder.getInt64Ty()); 1421 1422 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1423 1424 for (long i = 0; i < NumVars; i++) { 1425 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1426 Value *Val = IDToValue[Id]; 1427 isl_id_free(Id); 1428 Args.push_back(Val->getType()); 1429 } 1430 1431 for (auto *V : SubtreeValues) 1432 Args.push_back(V->getType()); 1433 1434 auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 1435 auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 1436 GPUModule.get()); 1437 FN->setCallingConv(CallingConv::PTX_Kernel); 1438 1439 auto Arg = FN->arg_begin(); 1440 for (long i = 0; i < Kernel->n_array; i++) { 1441 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1442 continue; 1443 1444 Arg->setName(Kernel->array[i].array->name); 1445 1446 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1447 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1448 Type *EleTy = SAI->getElementType(); 1449 Value *Val = &*Arg; 1450 SmallVector<const SCEV *, 4> Sizes; 1451 isl_ast_build *Build = 1452 isl_ast_build_from_context(isl_set_copy(Prog->context)); 1453 Sizes.push_back(nullptr); 1454 for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 1455 isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 1456 Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); 1457 auto V = ExprBuilder.create(DimSize); 1458 Sizes.push_back(SE.getSCEV(V)); 1459 } 1460 const ScopArrayInfo *SAIRep = 1461 S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array); 1462 LocalArrays.push_back(Val); 1463 1464 isl_ast_build_free(Build); 1465 KernelIds.push_back(Id); 1466 IDToSAI[Id] = SAIRep; 1467 Arg++; 1468 } 1469 1470 for (long i = 0; i < NumHostIters; i++) { 1471 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1472 Arg->setName(isl_id_get_name(Id)); 1473 IDToValue[Id] = &*Arg; 1474 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1475 Arg++; 1476 } 1477 1478 for (long i = 0; i < NumVars; i++) { 1479 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1480 Arg->setName(isl_id_get_name(Id)); 1481 Value *Val = IDToValue[Id]; 1482 ValueMap[Val] = &*Arg; 1483 IDToValue[Id] = &*Arg; 1484 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1485 Arg++; 1486 } 1487 1488 for (auto *V : SubtreeValues) { 1489 Arg->setName(V->getName()); 1490 ValueMap[V] = &*Arg; 1491 Arg++; 1492 } 1493 1494 return FN; 1495 } 1496 1497 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 1498 Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, 1499 Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; 1500 1501 Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, 1502 Intrinsic::nvvm_read_ptx_sreg_tid_y, 1503 Intrinsic::nvvm_read_ptx_sreg_tid_z}; 1504 1505 auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 1506 std::string Name = isl_id_get_name(Id); 1507 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1508 Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 1509 Value *Val = Builder.CreateCall(IntrinsicFn, {}); 1510 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 1511 IDToValue[Id] = Val; 1512 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1513 }; 1514 1515 for (int i = 0; i < Kernel->n_grid; ++i) { 1516 isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 1517 addId(Id, IntrinsicsBID[i]); 1518 } 1519 1520 for (int i = 0; i < Kernel->n_block; ++i) { 1521 isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 1522 addId(Id, IntrinsicsTID[i]); 1523 } 1524 } 1525 1526 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { 1527 auto Arg = FN->arg_begin(); 1528 for (long i = 0; i < Kernel->n_array; i++) { 1529 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1530 continue; 1531 1532 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1533 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1534 isl_id_free(Id); 1535 1536 if (SAI->getNumberOfDimensions() > 0) { 1537 Arg++; 1538 continue; 1539 } 1540 1541 Value *Val = &*Arg; 1542 1543 if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { 1544 Type *TypePtr = SAI->getElementType()->getPointerTo(); 1545 Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); 1546 Val = Builder.CreateLoad(TypedArgPtr); 1547 } 1548 1549 Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 1550 Builder.CreateStore(Val, Alloca); 1551 1552 Arg++; 1553 } 1554 } 1555 1556 void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { 1557 auto *FN = Builder.GetInsertBlock()->getParent(); 1558 auto Arg = FN->arg_begin(); 1559 1560 bool StoredScalar = false; 1561 for (long i = 0; i < Kernel->n_array; i++) { 1562 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1563 continue; 1564 1565 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1566 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1567 isl_id_free(Id); 1568 1569 if (SAI->getNumberOfDimensions() > 0) { 1570 Arg++; 1571 continue; 1572 } 1573 1574 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1575 Arg++; 1576 continue; 1577 } 1578 1579 Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 1580 Value *ArgPtr = &*Arg; 1581 Type *TypePtr = SAI->getElementType()->getPointerTo(); 1582 Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); 1583 Value *Val = Builder.CreateLoad(Alloca); 1584 Builder.CreateStore(Val, TypedArgPtr); 1585 StoredScalar = true; 1586 1587 Arg++; 1588 } 1589 1590 if (StoredScalar) 1591 /// In case more than one thread contains scalar stores, the generated 1592 /// code might be incorrect, if we only store at the end of the kernel. 1593 /// To support this case we need to store these scalars back at each 1594 /// memory store or at least before each kernel barrier. 1595 if (Kernel->n_block != 0 || Kernel->n_grid != 0) 1596 BuildSuccessful = 0; 1597 } 1598 1599 void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { 1600 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1601 1602 for (int i = 0; i < Kernel->n_var; ++i) { 1603 struct ppcg_kernel_var &Var = Kernel->var[i]; 1604 isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set); 1605 Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType(); 1606 1607 Type *ArrayTy = EleTy; 1608 SmallVector<const SCEV *, 4> Sizes; 1609 1610 Sizes.push_back(nullptr); 1611 for (unsigned int j = 1; j < Var.array->n_index; ++j) { 1612 isl_val *Val = isl_vec_get_element_val(Var.size, j); 1613 long Bound = isl_val_get_num_si(Val); 1614 isl_val_free(Val); 1615 Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound)); 1616 } 1617 1618 for (int j = Var.array->n_index - 1; j >= 0; --j) { 1619 isl_val *Val = isl_vec_get_element_val(Var.size, j); 1620 long Bound = isl_val_get_num_si(Val); 1621 isl_val_free(Val); 1622 ArrayTy = ArrayType::get(ArrayTy, Bound); 1623 } 1624 1625 const ScopArrayInfo *SAI; 1626 Value *Allocation; 1627 if (Var.type == ppcg_access_shared) { 1628 auto GlobalVar = new GlobalVariable( 1629 *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, 1630 nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3); 1631 GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8); 1632 GlobalVar->setInitializer(Constant::getNullValue(ArrayTy)); 1633 1634 Allocation = GlobalVar; 1635 } else if (Var.type == ppcg_access_private) { 1636 Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array"); 1637 } else { 1638 llvm_unreachable("unknown variable type"); 1639 } 1640 SAI = 1641 S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array); 1642 Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr); 1643 IDToValue[Id] = Allocation; 1644 LocalArrays.push_back(Allocation); 1645 KernelIds.push_back(Id); 1646 IDToSAI[Id] = SAI; 1647 } 1648 } 1649 1650 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, 1651 SetVector<Value *> &SubtreeValues) { 1652 1653 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 1654 GPUModule.reset(new Module(Identifier, Builder.getContext())); 1655 GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 1656 GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); 1657 1658 Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); 1659 1660 BasicBlock *PrevBlock = Builder.GetInsertBlock(); 1661 auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); 1662 1663 DT.addNewBlock(EntryBlock, PrevBlock); 1664 1665 Builder.SetInsertPoint(EntryBlock); 1666 Builder.CreateRetVoid(); 1667 Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); 1668 1669 ScopDetection::markFunctionAsInvalid(FN); 1670 1671 prepareKernelArguments(Kernel, FN); 1672 createKernelVariables(Kernel, FN); 1673 insertKernelIntrinsics(Kernel); 1674 } 1675 1676 std::string GPUNodeBuilder::createKernelASM() { 1677 llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); 1678 std::string ErrMsg; 1679 auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); 1680 1681 if (!GPUTarget) { 1682 errs() << ErrMsg << "\n"; 1683 return ""; 1684 } 1685 1686 TargetOptions Options; 1687 Options.UnsafeFPMath = FastMath; 1688 std::unique_ptr<TargetMachine> TargetM( 1689 GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", 1690 Options, Optional<Reloc::Model>())); 1691 1692 SmallString<0> ASMString; 1693 raw_svector_ostream ASMStream(ASMString); 1694 llvm::legacy::PassManager PM; 1695 1696 PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); 1697 1698 if (TargetM->addPassesToEmitFile( 1699 PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { 1700 errs() << "The target does not support generation of this file type!\n"; 1701 return ""; 1702 } 1703 1704 PM.run(*GPUModule); 1705 1706 return ASMStream.str(); 1707 } 1708 1709 std::string GPUNodeBuilder::finalizeKernelFunction() { 1710 if (verifyModule(*GPUModule)) { 1711 BuildSuccessful = false; 1712 return ""; 1713 } 1714 1715 if (DumpKernelIR) 1716 outs() << *GPUModule << "\n"; 1717 1718 // Optimize module. 1719 llvm::legacy::PassManager OptPasses; 1720 PassManagerBuilder PassBuilder; 1721 PassBuilder.OptLevel = 3; 1722 PassBuilder.SizeLevel = 0; 1723 PassBuilder.populateModulePassManager(OptPasses); 1724 OptPasses.run(*GPUModule); 1725 1726 std::string Assembly = createKernelASM(); 1727 1728 if (DumpKernelASM) 1729 outs() << Assembly << "\n"; 1730 1731 GPUModule.release(); 1732 KernelIDs.clear(); 1733 1734 return Assembly; 1735 } 1736 1737 namespace { 1738 class PPCGCodeGeneration : public ScopPass { 1739 public: 1740 static char ID; 1741 1742 /// The scop that is currently processed. 1743 Scop *S; 1744 1745 LoopInfo *LI; 1746 DominatorTree *DT; 1747 ScalarEvolution *SE; 1748 const DataLayout *DL; 1749 RegionInfo *RI; 1750 1751 PPCGCodeGeneration() : ScopPass(ID) {} 1752 1753 /// Construct compilation options for PPCG. 1754 /// 1755 /// @returns The compilation options. 1756 ppcg_options *createPPCGOptions() { 1757 auto DebugOptions = 1758 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 1759 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 1760 1761 DebugOptions->dump_schedule_constraints = false; 1762 DebugOptions->dump_schedule = false; 1763 DebugOptions->dump_final_schedule = false; 1764 DebugOptions->dump_sizes = false; 1765 DebugOptions->verbose = false; 1766 1767 Options->debug = DebugOptions; 1768 1769 Options->reschedule = true; 1770 Options->scale_tile_loops = false; 1771 Options->wrap = false; 1772 1773 Options->non_negative_parameters = false; 1774 Options->ctx = nullptr; 1775 Options->sizes = nullptr; 1776 1777 Options->tile_size = 32; 1778 1779 Options->use_private_memory = PrivateMemory; 1780 Options->use_shared_memory = SharedMemory; 1781 Options->max_shared_memory = 48 * 1024; 1782 1783 Options->target = PPCG_TARGET_CUDA; 1784 Options->openmp = false; 1785 Options->linearize_device_arrays = true; 1786 Options->live_range_reordering = false; 1787 1788 Options->opencl_compiler_options = nullptr; 1789 Options->opencl_use_gpu = false; 1790 Options->opencl_n_include_file = 0; 1791 Options->opencl_include_files = nullptr; 1792 Options->opencl_print_kernel_types = false; 1793 Options->opencl_embed_kernel_code = false; 1794 1795 Options->save_schedule_file = nullptr; 1796 Options->load_schedule_file = nullptr; 1797 1798 return Options; 1799 } 1800 1801 /// Get a tagged access relation containing all accesses of type @p AccessTy. 1802 /// 1803 /// Instead of a normal access of the form: 1804 /// 1805 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 1806 /// 1807 /// a tagged access has the form 1808 /// 1809 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 1810 /// 1811 /// where 'id' is an additional space that references the memory access that 1812 /// triggered the access. 1813 /// 1814 /// @param AccessTy The type of the memory accesses to collect. 1815 /// 1816 /// @return The relation describing all tagged memory accesses. 1817 isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 1818 isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 1819 1820 for (auto &Stmt : *S) 1821 for (auto &Acc : Stmt) 1822 if (Acc->getType() == AccessTy) { 1823 isl_map *Relation = Acc->getAccessRelation(); 1824 Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 1825 1826 isl_space *Space = isl_map_get_space(Relation); 1827 Space = isl_space_range(Space); 1828 Space = isl_space_from_range(Space); 1829 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1830 isl_map *Universe = isl_map_universe(Space); 1831 Relation = isl_map_domain_product(Relation, Universe); 1832 Accesses = isl_union_map_add_map(Accesses, Relation); 1833 } 1834 1835 return Accesses; 1836 } 1837 1838 /// Get the set of all read accesses, tagged with the access id. 1839 /// 1840 /// @see getTaggedAccesses 1841 isl_union_map *getTaggedReads() { 1842 return getTaggedAccesses(MemoryAccess::READ); 1843 } 1844 1845 /// Get the set of all may (and must) accesses, tagged with the access id. 1846 /// 1847 /// @see getTaggedAccesses 1848 isl_union_map *getTaggedMayWrites() { 1849 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 1850 getTaggedAccesses(MemoryAccess::MUST_WRITE)); 1851 } 1852 1853 /// Get the set of all must accesses, tagged with the access id. 1854 /// 1855 /// @see getTaggedAccesses 1856 isl_union_map *getTaggedMustWrites() { 1857 return getTaggedAccesses(MemoryAccess::MUST_WRITE); 1858 } 1859 1860 /// Collect parameter and array names as isl_ids. 1861 /// 1862 /// To reason about the different parameters and arrays used, ppcg requires 1863 /// a list of all isl_ids in use. As PPCG traditionally performs 1864 /// source-to-source compilation each of these isl_ids is mapped to the 1865 /// expression that represents it. As we do not have a corresponding 1866 /// expression in Polly, we just map each id to a 'zero' expression to match 1867 /// the data format that ppcg expects. 1868 /// 1869 /// @returns Retun a map from collected ids to 'zero' ast expressions. 1870 __isl_give isl_id_to_ast_expr *getNames() { 1871 auto *Names = isl_id_to_ast_expr_alloc( 1872 S->getIslCtx(), 1873 S->getNumParams() + std::distance(S->array_begin(), S->array_end())); 1874 auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx())); 1875 auto *Space = S->getParamSpace(); 1876 1877 for (int I = 0, E = S->getNumParams(); I < E; ++I) { 1878 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I); 1879 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1880 } 1881 1882 for (auto &Array : S->arrays()) { 1883 auto Id = Array->getBasePtrId(); 1884 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1885 } 1886 1887 isl_space_free(Space); 1888 isl_ast_expr_free(Zero); 1889 1890 return Names; 1891 } 1892 1893 /// Create a new PPCG scop from the current scop. 1894 /// 1895 /// The PPCG scop is initialized with data from the current polly::Scop. From 1896 /// this initial data, the data-dependences in the PPCG scop are initialized. 1897 /// We do not use Polly's dependence analysis for now, to ensure we match 1898 /// the PPCG default behaviour more closely. 1899 /// 1900 /// @returns A new ppcg scop. 1901 ppcg_scop *createPPCGScop() { 1902 auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); 1903 1904 PPCGScop->options = createPPCGOptions(); 1905 1906 PPCGScop->start = 0; 1907 PPCGScop->end = 0; 1908 1909 PPCGScop->context = S->getContext(); 1910 PPCGScop->domain = S->getDomains(); 1911 PPCGScop->call = nullptr; 1912 PPCGScop->tagged_reads = getTaggedReads(); 1913 PPCGScop->reads = S->getReads(); 1914 PPCGScop->live_in = nullptr; 1915 PPCGScop->tagged_may_writes = getTaggedMayWrites(); 1916 PPCGScop->may_writes = S->getWrites(); 1917 PPCGScop->tagged_must_writes = getTaggedMustWrites(); 1918 PPCGScop->must_writes = S->getMustWrites(); 1919 PPCGScop->live_out = nullptr; 1920 PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace()); 1921 PPCGScop->tagger = nullptr; 1922 1923 PPCGScop->independence = nullptr; 1924 PPCGScop->dep_flow = nullptr; 1925 PPCGScop->tagged_dep_flow = nullptr; 1926 PPCGScop->dep_false = nullptr; 1927 PPCGScop->dep_forced = nullptr; 1928 PPCGScop->dep_order = nullptr; 1929 PPCGScop->tagged_dep_order = nullptr; 1930 1931 PPCGScop->schedule = S->getScheduleTree(); 1932 PPCGScop->names = getNames(); 1933 1934 PPCGScop->pet = nullptr; 1935 1936 compute_tagger(PPCGScop); 1937 compute_dependences(PPCGScop); 1938 1939 return PPCGScop; 1940 } 1941 1942 /// Collect the array acesses in a statement. 1943 /// 1944 /// @param Stmt The statement for which to collect the accesses. 1945 /// 1946 /// @returns A list of array accesses. 1947 gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { 1948 gpu_stmt_access *Accesses = nullptr; 1949 1950 for (MemoryAccess *Acc : Stmt) { 1951 auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access); 1952 Access->read = Acc->isRead(); 1953 Access->write = Acc->isWrite(); 1954 Access->access = Acc->getAccessRelation(); 1955 isl_space *Space = isl_map_get_space(Access->access); 1956 Space = isl_space_range(Space); 1957 Space = isl_space_from_range(Space); 1958 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1959 isl_map *Universe = isl_map_universe(Space); 1960 Access->tagged_access = 1961 isl_map_domain_product(Acc->getAccessRelation(), Universe); 1962 Access->exact_write = !Acc->isMayWrite(); 1963 Access->ref_id = Acc->getId(); 1964 Access->next = Accesses; 1965 Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); 1966 Accesses = Access; 1967 } 1968 1969 return Accesses; 1970 } 1971 1972 /// Collect the list of GPU statements. 1973 /// 1974 /// Each statement has an id, a pointer to the underlying data structure, 1975 /// as well as a list with all memory accesses. 1976 /// 1977 /// TODO: Initialize the list of memory accesses. 1978 /// 1979 /// @returns A linked-list of statements. 1980 gpu_stmt *getStatements() { 1981 gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, 1982 std::distance(S->begin(), S->end())); 1983 1984 int i = 0; 1985 for (auto &Stmt : *S) { 1986 gpu_stmt *GPUStmt = &Stmts[i]; 1987 1988 GPUStmt->id = Stmt.getDomainId(); 1989 1990 // We use the pet stmt pointer to keep track of the Polly statements. 1991 GPUStmt->stmt = (pet_stmt *)&Stmt; 1992 GPUStmt->accesses = getStmtAccesses(Stmt); 1993 i++; 1994 } 1995 1996 return Stmts; 1997 } 1998 1999 /// Derive the extent of an array. 2000 /// 2001 /// The extent of an array is the set of elements that are within the 2002 /// accessed array. For the inner dimensions, the extent constraints are 2003 /// 0 and the size of the corresponding array dimension. For the first 2004 /// (outermost) dimension, the extent constraints are the minimal and maximal 2005 /// subscript value for the first dimension. 2006 /// 2007 /// @param Array The array to derive the extent for. 2008 /// 2009 /// @returns An isl_set describing the extent of the array. 2010 __isl_give isl_set *getExtent(ScopArrayInfo *Array) { 2011 unsigned NumDims = Array->getNumberOfDimensions(); 2012 isl_union_map *Accesses = S->getAccesses(); 2013 Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains()); 2014 Accesses = isl_union_map_detect_equalities(Accesses); 2015 isl_union_set *AccessUSet = isl_union_map_range(Accesses); 2016 AccessUSet = isl_union_set_coalesce(AccessUSet); 2017 AccessUSet = isl_union_set_detect_equalities(AccessUSet); 2018 AccessUSet = isl_union_set_coalesce(AccessUSet); 2019 2020 if (isl_union_set_is_empty(AccessUSet)) { 2021 isl_union_set_free(AccessUSet); 2022 return isl_set_empty(Array->getSpace()); 2023 } 2024 2025 if (Array->getNumberOfDimensions() == 0) { 2026 isl_union_set_free(AccessUSet); 2027 return isl_set_universe(Array->getSpace()); 2028 } 2029 2030 isl_set *AccessSet = 2031 isl_union_set_extract_set(AccessUSet, Array->getSpace()); 2032 2033 isl_union_set_free(AccessUSet); 2034 isl_local_space *LS = isl_local_space_from_space(Array->getSpace()); 2035 2036 isl_pw_aff *Val = 2037 isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0)); 2038 2039 isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0); 2040 isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0); 2041 OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in, 2042 isl_pw_aff_dim(Val, isl_dim_in)); 2043 OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in, 2044 isl_pw_aff_dim(Val, isl_dim_in)); 2045 OuterMin = 2046 isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId()); 2047 OuterMax = 2048 isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId()); 2049 2050 isl_set *Extent = isl_set_universe(Array->getSpace()); 2051 2052 Extent = isl_set_intersect( 2053 Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val))); 2054 Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val)); 2055 2056 for (unsigned i = 1; i < NumDims; ++i) 2057 Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0); 2058 2059 for (unsigned i = 1; i < NumDims; ++i) { 2060 isl_pw_aff *PwAff = 2061 const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i)); 2062 isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain( 2063 isl_local_space_from_space(Array->getSpace()), isl_dim_set, i)); 2064 PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in, 2065 isl_pw_aff_dim(Val, isl_dim_in)); 2066 PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in, 2067 isl_pw_aff_get_tuple_id(Val, isl_dim_in)); 2068 auto *Set = isl_pw_aff_gt_set(PwAff, Val); 2069 Extent = isl_set_intersect(Set, Extent); 2070 } 2071 2072 return Extent; 2073 } 2074 2075 /// Derive the bounds of an array. 2076 /// 2077 /// For the first dimension we derive the bound of the array from the extent 2078 /// of this dimension. For inner dimensions we obtain their size directly from 2079 /// ScopArrayInfo. 2080 /// 2081 /// @param PPCGArray The array to compute bounds for. 2082 /// @param Array The polly array from which to take the information. 2083 void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) { 2084 if (PPCGArray.n_index > 0) { 2085 if (isl_set_is_empty(PPCGArray.extent)) { 2086 isl_set *Dom = isl_set_copy(PPCGArray.extent); 2087 isl_local_space *LS = isl_local_space_from_space( 2088 isl_space_params(isl_set_get_space(Dom))); 2089 isl_set_free(Dom); 2090 isl_aff *Zero = isl_aff_zero_on_domain(LS); 2091 PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero); 2092 } else { 2093 isl_set *Dom = isl_set_copy(PPCGArray.extent); 2094 Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1); 2095 isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0); 2096 isl_set_free(Dom); 2097 Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound)); 2098 isl_local_space *LS = 2099 isl_local_space_from_space(isl_set_get_space(Dom)); 2100 isl_aff *One = isl_aff_zero_on_domain(LS); 2101 One = isl_aff_add_constant_si(One, 1); 2102 Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One)); 2103 Bound = isl_pw_aff_gist(Bound, S->getContext()); 2104 PPCGArray.bound[0] = Bound; 2105 } 2106 } 2107 2108 for (unsigned i = 1; i < PPCGArray.n_index; ++i) { 2109 isl_pw_aff *Bound = Array->getDimensionSizePw(i); 2110 auto LS = isl_pw_aff_get_domain_space(Bound); 2111 auto Aff = isl_multi_aff_zero(LS); 2112 Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff); 2113 PPCGArray.bound[i] = Bound; 2114 } 2115 } 2116 2117 /// Create the arrays for @p PPCGProg. 2118 /// 2119 /// @param PPCGProg The program to compute the arrays for. 2120 void createArrays(gpu_prog *PPCGProg) { 2121 int i = 0; 2122 for (auto &Array : S->arrays()) { 2123 std::string TypeName; 2124 raw_string_ostream OS(TypeName); 2125 2126 OS << *Array->getElementType(); 2127 TypeName = OS.str(); 2128 2129 gpu_array_info &PPCGArray = PPCGProg->array[i]; 2130 2131 PPCGArray.space = Array->getSpace(); 2132 PPCGArray.type = strdup(TypeName.c_str()); 2133 PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8; 2134 PPCGArray.name = strdup(Array->getName().c_str()); 2135 PPCGArray.extent = nullptr; 2136 PPCGArray.n_index = Array->getNumberOfDimensions(); 2137 PPCGArray.bound = 2138 isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index); 2139 PPCGArray.extent = getExtent(Array); 2140 PPCGArray.n_ref = 0; 2141 PPCGArray.refs = nullptr; 2142 PPCGArray.accessed = true; 2143 PPCGArray.read_only_scalar = 2144 Array->isReadOnly() && Array->getNumberOfDimensions() == 0; 2145 PPCGArray.has_compound_element = false; 2146 PPCGArray.local = false; 2147 PPCGArray.declare_local = false; 2148 PPCGArray.global = false; 2149 PPCGArray.linearize = false; 2150 PPCGArray.dep_order = nullptr; 2151 PPCGArray.user = Array; 2152 2153 setArrayBounds(PPCGArray, Array); 2154 i++; 2155 2156 collect_references(PPCGProg, &PPCGArray); 2157 } 2158 } 2159 2160 /// Create an identity map between the arrays in the scop. 2161 /// 2162 /// @returns An identity map between the arrays in the scop. 2163 isl_union_map *getArrayIdentity() { 2164 isl_union_map *Maps = isl_union_map_empty(S->getParamSpace()); 2165 2166 for (auto &Array : S->arrays()) { 2167 isl_space *Space = Array->getSpace(); 2168 Space = isl_space_map_from_set(Space); 2169 isl_map *Identity = isl_map_identity(Space); 2170 Maps = isl_union_map_add_map(Maps, Identity); 2171 } 2172 2173 return Maps; 2174 } 2175 2176 /// Create a default-initialized PPCG GPU program. 2177 /// 2178 /// @returns A new gpu grogram description. 2179 gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { 2180 2181 if (!PPCGScop) 2182 return nullptr; 2183 2184 auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog); 2185 2186 PPCGProg->ctx = S->getIslCtx(); 2187 PPCGProg->scop = PPCGScop; 2188 PPCGProg->context = isl_set_copy(PPCGScop->context); 2189 PPCGProg->read = isl_union_map_copy(PPCGScop->reads); 2190 PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); 2191 PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); 2192 PPCGProg->tagged_must_kill = 2193 isl_union_map_copy(PPCGScop->tagged_must_kills); 2194 PPCGProg->to_inner = getArrayIdentity(); 2195 PPCGProg->to_outer = getArrayIdentity(); 2196 PPCGProg->any_to_outer = nullptr; 2197 PPCGProg->array_order = nullptr; 2198 PPCGProg->n_stmts = std::distance(S->begin(), S->end()); 2199 PPCGProg->stmts = getStatements(); 2200 PPCGProg->n_array = std::distance(S->array_begin(), S->array_end()); 2201 PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info, 2202 PPCGProg->n_array); 2203 2204 createArrays(PPCGProg); 2205 2206 PPCGProg->may_persist = compute_may_persist(PPCGProg); 2207 2208 return PPCGProg; 2209 } 2210 2211 struct PrintGPUUserData { 2212 struct cuda_info *CudaInfo; 2213 struct gpu_prog *PPCGProg; 2214 std::vector<ppcg_kernel *> Kernels; 2215 }; 2216 2217 /// Print a user statement node in the host code. 2218 /// 2219 /// We use ppcg's printing facilities to print the actual statement and 2220 /// additionally build up a list of all kernels that are encountered in the 2221 /// host ast. 2222 /// 2223 /// @param P The printer to print to 2224 /// @param Options The printing options to use 2225 /// @param Node The node to print 2226 /// @param User A user pointer to carry additional data. This pointer is 2227 /// expected to be of type PrintGPUUserData. 2228 /// 2229 /// @returns A printer to which the output has been printed. 2230 static __isl_give isl_printer * 2231 printHostUser(__isl_take isl_printer *P, 2232 __isl_take isl_ast_print_options *Options, 2233 __isl_take isl_ast_node *Node, void *User) { 2234 auto Data = (struct PrintGPUUserData *)User; 2235 auto Id = isl_ast_node_get_annotation(Node); 2236 2237 if (Id) { 2238 bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 2239 2240 // If this is a user statement, format it ourselves as ppcg would 2241 // otherwise try to call pet functionality that is not available in 2242 // Polly. 2243 if (IsUser) { 2244 P = isl_printer_start_line(P); 2245 P = isl_printer_print_ast_node(P, Node); 2246 P = isl_printer_end_line(P); 2247 isl_id_free(Id); 2248 isl_ast_print_options_free(Options); 2249 return P; 2250 } 2251 2252 auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 2253 isl_id_free(Id); 2254 Data->Kernels.push_back(Kernel); 2255 } 2256 2257 return print_host_user(P, Options, Node, User); 2258 } 2259 2260 /// Print C code corresponding to the control flow in @p Kernel. 2261 /// 2262 /// @param Kernel The kernel to print 2263 void printKernel(ppcg_kernel *Kernel) { 2264 auto *P = isl_printer_to_str(S->getIslCtx()); 2265 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 2266 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 2267 P = isl_ast_node_print(Kernel->tree, P, Options); 2268 char *String = isl_printer_get_str(P); 2269 printf("%s\n", String); 2270 free(String); 2271 isl_printer_free(P); 2272 } 2273 2274 /// Print C code corresponding to the GPU code described by @p Tree. 2275 /// 2276 /// @param Tree An AST describing GPU code 2277 /// @param PPCGProg The PPCG program from which @Tree has been constructed. 2278 void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 2279 auto *P = isl_printer_to_str(S->getIslCtx()); 2280 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 2281 2282 PrintGPUUserData Data; 2283 Data.PPCGProg = PPCGProg; 2284 2285 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 2286 Options = 2287 isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 2288 P = isl_ast_node_print(Tree, P, Options); 2289 char *String = isl_printer_get_str(P); 2290 printf("# host\n"); 2291 printf("%s\n", String); 2292 free(String); 2293 isl_printer_free(P); 2294 2295 for (auto Kernel : Data.Kernels) { 2296 printf("# kernel%d\n", Kernel->id); 2297 printKernel(Kernel); 2298 } 2299 } 2300 2301 // Generate a GPU program using PPCG. 2302 // 2303 // GPU mapping consists of multiple steps: 2304 // 2305 // 1) Compute new schedule for the program. 2306 // 2) Map schedule to GPU (TODO) 2307 // 3) Generate code for new schedule (TODO) 2308 // 2309 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 2310 // is mostly CPU specific. Instead, we use PPCG's GPU code generation 2311 // strategy directly from this pass. 2312 gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 2313 2314 auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 2315 2316 PPCGGen->ctx = S->getIslCtx(); 2317 PPCGGen->options = PPCGScop->options; 2318 PPCGGen->print = nullptr; 2319 PPCGGen->print_user = nullptr; 2320 PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 2321 PPCGGen->prog = PPCGProg; 2322 PPCGGen->tree = nullptr; 2323 PPCGGen->types.n = 0; 2324 PPCGGen->types.name = nullptr; 2325 PPCGGen->sizes = nullptr; 2326 PPCGGen->used_sizes = nullptr; 2327 PPCGGen->kernel_id = 0; 2328 2329 // Set scheduling strategy to same strategy PPCG is using. 2330 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 2331 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 2332 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 2333 2334 isl_schedule *Schedule = get_schedule(PPCGGen); 2335 2336 int has_permutable = has_any_permutable_node(Schedule); 2337 2338 if (!has_permutable || has_permutable < 0) { 2339 Schedule = isl_schedule_free(Schedule); 2340 } else { 2341 Schedule = map_to_device(PPCGGen, Schedule); 2342 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 2343 } 2344 2345 if (DumpSchedule) { 2346 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 2347 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 2348 P = isl_printer_print_str(P, "Schedule\n"); 2349 P = isl_printer_print_str(P, "========\n"); 2350 if (Schedule) 2351 P = isl_printer_print_schedule(P, Schedule); 2352 else 2353 P = isl_printer_print_str(P, "No schedule found\n"); 2354 2355 printf("%s\n", isl_printer_get_str(P)); 2356 isl_printer_free(P); 2357 } 2358 2359 if (DumpCode) { 2360 printf("Code\n"); 2361 printf("====\n"); 2362 if (PPCGGen->tree) 2363 printGPUTree(PPCGGen->tree, PPCGProg); 2364 else 2365 printf("No code generated\n"); 2366 } 2367 2368 isl_schedule_free(Schedule); 2369 2370 return PPCGGen; 2371 } 2372 2373 /// Free gpu_gen structure. 2374 /// 2375 /// @param PPCGGen The ppcg_gen object to free. 2376 void freePPCGGen(gpu_gen *PPCGGen) { 2377 isl_ast_node_free(PPCGGen->tree); 2378 isl_union_map_free(PPCGGen->sizes); 2379 isl_union_map_free(PPCGGen->used_sizes); 2380 free(PPCGGen); 2381 } 2382 2383 /// Free the options in the ppcg scop structure. 2384 /// 2385 /// ppcg is not freeing these options for us. To avoid leaks we do this 2386 /// ourselves. 2387 /// 2388 /// @param PPCGScop The scop referencing the options to free. 2389 void freeOptions(ppcg_scop *PPCGScop) { 2390 free(PPCGScop->options->debug); 2391 PPCGScop->options->debug = nullptr; 2392 free(PPCGScop->options); 2393 PPCGScop->options = nullptr; 2394 } 2395 2396 /// Approximate the number of points in the set. 2397 /// 2398 /// This function returns an ast expression that overapproximates the number 2399 /// of points in an isl set through the rectangular hull surrounding this set. 2400 /// 2401 /// @param Set The set to count. 2402 /// @param Build The isl ast build object to use for creating the ast 2403 /// expression. 2404 /// 2405 /// @returns An approximation of the number of points in the set. 2406 __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set, 2407 __isl_keep isl_ast_build *Build) { 2408 2409 isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1); 2410 auto *Expr = isl_ast_expr_from_val(isl_val_copy(One)); 2411 2412 isl_space *Space = isl_set_get_space(Set); 2413 Space = isl_space_params(Space); 2414 auto *Univ = isl_set_universe(Space); 2415 isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One); 2416 2417 for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) { 2418 isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i); 2419 isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i); 2420 isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min); 2421 DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff)); 2422 auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize); 2423 Expr = isl_ast_expr_mul(Expr, DimSizeExpr); 2424 } 2425 2426 isl_set_free(Set); 2427 isl_pw_aff_free(OneAff); 2428 2429 return Expr; 2430 } 2431 2432 /// Approximate a number of dynamic instructions executed by a given 2433 /// statement. 2434 /// 2435 /// @param Stmt The statement for which to compute the number of dynamic 2436 /// instructions. 2437 /// @param Build The isl ast build object to use for creating the ast 2438 /// expression. 2439 /// @returns An approximation of the number of dynamic instructions executed 2440 /// by @p Stmt. 2441 __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt, 2442 __isl_keep isl_ast_build *Build) { 2443 auto Iterations = approxPointsInSet(Stmt.getDomain(), Build); 2444 2445 long InstCount = 0; 2446 2447 if (Stmt.isBlockStmt()) { 2448 auto *BB = Stmt.getBasicBlock(); 2449 InstCount = std::distance(BB->begin(), BB->end()); 2450 } else { 2451 auto *R = Stmt.getRegion(); 2452 2453 for (auto *BB : R->blocks()) { 2454 InstCount += std::distance(BB->begin(), BB->end()); 2455 } 2456 } 2457 2458 isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount); 2459 auto *InstExpr = isl_ast_expr_from_val(InstVal); 2460 return isl_ast_expr_mul(InstExpr, Iterations); 2461 } 2462 2463 /// Approximate dynamic instructions executed in scop. 2464 /// 2465 /// @param S The scop for which to approximate dynamic instructions. 2466 /// @param Build The isl ast build object to use for creating the ast 2467 /// expression. 2468 /// @returns An approximation of the number of dynamic instructions executed 2469 /// in @p S. 2470 __isl_give isl_ast_expr * 2471 getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) { 2472 isl_ast_expr *Instructions; 2473 2474 isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0); 2475 Instructions = isl_ast_expr_from_val(Zero); 2476 2477 for (ScopStmt &Stmt : S) { 2478 isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build); 2479 Instructions = isl_ast_expr_add(Instructions, StmtInstructions); 2480 } 2481 return Instructions; 2482 } 2483 2484 /// Create a check that ensures sufficient compute in scop. 2485 /// 2486 /// @param S The scop for which to ensure sufficient compute. 2487 /// @param Build The isl ast build object to use for creating the ast 2488 /// expression. 2489 /// @returns An expression that evaluates to TRUE in case of sufficient 2490 /// compute and to FALSE, otherwise. 2491 __isl_give isl_ast_expr * 2492 createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) { 2493 auto Iterations = getNumberOfIterations(S, Build); 2494 auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute); 2495 auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal); 2496 return isl_ast_expr_ge(Iterations, MinComputeExpr); 2497 } 2498 2499 /// Generate code for a given GPU AST described by @p Root. 2500 /// 2501 /// @param Root An isl_ast_node pointing to the root of the GPU AST. 2502 /// @param Prog The GPU Program to generate code for. 2503 void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { 2504 ScopAnnotator Annotator; 2505 Annotator.buildAliasScopes(*S); 2506 2507 Region *R = &S->getRegion(); 2508 2509 simplifyRegion(R, DT, LI, RI); 2510 2511 BasicBlock *EnteringBB = R->getEnteringBlock(); 2512 2513 PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator); 2514 2515 // Only build the run-time condition and parameters _after_ having 2516 // introduced the conditional branch. This is important as the conditional 2517 // branch will guard the original scop from new induction variables that 2518 // the SCEVExpander may introduce while code generating the parameters and 2519 // which may introduce scalar dependences that prevent us from correctly 2520 // code generating this scop. 2521 BasicBlock *StartBlock = 2522 executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); 2523 2524 GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, 2525 StartBlock, Prog); 2526 2527 // TODO: Handle LICM 2528 auto SplitBlock = StartBlock->getSinglePredecessor(); 2529 Builder.SetInsertPoint(SplitBlock->getTerminator()); 2530 NodeBuilder.addParameters(S->getContext()); 2531 2532 isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx()); 2533 isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build); 2534 isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build); 2535 Condition = isl_ast_expr_and(Condition, SufficientCompute); 2536 isl_ast_build_free(Build); 2537 2538 Value *RTC = NodeBuilder.createRTC(Condition); 2539 Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC); 2540 2541 Builder.SetInsertPoint(&*StartBlock->begin()); 2542 2543 NodeBuilder.initializeAfterRTH(); 2544 NodeBuilder.create(Root); 2545 NodeBuilder.finalize(); 2546 2547 /// In case a sequential kernel has more surrounding loops as any parallel 2548 /// kernel, the SCoP is probably mostly sequential. Hence, there is no 2549 /// point in running it on a GPU. 2550 if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel) 2551 SplitBlock->getTerminator()->setOperand(0, Builder.getFalse()); 2552 2553 if (!NodeBuilder.BuildSuccessful) 2554 SplitBlock->getTerminator()->setOperand(0, Builder.getFalse()); 2555 } 2556 2557 bool runOnScop(Scop &CurrentScop) override { 2558 S = &CurrentScop; 2559 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2560 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2561 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2562 DL = &S->getRegion().getEntry()->getModule()->getDataLayout(); 2563 RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); 2564 2565 // We currently do not support scops with invariant loads. 2566 if (S->hasInvariantAccesses()) 2567 return false; 2568 2569 auto PPCGScop = createPPCGScop(); 2570 auto PPCGProg = createPPCGProg(PPCGScop); 2571 auto PPCGGen = generateGPU(PPCGScop, PPCGProg); 2572 2573 if (PPCGGen->tree) 2574 generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); 2575 2576 freeOptions(PPCGScop); 2577 freePPCGGen(PPCGGen); 2578 gpu_prog_free(PPCGProg); 2579 ppcg_scop_free(PPCGScop); 2580 2581 return true; 2582 } 2583 2584 void printScop(raw_ostream &, Scop &) const override {} 2585 2586 void getAnalysisUsage(AnalysisUsage &AU) const override { 2587 AU.addRequired<DominatorTreeWrapperPass>(); 2588 AU.addRequired<RegionInfoPass>(); 2589 AU.addRequired<ScalarEvolutionWrapperPass>(); 2590 AU.addRequired<ScopDetection>(); 2591 AU.addRequired<ScopInfoRegionPass>(); 2592 AU.addRequired<LoopInfoWrapperPass>(); 2593 2594 AU.addPreserved<AAResultsWrapperPass>(); 2595 AU.addPreserved<BasicAAWrapperPass>(); 2596 AU.addPreserved<LoopInfoWrapperPass>(); 2597 AU.addPreserved<DominatorTreeWrapperPass>(); 2598 AU.addPreserved<GlobalsAAWrapperPass>(); 2599 AU.addPreserved<ScopDetection>(); 2600 AU.addPreserved<ScalarEvolutionWrapperPass>(); 2601 AU.addPreserved<SCEVAAWrapperPass>(); 2602 2603 // FIXME: We do not yet add regions for the newly generated code to the 2604 // region tree. 2605 AU.addPreserved<RegionInfoPass>(); 2606 AU.addPreserved<ScopInfoRegionPass>(); 2607 } 2608 }; 2609 } // namespace 2610 2611 char PPCGCodeGeneration::ID = 1; 2612 2613 Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } 2614 2615 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", 2616 "Polly - Apply PPCG translation to SCOP", false, false) 2617 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 2618 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 2619 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 2620 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); 2621 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); 2622 INITIALIZE_PASS_DEPENDENCY(ScopDetection); 2623 INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", 2624 "Polly - Apply PPCG translation to SCOP", false, false) 2625