//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "isl/union_map.h"

extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

static cl::opt<bool> SharedMemory("polly-acc-use-shared",
                                  cl::desc("Use shared memory"), cl::Hidden,
                                  cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> PrivateMemory("polly-acc-use-private",
                                   cl::desc("Use private memory"), cl::Hidden,
                                   cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
                                   cl::desc("Generate Host kernel code assuming"
                                            " that all memory has been"
                                            " declared as managed memory"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool>
    FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure",
cl::desc("Fail and generate a backtrace if" 100 " verifyModule fails on the GPU " 101 " kernel module."), 102 cl::Hidden, cl::init(false), cl::ZeroOrMore, 103 cl::cat(PollyCategory)); 104 105 static cl::opt<std::string> 106 CudaVersion("polly-acc-cuda-version", 107 cl::desc("The CUDA version to compile for"), cl::Hidden, 108 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); 109 110 static cl::opt<int> 111 MinCompute("polly-acc-mincompute", 112 cl::desc("Minimal number of compute statements to run on GPU."), 113 cl::Hidden, cl::init(10 * 512 * 512)); 114 115 /// Create the ast expressions for a ScopStmt. 116 /// 117 /// This function is a callback for to generate the ast expressions for each 118 /// of the scheduled ScopStmts. 119 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 120 void *StmtT, isl_ast_build *Build, 121 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 122 isl_id *Id, void *User), 123 void *UserIndex, 124 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 125 void *UserExpr) { 126 127 ScopStmt *Stmt = (ScopStmt *)StmtT; 128 129 isl_ctx *Ctx; 130 131 if (!Stmt || !Build) 132 return NULL; 133 134 Ctx = isl_ast_build_get_ctx(Build); 135 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 136 137 for (MemoryAccess *Acc : *Stmt) { 138 isl_map *AddrFunc = Acc->getAddressFunction(); 139 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 140 isl_id *RefId = Acc->getId(); 141 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 142 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 143 MPA = isl_multi_pw_aff_coalesce(MPA); 144 MPA = FunctionIndex(MPA, RefId, UserIndex); 145 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 146 Access = FunctionExpr(Access, RefId, UserExpr); 147 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 148 } 149 150 return RefToExpr; 151 } 152 153 /// Given a LLVM Type, compute its size in bytes, 154 static int computeSizeInBytes(const Type *T) { 155 int bytes = T->getPrimitiveSizeInBits() / 8; 156 if (bytes == 0) 157 bytes = T->getScalarSizeInBits() / 8; 158 return bytes; 159 } 160 161 /// Generate code for a GPU specific isl AST. 162 /// 163 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 164 /// generates code for general-purpose AST nodes, with special functionality 165 /// for generating GPU specific user nodes. 166 /// 167 /// @see GPUNodeBuilder::createUser 168 class GPUNodeBuilder : public IslNodeBuilder { 169 public: 170 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, 171 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 172 DominatorTree &DT, Scop &S, BasicBlock *StartBlock, 173 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) 174 : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), 175 Prog(Prog), Runtime(Runtime), Arch(Arch) { 176 getExprBuilder().setIDToSAI(&IDToSAI); 177 } 178 179 /// Create after-run-time-check initialization code. 180 void initializeAfterRTH(); 181 182 /// Finalize the generated scop. 183 virtual void finalize(); 184 185 /// Track if the full build process was successful. 186 /// 187 /// This value is set to false, if throughout the build process an error 188 /// occurred which prevents us from generating valid GPU code. 189 bool BuildSuccessful = true; 190 191 /// The maximal number of loops surrounding a sequential kernel. 
  unsigned DeepestSequential = 0;

  /// The maximal number of loops surrounding a parallel kernel.
  unsigned DeepestParallel = 0;

private:
  /// A vector of array base pointers for which a new ScopArrayInfo was
  /// created.
  ///
  /// This vector is used to delete the ScopArrayInfo when it is not needed
  /// any more.
  std::vector<Value *> LocalArrays;

  /// A map from ScopArrays to their corresponding device allocations.
  std::map<ScopArrayInfo *, Value *> DeviceAllocations;

  /// The current GPU context.
  Value *GPUContext;

  /// The set of isl_ids allocated in the kernel.
  std::vector<isl_id *> KernelIds;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// The GPU Runtime implementation to use (OpenCL or CUDA).
  GPURuntime Runtime;

  /// The GPU Architecture to target.
  GPUArch Arch;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  ///   - ScopStmt:      A computational statement (TODO)
  ///   - Kernel:        A GPU kernel call (TODO)
  ///   - Data-Transfer: A GPU <-> CPU data-transfer
  ///   - In-kernel synchronization
  ///   - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };

  /// Create code for a data transfer statement.
  ///
  /// @param TransferStmt The data transfer statement.
  /// @param Direction    The direction in which to transfer data.
  void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                          enum DataDirection Direction);

  /// Find llvm::Values referenced in GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values.
  ///
  /// @returns A pair, whose first element contains the set of values
  ///          referenced by the kernel, and whose second element contains the
  ///          set of functions referenced by the kernel. All functions in the
  ///          second set satisfy isValidFunctionInKernel.
  std::pair<SetVector<Value *>, SetVector<Function *>>
  getReferencesInKernel(ppcg_kernel *Kernel);

  /// Compute the sizes of the execution grid for a given kernel.
  ///
  /// @param Kernel The kernel to compute grid sizes for.
  ///
  /// @returns A tuple with grid sizes for X and Y dimension.
  std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);

  /// Creates an array that can be sent to the kernel on the device using a
  /// host pointer. This is required for managed memory, when we directly send
  /// host pointers to the device.
  /// \note
  /// This is to be used only with managed memory.
  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
                                       ScopArrayInfo *ArrayInfo);

  /// Compute the sizes of the thread blocks for a given kernel.
  ///
  /// @param Kernel The kernel to compute thread block sizes for.
  ///
  /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
  std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);

  /// Store a specific kernel launch parameter in the array of kernel launch
  /// parameters.
  ///
  /// @param Parameters The list of parameters in which to store.
  /// @param Param      The kernel launch parameter to store.
  /// @param Index      The index in the parameter list, at which to store the
  ///                   parameter.
  void insertStoreParameter(Instruction *Parameters, Instruction *Param,
                            int Index);

  /// Create kernel launch parameters.
  ///
  /// @param Kernel        The kernel to create parameters for.
  /// @param F             The kernel function that has been created.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns A stack allocated array with pointers to the parameter
  ///          values that are passed to the kernel.
  Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                SetVector<Value *> SubtreeValues);

  /// Create declarations for kernel variables.
  ///
  /// This includes shared memory declarations.
  ///
  /// @param Kernel The kernel definition to create variables for.
  /// @param FN     The function into which to generate the variables.
  void createKernelVariables(ppcg_kernel *Kernel, Function *FN);

  /// Add CUDA annotations to module.
  ///
  /// Add a set of CUDA annotations that declares the maximal block dimensions
  /// that will be used to execute the CUDA kernel. This allows the NVIDIA
  /// PTX compiler to bound the number of allocated registers to ensure the
  /// resulting kernel is known to run with up to as many block dimensions
  /// as specified here.
  ///
  /// @param M         The module to add the annotations to.
  /// @param BlockDimX The size of block dimension X.
  /// @param BlockDimY The size of block dimension Y.
  /// @param BlockDimZ The size of block dimension Z.
  void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
                          Value *BlockDimZ);

  /// Create GPU kernel.
  ///
  /// Code generate the kernel described by @p KernelStmt.
  ///
  /// @param KernelStmt The ast node to generate kernel code for.
  void createKernel(__isl_take isl_ast_node *KernelStmt);

  /// Generate code that computes the size of an array.
  ///
  /// @param Array The array for which to compute a size.
  Value *getArraySize(gpu_array_info *Array);

  /// Generate code to compute the minimal offset at which an array is
  /// accessed.
  ///
  /// The offset of an array is the minimal array location accessed in a scop.
  ///
  /// Example:
  ///
  ///   for (long i = 0; i < 100; i++)
  ///     A[i + 42] += ...
  ///
  /// getArrayOffset(A) results in 42.
  ///
  /// @param Array The array for which to compute the offset.
  /// @returns An llvm::Value that contains the offset of the array.
  Value *getArrayOffset(gpu_array_info *Array);

  /// Prepare the kernel arguments for kernel code generation.
  ///
  /// @param Kernel The kernel to generate code for.
  /// @param FN     The function created for the kernel.
  void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);

  /// Create kernel function.
  ///
  /// Create a kernel function located in a newly created module that can serve
  /// as target for device code generation. Set the Builder to point to the
  /// start block of this newly created function.
  ///
  /// @param Kernel           The kernel to generate code for.
  /// @param SubtreeValues    The set of llvm::Values referenced by this
  ///                         kernel.
  /// @param SubtreeFunctions The set of llvm::Functions referenced by this
  ///                         kernel.
  void createKernelFunction(ppcg_kernel *Kernel,
                            SetVector<Value *> &SubtreeValues,
                            SetVector<Function *> &SubtreeFunctions);

  /// Create the declaration of a kernel function.
  ///
  /// The kernel function takes as arguments:
  ///
  ///   - One i8 pointer for each external array reference used in the kernel.
  ///   - Host iterators
  ///   - Parameters
  ///   - Other LLVM Value references (TODO)
  ///
  /// @param Kernel        The kernel to generate the function declaration for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns The newly declared function.
  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
                                     SetVector<Value *> &SubtreeValues);

  /// Insert intrinsic functions to obtain thread and block ids.
  ///
  /// @param Kernel The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Setup the creation of functions referenced by the GPU kernel.
  ///
  /// 1. Create new function declarations in GPUModule which are the same as
  ///    SubtreeFunctions.
  ///
  /// 2. Populate IslNodeBuilder::ValueMap with mappings from
  ///    old functions (that come from the original module) to new functions
  ///    (that are created within GPUModule). That way, we generate references
  ///    to the correct function (in GPUModule) in BlockGenerator.
  ///
  /// @see IslNodeBuilder::ValueMap
  /// @see BlockGenerator::GlobalMap
  /// @see BlockGenerator::getNewValue
  /// @see GPUNodeBuilder::getReferencesInKernel.
  ///
  /// @param SubtreeFunctions The set of llvm::Functions referenced by
  ///                         this kernel.
  void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);

  /// Create a global-to-shared or shared-to-global copy statement.
  ///
  /// @param CopyStmt The copy statement to generate code for.
  void createKernelCopy(ppcg_kernel_stmt *CopyStmt);

  /// Create code for a ScopStmt called in @p Expr.
  ///
  /// @param Expr       The expression containing the call.
  /// @param KernelStmt The kernel statement referenced in the call.
  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);

  /// Create an in-kernel synchronization call.
  void createKernelSync();

  /// Create a PTX assembly string for the current GPU kernel.
  ///
  /// @returns A string containing the corresponding PTX assembly code.
  std::string createKernelASM();

  /// Remove references from the dominator tree to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearDominators(Function *F);

  /// Remove references from scalar evolution to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearScalarEvolution(Function *F);

  /// Remove references from loop info to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearLoops(Function *F);

  /// Finalize the generation of the kernel function.
  ///
  /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
  /// dump its IR to stderr.
  ///
  /// @returns The Assembly string of the kernel.
  std::string finalizeKernelFunction();

  /// Finalize the generation of the kernel arguments.
  ///
  /// This function ensures that not-read-only scalars used in a kernel are
  /// stored back to the global memory location they are backed with before
  /// the kernel terminates.
  ///
  /// @param Kernel The kernel to finalize kernel arguments for.
  void finalizeKernelArguments(ppcg_kernel *Kernel);

  /// Create code that allocates memory to store arrays on device.
  void allocateDeviceArrays();

  /// Free all allocated device arrays.
  void freeDeviceArrays();

  /// Create a call to initialize the GPU context.
  ///
  /// @returns A pointer to the newly initialized context.
  Value *createCallInitContext();

  /// Create a call to get the device pointer for a kernel allocation.
  ///
  /// @param Allocation The Polly GPU allocation.
  ///
  /// @returns The device parameter corresponding to this allocation.
  Value *createCallGetDevicePtr(Value *Allocation);

  /// Create a call to free the GPU context.
  ///
  /// @param Context A pointer to an initialized GPU context.
  void createCallFreeContext(Value *Context);

  /// Create a call to allocate memory on the device.
  ///
  /// @param Size The size of memory to allocate.
  ///
  /// @returns A pointer that identifies this allocation.
  Value *createCallAllocateMemoryForDevice(Value *Size);

  /// Create a call to free a device array.
  ///
  /// @param Array The device array to free.
  void createCallFreeDeviceMemory(Value *Array);

  /// Create a call to copy data from host to device.
  ///
  /// @param HostPtr   A pointer to the host data that should be copied.
  /// @param DevicePtr A device pointer specifying the location to copy to.
  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
                                      Value *Size);

  /// Create a call to copy data from device to host.
  ///
  /// @param DevicePtr A pointer to the device data that should be copied.
  /// @param HostPtr   A host pointer specifying the location to copy to.
  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                      Value *Size);

  /// Create a call to synchronize Host & Device.
  /// \note
  /// This is to be used only with managed memory.
  void createCallSynchronizeDevice();

  /// Create a call to get a kernel from an assembly string.
  ///
  /// @param Buffer The string describing the kernel.
  /// @param Entry  The name of the kernel function to call.
  ///
  /// @returns A pointer to a kernel object.
  Value *createCallGetKernel(Value *Buffer, Value *Entry);

  /// Create a call to free a GPU kernel.
  ///
  /// @param GPUKernel The kernel to free.
  void createCallFreeKernel(Value *GPUKernel);

  /// Create a call to launch a GPU kernel.
  ///
  /// @param GPUKernel The kernel to launch.
  /// @param GridDimX  The size of the first grid dimension.
  /// @param GridDimY  The size of the second grid dimension.
  /// @param BlockDimX The size of the first block dimension.
  /// @param BlockDimY The size of the second block dimension.
  /// @param BlockDimZ The size of the third block dimension.
  /// @param Parameters A pointer to an array that itself contains pointers to
  ///                   the parameter values passed for each kernel argument.
  void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                              Value *GridDimY, Value *BlockDimX,
                              Value *BlockDimY, Value *BlockDimZ,
                              Value *Parameters);
};

void GPUNodeBuilder::initializeAfterRTH() {
  BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
                                 &*Builder.GetInsertPoint(), &DT, &LI);
  NewBB->setName("polly.acc.initialize");
  Builder.SetInsertPoint(&NewBB->front());

  GPUContext = createCallInitContext();

  if (!ManagedMemory)
    allocateDeviceArrays();
}

void GPUNodeBuilder::finalize() {
  if (!ManagedMemory)
    freeDeviceArrays();

  createCallFreeContext(GPUContext);
  IslNodeBuilder::finalize();
}

void GPUNodeBuilder::allocateDeviceArrays() {
  assert(!ManagedMemory && "Managed memory will directly send host pointers "
                           "to the kernel. There is no need for device arrays");
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  for (int i = 0; i < Prog->n_array; ++i) {
    gpu_array_info *Array = &Prog->array[i];
    auto *ScopArray = (ScopArrayInfo *)Array->user;
    std::string DevArrayName("p_dev_array_");
    DevArrayName.append(Array->name);

    Value *ArraySize = getArraySize(Array);
    Value *Offset = getArrayOffset(Array);
    if (Offset)
      ArraySize = Builder.CreateSub(
          ArraySize,
          Builder.CreateMul(Offset,
                            Builder.getInt64(ScopArray->getElemSizeInBytes())));
    Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
    DevArray->setName(DevArrayName);
    DeviceAllocations[ScopArray] = DevArray;
  }

  isl_ast_build_free(Build);
}

void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
                                        Value *BlockDimY, Value *BlockDimZ) {
  auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");

  for (auto &F : *M) {
    if (F.getCallingConv() != CallingConv::PTX_Kernel)
      continue;

    Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};

    Metadata *Elements[] = {
        ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
        ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
        ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
        ValueAsMetadata::get(V[2]),
    };
    MDNode *Node = MDNode::get(M->getContext(), Elements);
    AnnotationNode->addOperand(Node);
  }
}

void GPUNodeBuilder::freeDeviceArrays() {
  assert(!ManagedMemory && "Managed memory does not use device arrays");
  for (auto &Array : DeviceAllocations)
    createCallFreeDeviceMemory(Array.second);
}

Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *Name = "polly_getKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
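  // The declaration built below has the form i8 *polly_getKernel(i8 *Buffer,
  // i8 *Entry): createKernel() passes the embedded kernel assembly string as
  // Buffer and the kernel's entry-point name as Entry.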
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Buffer, Entry});
}

Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  const char *Name = "polly_getDevicePtr";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Allocation});
}

void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                                            Value *GridDimY, Value *BlockDimX,
                                            Value *BlockDimY, Value *BlockDimZ,
                                            Value *Parameters) {
  const char *Name = "polly_launchKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters});
}

void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel});
}

void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
                           "for device");
  const char *Name = "polly_freeDeviceMemory";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
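  // Declared below as void polly_freeDeviceMemory(i8 *Array); it releases a
  // device buffer previously returned by polly_allocateMemoryForDevice.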
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Array});
}

Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
                           "for device");
  const char *Name = "polly_allocateMemoryForDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Size});
}

void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                    Value *DeviceData,
                                                    Value *Size) {
  assert(!ManagedMemory && "Managed memory does not transfer memory between "
                           "device and host");
  const char *Name = "polly_copyFromHostToDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {HostData, DeviceData, Size});
}

void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                    Value *HostData,
                                                    Value *Size) {
  assert(!ManagedMemory && "Managed memory does not transfer memory between "
                           "device and host");
  const char *Name = "polly_copyFromDeviceToHost";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {DeviceData, HostData, Size});
}

void GPUNodeBuilder::createCallSynchronizeDevice() {
  assert(ManagedMemory && "explicit synchronization is only necessary for "
                          "managed memory");
  const char *Name = "polly_synchronizeDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
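  // Declared below as void polly_synchronizeDevice(void). It is only emitted
  // on the managed-memory path, before device results are consumed on the
  // host (see the "from_device" case in createUser).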
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F);
}

Value *GPUNodeBuilder::createCallInitContext() {
  const char *Name;

  switch (Runtime) {
  case GPURuntime::CUDA:
    Name = "polly_initContextCUDA";
    break;
  case GPURuntime::OpenCL:
    Name = "polly_initContextCL";
    break;
  }

  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {});
}

void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  const char *Name = "polly_freeContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Context});
}

/// Check if one string is a prefix of another.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
static bool isPrefix(std::string String, std::string Prefix) {
  return String.find(Prefix) == 0;
}

Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
  Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

  if (!gpu_array_is_scalar(Array)) {
    auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
    isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);

    for (unsigned int i = 1; i < Array->n_index; i++) {
      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
      isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
      Res = isl_ast_expr_mul(Res, Expr);
    }

    Value *NumElements = ExprBuilder.create(Res);
    if (NumElements->getType() != ArraySize->getType())
      NumElements = Builder.CreateSExt(NumElements, ArraySize->getType());
    ArraySize = Builder.CreateMul(ArraySize, NumElements);
  }
  isl_ast_build_free(Build);
  return ArraySize;
}

Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
  if (gpu_array_is_scalar(Array))
    return nullptr;

  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));

  isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));

  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
    ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);

  if (isl_set_is_subset(Min, ZeroSet)) {
    isl_set_free(Min);
    isl_set_free(ZeroSet);
    isl_ast_build_free(Build);
    return nullptr;
  }
  isl_set_free(ZeroSet);

  isl_ast_expr *Result =
      isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));

  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
    if (i > 0) {
      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
      isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
      Result = isl_ast_expr_mul(Result, BExpr);
    }
    isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
    isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
    Result = isl_ast_expr_add(Result, MExpr);
  }

  Value *ResultValue = ExprBuilder.create(Result);
  isl_set_free(Min);
  isl_ast_build_free(Build);

  return ResultValue;
}

Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
                                                     ScopArrayInfo *ArrayInfo) {

  assert(ManagedMemory && "Only used when you wish to get a host "
                          "pointer for sending data to the kernel, "
                          "with managed memory");
  std::map<ScopArrayInfo *, Value *>::iterator it;
  if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
    return it->second;
  } else {
    Value *HostPtr;

    if (gpu_array_is_scalar(Array))
      HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
    else
      HostPtr = ArrayInfo->getBasePtr();

    Value *Offset = getArrayOffset(Array);
    if (Offset) {
      HostPtr = Builder.CreatePointerCast(
          HostPtr, ArrayInfo->getElementType()->getPointerTo());
      HostPtr = Builder.CreateGEP(HostPtr, Offset);
    }

    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
    DeviceAllocations[ArrayInfo] = HostPtr;
    return HostPtr;
  }
}

void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                        enum DataDirection Direction) {
  assert(!ManagedMemory && "Managed memory needs no data transfers");
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
  isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(Arg);
  auto Array = (gpu_array_info *)isl_id_get_user(Id);
  auto ScopArray = (ScopArrayInfo *)(Array->user);

  Value *Size = getArraySize(Array);
  Value *Offset = getArrayOffset(Array);
  Value *DevPtr = DeviceAllocations[ScopArray];

  Value *HostPtr;

  if (gpu_array_is_scalar(Array))
    HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  else
    HostPtr = ScopArray->getBasePtr();

  if (Offset) {
    HostPtr = Builder.CreatePointerCast(
        HostPtr, ScopArray->getElementType()->getPointerTo());
    HostPtr = Builder.CreateGEP(HostPtr, Offset);
  }

  HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());

  if (Offset) {
    Size = Builder.CreateSub(
        Size, Builder.CreateMul(
                  Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
  }

  if (Direction == HOST_TO_DEVICE)
    createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
  else
    createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);

  isl_id_free(Id);
  isl_ast_expr_free(Arg);
  isl_ast_expr_free(Expr);
  isl_ast_node_free(TransferStmt);
}

void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);

  const char *Str = isl_id_get_name(Id);
  if (!strcmp(Str, "kernel")) {
    createKernel(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "to_device")) {
    if (!ManagedMemory)
      createDataTransfer(UserStmt, HOST_TO_DEVICE);
    else
      isl_ast_node_free(UserStmt);

    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "from_device")) {
    if (!ManagedMemory) {
      createDataTransfer(UserStmt, DEVICE_TO_HOST);
    } else {
      createCallSynchronizeDevice();
      isl_ast_node_free(UserStmt);
    }
    isl_ast_expr_free(Expr);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    createKernelCopy(KernelStmt);
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
  return;
}

void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
  isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
  LocalIndex = isl_ast_expr_address_of(LocalIndex);
  Value *LocalAddr = ExprBuilder.create(LocalIndex);
  isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
  Index = isl_ast_expr_address_of(Index);
  Value *GlobalAddr = ExprBuilder.create(Index);

  if (KernelStmt->u.c.read) {
    LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read");
    Builder.CreateStore(Load, LocalAddr);
  } else {
    LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write");
    Builder.CreateStore(Load, GlobalAddr);
  }
}

void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LTS;
  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LTS);

  if (Stmt->isBlockStmt())
    BlockGen.copyStmt(*Stmt, LTS, Indexes);
  else
    RegionGen.copyStmt(*Stmt, LTS, Indexes);
}

void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  Function *Sync;

  switch (Arch) {
  case GPUArch::NVPTX64:
    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
    break;
  }

  Builder.CreateCall(Sync, {});
}

/// Collect llvm::Values referenced from @p Node.
///
/// This function only applies to isl_ast_nodes that are user_nodes referring
/// to a ScopStmt. All other node types are ignored.
///
/// @param Node The node to collect references for.
/// @param User A user pointer used as storage for the data that is collected.
///
/// @returns isl_bool_true if data could be collected successfully.
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);

  return isl_bool_true;
}

/// Check if F is a function that we can code-generate in a GPU kernel.
static bool isValidFunctionInKernel(llvm::Function *F) {
  assert(F && "F is an invalid pointer");
  // We string compare against the name of the function to allow
  // all variants of the intrinsic "llvm.sqrt.*"
  return F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
}

/// Do not take `Function` as a subtree value.
///
/// We try to take the reference of all subtree values and pass them along
/// to the kernel from the host. Taking an address of any function and
/// trying to pass along is nonsensical. Only allow `Value`s that are not
/// `Function`s.
static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }

/// Return `Function`s from `RawSubtreeValues`.
static SetVector<Function *>
getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
  SetVector<Function *> SubtreeFunctions;
  for (Value *It : RawSubtreeValues) {
    Function *F = dyn_cast<Function>(It);
    if (F) {
      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
                                           "this point if an invalid function "
                                           "were present in a kernel.");
      SubtreeFunctions.insert(F);
    }
  }
  return SubtreeFunctions;
}

std::pair<SetVector<Value *>, SetVector<Function *>>
GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI->getBasePtr());

  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
  // SubtreeValues. This is important, because we should not lose any
  // SubtreeValues in the process of constructing the
  // ValidSubtree{Values, Functions} sets. Nor should the sets
  // ValidSubtree{Values, Functions} have any common element.
  auto ValidSubtreeValuesIt =
      make_filter_range(SubtreeValues, isValidSubtreeValue);
  SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
                                        ValidSubtreeValuesIt.end());
  SetVector<Function *> ValidSubtreeFunctions(
      getFunctionsFromRawSubtreeValues(SubtreeValues));

  return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
}

void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;
  isl_ast_build *Context = isl_ast_build_from_context(S.getContext());

  for (long i = 0; i < Kernel->n_grid; i++) {
    isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i);
    isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size);
    Value *Res = ExprBuilder.create(GridSize);
    Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
    Sizes.push_back(Res);
  }
  isl_ast_build_free(Context);

  for (long i = Kernel->n_grid; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1]);
}

std::tuple<Value *, Value *, Value *>
GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;

  for (long i = 0; i < Kernel->n_block; i++) {
    Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
    Sizes.push_back(Res);
  }

  for (long i = Kernel->n_block; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
}

void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters,
                                          Instruction *Param, int Index) {
  Value *Slot = Builder.CreateGEP(
      Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
  Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
  Builder.CreateStore(ParamTyped, Slot);
}

Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                       SetVector<Value *> SubtreeValues) {
  const int NumArgs = F->arg_size();
  std::vector<int> ArgSizes(NumArgs);

  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);

  BasicBlock *EntryBlock =
      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
  std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  Instruction *Parameters = new AllocaInst(
      ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());

  int Index = 0;
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);

    ArgSizes[Index] = SAI->getElemSizeInBytes();

    Value *DevArray = nullptr;
    if (ManagedMemory) {
      DevArray = getOrCreateManagedDeviceArray(
          &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
    } else {
      DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
      DevArray = createCallGetDevicePtr(DevArray);
    }
    assert(DevArray != nullptr && "Array to be offloaded to device not "
                                  "initialized");
    Value *Offset = getArrayOffset(&Prog->array[i]);

    if (Offset) {
      DevArray = Builder.CreatePointerCast(
          DevArray, SAI->getElementType()->getPointerTo());
      DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
      DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
    }
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Value *ValPtr = nullptr;
      if (ManagedMemory)
        ValPtr = DevArray;
      else
        ValPtr = BlockGen.getOrCreateAlloca(SAI);

      assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
                                  " to be stored into Parameters");
      Value *ValPtrCast =
          Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
      Builder.CreateStore(ValPtrCast, Slot);
    } else {
      Instruction *Param =
          new AllocaInst(Builder.getInt8PtrTy(), AddressSpace,
                         Launch + "_param_" + std::to_string(Index),
                         EntryBlock->getTerminator());
      Builder.CreateStore(DevArray, Param);
      Value *ParamTyped =
          Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
      Builder.CreateStore(ParamTyped, Slot);
    }
    Index++;
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);

    ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);

    ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  for (auto Val : SubtreeValues) {
    ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
"_param_" + std::to_string(Index), 1413 EntryBlock->getTerminator()); 1414 Builder.CreateStore(Val, Param); 1415 insertStoreParameter(Parameters, Param, Index); 1416 Index++; 1417 } 1418 1419 for (int i = 0; i < NumArgs; i++) { 1420 Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); 1421 Instruction *Param = 1422 new AllocaInst(Builder.getInt32Ty(), AddressSpace, 1423 Launch + "_param_size_" + std::to_string(i), 1424 EntryBlock->getTerminator()); 1425 Builder.CreateStore(Val, Param); 1426 insertStoreParameter(Parameters, Param, Index); 1427 Index++; 1428 } 1429 1430 auto Location = EntryBlock->getTerminator(); 1431 return new BitCastInst(Parameters, Builder.getInt8PtrTy(), 1432 Launch + "_params_i8ptr", Location); 1433 } 1434 1435 void GPUNodeBuilder::setupKernelSubtreeFunctions( 1436 SetVector<Function *> SubtreeFunctions) { 1437 for (auto Fn : SubtreeFunctions) { 1438 const std::string ClonedFnName = Fn->getName(); 1439 Function *Clone = GPUModule->getFunction(ClonedFnName); 1440 if (!Clone) 1441 Clone = 1442 Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage, 1443 ClonedFnName, GPUModule.get()); 1444 assert(Clone && "Expected cloned function to be initialized."); 1445 assert(ValueMap.find(Fn) == ValueMap.end() && 1446 "Fn already present in ValueMap"); 1447 ValueMap[Fn] = Clone; 1448 } 1449 } 1450 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 1451 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 1452 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 1453 isl_id_free(Id); 1454 isl_ast_node_free(KernelStmt); 1455 1456 if (Kernel->n_grid > 1) 1457 DeepestParallel = 1458 std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set)); 1459 else 1460 DeepestSequential = 1461 std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set)); 1462 1463 Value *BlockDimX, *BlockDimY, *BlockDimZ; 1464 std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); 1465 1466 SetVector<Value *> SubtreeValues; 1467 SetVector<Function *> SubtreeFunctions; 1468 std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel); 1469 1470 assert(Kernel->tree && "Device AST of kernel node is empty"); 1471 1472 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 1473 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 1474 ValueMapT HostValueMap = ValueMap; 1475 BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; 1476 ScalarMap.clear(); 1477 1478 SetVector<const Loop *> Loops; 1479 1480 // Create for all loops we depend on values that contain the current loop 1481 // iteration. These values are necessary to generate code for SCEVs that 1482 // depend on such loops. As a result we need to pass them to the subfunction. 
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
  setupKernelSubtreeFunctions(SubtreeFunctions);

  create(isl_ast_node_copy(Kernel->tree));

  finalizeKernelArguments(Kernel);
  Function *F = Builder.GetInsertBlock()->getParent();
  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  IDToValue = HostIDs;

  ValueMap = std::move(HostValueMap);
  ScalarMap = std::move(HostScalarMap);
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array);
  LocalArrays.clear();

  std::string ASMString = finalizeKernelFunction();
  Builder.SetInsertPoint(&HostInsertPoint);
  Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);

  std::string Name = "kernel_" + std::to_string(Kernel->id);
  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  Value *GPUKernel = createCallGetKernel(KernelString, NameString);

  Value *GridDimX, *GridDimY;
  std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);

  createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters);
  createCallFreeKernel(GPUKernel);

  for (auto Id : KernelIds)
    isl_id_free(Id);

  KernelIds.clear();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "";

  if (!is64Bit) {
    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
           "64-v128:128:128-n16:32:64";
  } else {
    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
           "64-v128:128:128-n16:32:64";
  }

  return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
      const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
      Args.push_back(SAI->getElementType());
    } else {
      static const int UseGlobalMemory = 1;
      Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
    }
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Args.push_back(Val->getType());
  }

  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());

  switch (Arch) {
  case GPUArch::NVPTX64:
    FN->setCallingConv(CallingConv::PTX_Kernel);
    break;
  }

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    Sizes.push_back(nullptr);
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAIRep;
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    Value *Val = IDToValue[Id];
    ValueMap[Val] = &*Arg;
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}

void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[2];
  Intrinsic::ID IntrinsicsTID[3];

  switch (Arch) {
  case GPUArch::NVPTX64:
    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;

    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
    break;
  }

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}

void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    Value *Val = &*Arg;

    if (!gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Type *TypePtr = SAI->getElementType()->getPointerTo();
      Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr);
      Val = Builder.CreateLoad(TypedArgPtr);
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Builder.CreateStore(Val, Alloca);

    Arg++;
  }
}

void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  auto *FN = Builder.GetInsertBlock()->getParent();
  auto Arg = FN->arg_begin();

  bool StoredScalar = false;
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Arg++;
      continue;
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Value *ArgPtr = &*Arg;
    Type *TypePtr = SAI->getElementType()->getPointerTo();
    Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
    Value *Val = Builder.CreateLoad(Alloca);
    Builder.CreateStore(Val, TypedArgPtr);
    StoredScalar = true;

    Arg++;
  }

  if (StoredScalar)
    /// In case more than one thread contains scalar stores, the generated
    /// code might be incorrect, if we only store at the end of the kernel.
    /// To support this case we need to store these scalars back at each
    /// memory store or at least before each kernel barrier.
    if (Kernel->n_block != 0 || Kernel->n_grid != 0)
      BuildSuccessful = 0;
}

void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  for (int i = 0; i < Kernel->n_var; ++i) {
    struct ppcg_kernel_var &Var = Kernel->var[i];
    isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
    Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType();

    Type *ArrayTy = EleTy;
    SmallVector<const SCEV *, 4> Sizes;

    Sizes.push_back(nullptr);
    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
    }

    for (int j = Var.array->n_index - 1; j >= 0; --j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      ArrayTy = ArrayType::get(ArrayTy, Bound);
    }

    const ScopArrayInfo *SAI;
    Value *Allocation;
    if (Var.type == ppcg_access_shared) {
      auto GlobalVar = new GlobalVariable(
          *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
          nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
      GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
      GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));

      Allocation = GlobalVar;
    } else if (Var.type == ppcg_access_private) {
      Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
    } else {
      llvm_unreachable("unknown variable type");
    }
    SAI =
        S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
    Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
    IDToValue[Id] = Allocation;
    LocalArrays.push_back(Allocation);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAI;
  }
}

void GPUNodeBuilder::createKernelFunction(
    ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
    SetVector<Function *> &SubtreeFunctions) {
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));

  switch (Arch) {
  case GPUArch::NVPTX64:
    if (Runtime == GPURuntime::CUDA)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
    else if (Runtime == GPURuntime::OpenCL)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
    break;
  }

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());
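
  // Mark the newly created function as invalid for the ScopDetection, so
  // that Polly does not attempt to treat the generated kernel code as a
  // SCoP of its own.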
  ScopDetection::markFunctionAsInvalid(FN);

  prepareKernelArguments(Kernel, FN);
  createKernelVariables(Kernel, FN);
  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple;

  switch (Arch) {
  case GPUArch::NVPTX64:
    switch (Runtime) {
    case GPURuntime::CUDA:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
      break;
    case GPURuntime::OpenCL:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
      break;
    }
    break;
  }

  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;

  std::string subtarget;

  switch (Arch) {
  case GPUArch::NVPTX64:
    subtarget = CudaVersion;
    break;
  }

  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str();
}

std::string GPUNodeBuilder::finalizeKernelFunction() {

  if (verifyModule(*GPUModule)) {
    DEBUG(dbgs() << "verifyModule failed on module:\n";
          GPUModule->print(dbgs(), nullptr); dbgs() << "\n";);

    if (FailOnVerifyModuleFailure)
      llvm_unreachable("VerifyModule failed.");

    BuildSuccessful = false;
    return "";
  }

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  // Optimize module.
  llvm::legacy::PassManager OptPasses;
  PassManagerBuilder PassBuilder;
  PassBuilder.OptLevel = 3;
  PassBuilder.SizeLevel = 0;
  PassBuilder.populateModulePassManager(OptPasses);
  OptPasses.run(*GPUModule);

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  GPUModule.release();
  KernelIDs.clear();

  return Assembly;
}

namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  GPURuntime Runtime = GPURuntime::CUDA;

  GPUArch Architecture = GPUArch::NVPTX64;

  /// The scop that is currently processed.
  Scop *S;

  LoopInfo *LI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  const DataLayout *DL;
  RegionInfo *RI;

  PPCGCodeGeneration() : ScopPass(ID) {}

  /// Construct compilation options for PPCG.
  ///
  /// @returns The compilation options.
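  ///
  /// Most of the fields are fixed defaults; whether private and shared
  /// memory are used is controlled by the -polly-acc-use-private and
  /// -polly-acc-use-shared flags defined above.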
  ppcg_options *createPPCGOptions() {
    auto DebugOptions =
        (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
    auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));

    DebugOptions->dump_schedule_constraints = false;
    DebugOptions->dump_schedule = false;
    DebugOptions->dump_final_schedule = false;
    DebugOptions->dump_sizes = false;
    DebugOptions->verbose = false;

    Options->debug = DebugOptions;

    Options->reschedule = true;
    Options->scale_tile_loops = false;
    Options->wrap = false;

    Options->non_negative_parameters = false;
    Options->ctx = nullptr;
    Options->sizes = nullptr;

    Options->tile_size = 32;

    Options->use_private_memory = PrivateMemory;
    Options->use_shared_memory = SharedMemory;
    Options->max_shared_memory = 48 * 1024;

    Options->target = PPCG_TARGET_CUDA;
    Options->openmp = false;
    Options->linearize_device_arrays = true;
    Options->live_range_reordering = false;

    Options->opencl_compiler_options = nullptr;
    Options->opencl_use_gpu = false;
    Options->opencl_n_include_file = 0;
    Options->opencl_include_files = nullptr;
    Options->opencl_print_kernel_types = false;
    Options->opencl_embed_kernel_code = false;

    Options->save_schedule_file = nullptr;
    Options->load_schedule_file = nullptr;

    return Options;
  }

  /// Get a tagged access relation containing all accesses of type @p AccessTy.
  ///
  /// Instead of a normal access of the form:
  ///
  ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// a tagged access has the form
  ///
  ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// where 'id' is an additional space that references the memory access that
  /// triggered the access.
  ///
  /// @param AccessTy The type of the memory accesses to collect.
  ///
  /// @return The relation describing all tagged memory accesses.
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }

  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) write accesses, tagged with the
  /// access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }

  /// Get the set of all must-write accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation, each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns A map from the collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop. From
  /// this initial data, the data-dependences in the PPCG scop are initialized.
  /// We do not use Polly's dependence analysis for now, to ensure we match
  /// the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
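  ///
  /// Note that each access is prepended to the resulting linked list, so the
  /// list holds the accesses of @p Stmt in reverse order.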
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = !Acc->isMayWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// @returns A linked-list of statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

  /// Derive the extent of an array.
  ///
  /// The extent of an array is the set of elements that are within the
  /// accessed array. For the inner dimensions, the extent constraints are
  /// 0 and the size of the corresponding array dimension. For the first
  /// (outermost) dimension, the extent constraints are the minimal and maximal
  /// subscript value for the first dimension.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
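  ///
  /// For illustration, assume a (hypothetical) 1024x512 array A that is
  /// accessed as A[i][j] with 3 <= i <= 1022 and 0 <= j < 300. The derived
  /// extent would be
  ///
  ///   { A[o0, o1] : 3 <= o0 <= 1022 and 0 <= o1 <= 511 }
  ///
  /// i.e., the outermost dimension is bounded by the minimal and maximal
  /// subscripts that actually occur, while the inner dimension is bounded by
  /// zero and the array size.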
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    unsigned NumDims = Array->getNumberOfDimensions();
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    Accesses = isl_union_map_detect_equalities(Accesses);
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    AccessUSet = isl_union_set_coalesce(AccessUSet);
    AccessUSet = isl_union_set_detect_equalities(AccessUSet);
    AccessUSet = isl_union_set_coalesce(AccessUSet);

    if (isl_union_set_is_empty(AccessUSet)) {
      isl_union_set_free(AccessUSet);
      return isl_set_empty(Array->getSpace());
    }

    if (Array->getNumberOfDimensions() == 0) {
      isl_union_set_free(AccessUSet);
      return isl_set_universe(Array->getSpace());
    }

    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());

    isl_union_set_free(AccessUSet);
    isl_local_space *LS = isl_local_space_from_space(Array->getSpace());

    isl_pw_aff *Val =
        isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0));

    isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0);
    isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0);
    OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMin =
        isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId());
    OuterMax =
        isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId());

    isl_set *Extent = isl_set_universe(Array->getSpace());

    Extent = isl_set_intersect(
        Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val)));
    Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val));

    for (unsigned i = 1; i < NumDims; ++i)
      Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0);

    for (unsigned i = 0; i < NumDims; ++i) {
      isl_pw_aff *PwAff =
          const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i));

      // The isl_pw_aff is usually NULL for the outermost (zeroth) dimension.
      // Only in the case of a Fortran array do we have a legitimate size for
      // it.
      if (!PwAff) {
        assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension");
        continue;
      }

      isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(
          isl_local_space_from_space(Array->getSpace()), isl_dim_set, i));
      PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in,
                                  isl_pw_aff_dim(Val, isl_dim_in));
      PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in,
                                      isl_pw_aff_get_tuple_id(Val, isl_dim_in));
      auto *Set = isl_pw_aff_gt_set(PwAff, Val);
      Extent = isl_set_intersect(Set, Extent);
    }

    return Extent;
  }

  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly
  /// from ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array     The polly array from which to take the information.
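  ///
  /// Continuing the illustration from getExtent, bound[0] would be the
  /// piecewise-affine expression 1023 (the maximal subscript of the first
  /// dimension plus one) and bound[1] would be the array size 512.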
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      if (isl_set_is_empty(PPCGArray.extent)) {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        isl_local_space *LS = isl_local_space_from_space(
            isl_space_params(isl_set_get_space(Dom)));
        isl_set_free(Dom);
        isl_aff *Zero = isl_aff_zero_on_domain(LS);
        PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
      } else {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
        isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
        isl_set_free(Dom);
        Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
        isl_local_space *LS =
            isl_local_space_from_space(isl_set_get_space(Dom));
        isl_aff *One = isl_aff_zero_on_domain(LS);
        One = isl_aff_add_constant_si(One, 1);
        Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
        Bound = isl_pw_aff_gist(Bound, S->getContext());
        PPCGArray.bound[0] = Bound;
      }
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Array : S->arrays()) {
      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar =
          Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      PPCGArray.user = Array;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Array : S->arrays()) {
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new gpu program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    PPCGProg->may_persist = compute_may_persist(PPCGProg);

    return PPCGProg;
  }

  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P       The printer to print to
  /// @param Options The printing options to use
  /// @param Node    The node to print
  /// @param User    A user pointer to carry additional data. This pointer is
  ///                expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

  /// Print C code corresponding to the GPU code described by @p Tree.
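  ///
  /// The host AST is printed first (prefixed with '# host'), followed by the
  /// AST of each kernel that was encountered while printing the host code
  /// (each prefixed with '# kernel<id>').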
  ///
  /// @param Tree     An AST describing GPU code
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  //  1) Compute a new schedule for the program.
  //  2) Map the schedule to the GPU.
  //  3) Generate code for the new schedule.
  //
  // We do not use the Polly ScheduleOptimizer here, as the schedule optimizer
  // is mostly CPU specific. Instead, we use PPCG's GPU code generation
  // strategy directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set the scheduling strategy to the same strategy PPCG is using.
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);

    isl_schedule *Schedule = get_schedule(PPCGGen);

    int has_permutable = has_any_permutable_node(Schedule);

    if (!has_permutable || has_permutable < 0) {
      Schedule = isl_schedule_free(Schedule);
    } else {
      Schedule = map_to_device(PPCGGen, Schedule);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }

    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");

      printf("%s\n", isl_printer_get_str(P));
      isl_printer_free(P);
    }

    if (DumpCode) {
      printf("Code\n");
      printf("====\n");
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        printf("No code generated\n");
    }

    isl_schedule_free(Schedule);

    return PPCGGen;
  }

  /// Free a gpu_gen structure.
  ///
  /// @param PPCGGen The gpu_gen object to free.
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }

  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg does not free these options for us. To avoid leaks we do this
  /// ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Approximate the number of points in the set.
  ///
  /// This function returns an ast expression that overapproximates the number
  /// of points in an isl set through the rectangular hull surrounding this
  /// set.
  ///
  /// @param Set   The set to count.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  ///
  /// @returns An approximation of the number of points in the set.
  __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
                                             __isl_keep isl_ast_build *Build) {

    isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
    auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));

    isl_space *Space = isl_set_get_space(Set);
    Space = isl_space_params(Space);
    auto *Univ = isl_set_universe(Space);
    isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);

    for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
      isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
      isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
      isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
      DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
      auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
      Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
    }

    isl_set_free(Set);
    isl_pw_aff_free(OneAff);

    return Expr;
  }

  /// Approximate the number of dynamic instructions executed by a given
  /// statement.
  ///
  /// @param Stmt  The statement for which to compute the number of dynamic
  ///              instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An approximation of the number of dynamic instructions executed
  ///          by @p Stmt.
  __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
                                             __isl_keep isl_ast_build *Build) {
    auto Iterations = approxPointsInSet(Stmt.getDomain(), Build);

    long InstCount = 0;

    if (Stmt.isBlockStmt()) {
      auto *BB = Stmt.getBasicBlock();
      InstCount = std::distance(BB->begin(), BB->end());
    } else {
      auto *R = Stmt.getRegion();

      for (auto *BB : R->blocks()) {
        InstCount += std::distance(BB->begin(), BB->end());
      }
    }

    isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
    auto *InstExpr = isl_ast_expr_from_val(InstVal);
    return isl_ast_expr_mul(InstExpr, Iterations);
  }

  /// Approximate the dynamic instructions executed in the scop.
  ///
  /// @param S     The scop for which to approximate dynamic instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An approximation of the number of dynamic instructions executed
  ///          in @p S.
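  ///
  /// The approximation is the sum, over all statements, of the statement's
  /// instruction count multiplied by the rectangular over-approximation of
  /// the number of points in its iteration domain (see approxDynamicInst and
  /// approxPointsInSet).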
  __isl_give isl_ast_expr *
  getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
    isl_ast_expr *Instructions;

    isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
    Instructions = isl_ast_expr_from_val(Zero);

    for (ScopStmt &Stmt : S) {
      isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
      Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
    }
    return Instructions;
  }

  /// Create a check that ensures sufficient compute in the scop.
  ///
  /// @param S     The scop for which to ensure sufficient compute.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An expression that evaluates to TRUE in case of sufficient
  ///          compute and to FALSE otherwise.
  __isl_give isl_ast_expr *
  createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
    auto Iterations = getNumberOfIterations(S, Build);
    auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
    auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
    return isl_ast_expr_ge(Iterations, MinComputeExpr);
  }

  /// Check if the basic block contains a function we cannot codegen for GPU
  /// kernels.
  ///
  /// If this basic block does something with a `Function` other than calling
  /// a function that we support in a kernel, return true.
  bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB) {
    for (const Instruction &Inst : *BB) {
      const CallInst *Call = dyn_cast<CallInst>(&Inst);
      if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
        continue;
      }

      for (Value *SrcVal : Inst.operands()) {
        PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
        if (!p)
          continue;
        if (isa<FunctionType>(p->getElementType()))
          return true;
      }
    }
    return false;
  }

  /// Return whether the Scop S uses functions in a way that we do not support.
  bool containsInvalidKernelFunction(const Scop &S) {
    for (auto &Stmt : S) {
      if (Stmt.isBlockStmt()) {
        if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock()))
          return true;
      } else {
        assert(Stmt.isRegionStmt() &&
               "Stmt was neither block nor region statement");
        for (const BasicBlock *BB : Stmt.getRegion()->blocks())
          if (containsInvalidKernelFunctionInBlock(BB))
            return true;
      }
    }
    return false;
  }

  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU Program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important as the conditional
    // branch will guard the original scop from new induction variables that
    // the SCEVExpander may introduce while code generating the parameters and
    // which may introduce scalar dependences that prevent us from correctly
    // code generating this scop.
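    //
    // executeScopConditionally() introduces this branch with a constant-true
    // condition; the real run-time check (RTC) computed below replaces that
    // constant once all parameters have been added.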
    BBPair StartExitBlocks =
        executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
    BasicBlock *StartBlock = std::get<0>(StartExitBlocks);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
                               StartBlock, Prog, Runtime, Architecture);

    // TODO: Handle LICM
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());

    isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
    isl_ast_expr *Condition = IslAst::buildRunCondition(*S, Build);
    isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
    Condition = isl_ast_expr_and(Condition, SufficientCompute);
    isl_ast_build_free(Build);

    Value *RTC = NodeBuilder.createRTC(Condition);
    Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

    Builder.SetInsertPoint(&*StartBlock->begin());

    NodeBuilder.initializeAfterRTH();
    NodeBuilder.create(Root);
    NodeBuilder.finalize();

    /// In case a sequential kernel has more surrounding loops than any
    /// parallel kernel, the SCoP is probably mostly sequential. Hence, there
    /// is no point in running it on a GPU.
    if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
      SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());

    if (!NodeBuilder.BuildSuccessful)
      SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());
  }

  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support functions other than intrinsics inside
    // kernels, as code generation will need to offload function calls to the
    // kernel. This may lead to a kernel trying to call a function on the host.
    // This also allows us to prevent codegen from trying to take the
    // address of an intrinsic function to send to the kernel.
    if (containsInvalidKernelFunction(CurrentScop)) {
      DEBUG(
          dbgs()
          << "Scop contains function which cannot be materialised in a GPU "
             "kernel. Bailing out.\n";);
      return false;
    }

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    if (PPCGGen->tree) {
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
      CurrentScop.markAsToBeSkipped();
    }

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetectionWrapperPass>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<ScopDetectionWrapperPass>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
} // namespace

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
  generator->Runtime = Runtime;
  generator->Architecture = Arch;
  return generator;
}

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)