1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg 11 // GPU mapping strategy. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "polly/CodeGen/IslNodeBuilder.h" 16 #include "polly/CodeGen/Utils.h" 17 #include "polly/DependenceInfo.h" 18 #include "polly/LinkAllPasses.h" 19 #include "polly/Options.h" 20 #include "polly/ScopDetection.h" 21 #include "polly/ScopInfo.h" 22 #include "polly/Support/SCEVValidator.h" 23 #include "llvm/ADT/PostOrderIterator.h" 24 #include "llvm/Analysis/AliasAnalysis.h" 25 #include "llvm/Analysis/BasicAliasAnalysis.h" 26 #include "llvm/Analysis/GlobalsModRef.h" 27 #include "llvm/Analysis/PostDominators.h" 28 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 29 #include "llvm/Analysis/TargetLibraryInfo.h" 30 #include "llvm/Analysis/TargetTransformInfo.h" 31 #include "llvm/IR/LegacyPassManager.h" 32 #include "llvm/IR/Verifier.h" 33 #include "llvm/Support/TargetRegistry.h" 34 #include "llvm/Support/TargetSelect.h" 35 #include "llvm/Target/TargetMachine.h" 36 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 37 38 #include "isl/union_map.h" 39 40 extern "C" { 41 #include "ppcg/cuda.h" 42 #include "ppcg/gpu.h" 43 #include "ppcg/gpu_print.h" 44 #include "ppcg/ppcg.h" 45 #include "ppcg/schedule.h" 46 } 47 48 #include "llvm/Support/Debug.h" 49 50 using namespace polly; 51 using namespace llvm; 52 53 #define DEBUG_TYPE "polly-codegen-ppcg" 54 55 static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule", 56 cl::desc("Dump the computed GPU Schedule"), 57 cl::Hidden, cl::init(false), cl::ZeroOrMore, 58 cl::cat(PollyCategory)); 59 60 static cl::opt<bool> 61 DumpCode("polly-acc-dump-code", 62 cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, 63 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 64 65 static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir", 66 cl::desc("Dump the kernel LLVM-IR"), 67 cl::Hidden, cl::init(false), cl::ZeroOrMore, 68 cl::cat(PollyCategory)); 69 70 static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm", 71 cl::desc("Dump the kernel assembly code"), 72 cl::Hidden, cl::init(false), cl::ZeroOrMore, 73 cl::cat(PollyCategory)); 74 75 static cl::opt<bool> FastMath("polly-acc-fastmath", 76 cl::desc("Allow unsafe math optimizations"), 77 cl::Hidden, cl::init(false), cl::ZeroOrMore, 78 cl::cat(PollyCategory)); 79 80 static cl::opt<std::string> 81 CudaVersion("polly-acc-cuda-version", 82 cl::desc("The CUDA version to compile for"), cl::Hidden, 83 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); 84 85 /// Create the ast expressions for a ScopStmt. 86 /// 87 /// This function is a callback for to generate the ast expressions for each 88 /// of the scheduled ScopStmts. 89 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 90 void *StmtT, isl_ast_build *Build, 91 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 92 isl_id *Id, void *User), 93 void *UserIndex, 94 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 95 void *UserExpr) { 96 97 ScopStmt *Stmt = (ScopStmt *)StmtT; 98 99 isl_ctx *Ctx; 100 101 if (!Stmt || !Build) 102 return NULL; 103 104 Ctx = isl_ast_build_get_ctx(Build); 105 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 106 107 for (MemoryAccess *Acc : *Stmt) { 108 isl_map *AddrFunc = Acc->getAddressFunction(); 109 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 110 isl_id *RefId = Acc->getId(); 111 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 112 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 113 MPA = isl_multi_pw_aff_coalesce(MPA); 114 MPA = FunctionIndex(MPA, RefId, UserIndex); 115 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 116 Access = FunctionExpr(Access, RefId, UserExpr); 117 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 118 } 119 120 return RefToExpr; 121 } 122 123 /// Generate code for a GPU specific isl AST. 124 /// 125 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 126 /// generates code for general-prupose AST nodes, with special functionality 127 /// for generating GPU specific user nodes. 128 /// 129 /// @see GPUNodeBuilder::createUser 130 class GPUNodeBuilder : public IslNodeBuilder { 131 public: 132 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P, 133 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 134 DominatorTree &DT, Scop &S, gpu_prog *Prog) 135 : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) { 136 getExprBuilder().setIDToSAI(&IDToSAI); 137 } 138 139 /// Create after-run-time-check initialization code. 140 void initializeAfterRTH(); 141 142 /// Finalize the generated scop. 143 virtual void finalize(); 144 145 private: 146 /// A vector of array base pointers for which a new ScopArrayInfo was created. 147 /// 148 /// This vector is used to delete the ScopArrayInfo when it is not needed any 149 /// more. 150 std::vector<Value *> LocalArrays; 151 152 /// A map from ScopArrays to their corresponding device allocations. 153 std::map<ScopArrayInfo *, Value *> DeviceAllocations; 154 155 /// The current GPU context. 156 Value *GPUContext; 157 158 /// A module containing GPU code. 159 /// 160 /// This pointer is only set in case we are currently generating GPU code. 161 std::unique_ptr<Module> GPUModule; 162 163 /// The GPU program we generate code for. 164 gpu_prog *Prog; 165 166 /// Class to free isl_ids. 167 class IslIdDeleter { 168 public: 169 void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 170 }; 171 172 /// A set containing all isl_ids allocated in a GPU kernel. 173 /// 174 /// By releasing this set all isl_ids will be freed. 175 std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 176 177 IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 178 179 /// Create code for user-defined AST nodes. 180 /// 181 /// These AST nodes can be of type: 182 /// 183 /// - ScopStmt: A computational statement (TODO) 184 /// - Kernel: A GPU kernel call (TODO) 185 /// - Data-Transfer: A GPU <-> CPU data-transfer 186 /// - In-kernel synchronization 187 /// - In-kernel memory copy statement 188 /// 189 /// @param UserStmt The ast node to generate code for. 190 virtual void createUser(__isl_take isl_ast_node *UserStmt); 191 192 enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; 193 194 /// Create code for a data transfer statement 195 /// 196 /// @param TransferStmt The data transfer statement. 197 /// @param Direction The direction in which to transfer data. 198 void createDataTransfer(__isl_take isl_ast_node *TransferStmt, 199 enum DataDirection Direction); 200 201 /// Find llvm::Values referenced in GPU kernel. 202 /// 203 /// @param Kernel The kernel to scan for llvm::Values 204 /// 205 /// @returns A set of values referenced by the kernel. 206 SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); 207 208 /// Compute the sizes of the execution grid for a given kernel. 209 /// 210 /// @param Kernel The kernel to compute grid sizes for. 211 /// 212 /// @returns A tuple with grid sizes for X and Y dimension 213 std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel); 214 215 /// Compute the sizes of the thread blocks for a given kernel. 216 /// 217 /// @param Kernel The kernel to compute thread block sizes for. 218 /// 219 /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. 220 std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel); 221 222 /// Create kernel launch parameters. 223 /// 224 /// @param Kernel The kernel to create parameters for. 225 /// @param F The kernel function that has been created. 226 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 227 /// 228 /// @returns A stack allocated array with pointers to the parameter 229 /// values that are passed to the kernel. 230 Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, 231 SetVector<Value *> SubtreeValues); 232 233 /// Create GPU kernel. 234 /// 235 /// Code generate the kernel described by @p KernelStmt. 236 /// 237 /// @param KernelStmt The ast node to generate kernel code for. 238 void createKernel(__isl_take isl_ast_node *KernelStmt); 239 240 /// Generate code that computes the size of an array. 241 /// 242 /// @param Array The array for which to compute a size. 243 Value *getArraySize(gpu_array_info *Array); 244 245 /// Prepare the kernel arguments for kernel code generation 246 /// 247 /// @param Kernel The kernel to generate code for. 248 /// @param FN The function created for the kernel. 249 void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); 250 251 /// Create kernel function. 252 /// 253 /// Create a kernel function located in a newly created module that can serve 254 /// as target for device code generation. Set the Builder to point to the 255 /// start block of this newly created function. 256 /// 257 /// @param Kernel The kernel to generate code for. 258 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 259 void createKernelFunction(ppcg_kernel *Kernel, 260 SetVector<Value *> &SubtreeValues); 261 262 /// Create the declaration of a kernel function. 263 /// 264 /// The kernel function takes as arguments: 265 /// 266 /// - One i8 pointer for each external array reference used in the kernel. 267 /// - Host iterators 268 /// - Parameters 269 /// - Other LLVM Value references (TODO) 270 /// 271 /// @param Kernel The kernel to generate the function declaration for. 272 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 273 /// 274 /// @returns The newly declared function. 275 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 276 SetVector<Value *> &SubtreeValues); 277 278 /// Insert intrinsic functions to obtain thread and block ids. 279 /// 280 /// @param The kernel to generate the intrinsic functions for. 281 void insertKernelIntrinsics(ppcg_kernel *Kernel); 282 283 /// Create code for a ScopStmt called in @p Expr. 284 /// 285 /// @param Expr The expression containing the call. 286 /// @param KernelStmt The kernel statement referenced in the call. 287 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 288 289 /// Create an in-kernel synchronization call. 290 void createKernelSync(); 291 292 /// Create a PTX assembly string for the current GPU kernel. 293 /// 294 /// @returns A string containing the corresponding PTX assembly code. 295 std::string createKernelASM(); 296 297 /// Remove references from the dominator tree to the kernel function @p F. 298 /// 299 /// @param F The function to remove references to. 300 void clearDominators(Function *F); 301 302 /// Remove references from scalar evolution to the kernel function @p F. 303 /// 304 /// @param F The function to remove references to. 305 void clearScalarEvolution(Function *F); 306 307 /// Remove references from loop info to the kernel function @p F. 308 /// 309 /// @param F The function to remove references to. 310 void clearLoops(Function *F); 311 312 /// Finalize the generation of the kernel function. 313 /// 314 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 315 /// dump its IR to stderr. 316 /// 317 /// @returns The Assembly string of the kernel. 318 std::string finalizeKernelFunction(); 319 320 /// Create code that allocates memory to store arrays on device. 321 void allocateDeviceArrays(); 322 323 /// Free all allocated device arrays. 324 void freeDeviceArrays(); 325 326 /// Create a call to initialize the GPU context. 327 /// 328 /// @returns A pointer to the newly initialized context. 329 Value *createCallInitContext(); 330 331 /// Create a call to get the device pointer for a kernel allocation. 332 /// 333 /// @param Allocation The Polly GPU allocation 334 /// 335 /// @returns The device parameter corresponding to this allocation. 336 Value *createCallGetDevicePtr(Value *Allocation); 337 338 /// Create a call to free the GPU context. 339 /// 340 /// @param Context A pointer to an initialized GPU context. 341 void createCallFreeContext(Value *Context); 342 343 /// Create a call to allocate memory on the device. 344 /// 345 /// @param Size The size of memory to allocate 346 /// 347 /// @returns A pointer that identifies this allocation. 348 Value *createCallAllocateMemoryForDevice(Value *Size); 349 350 /// Create a call to free a device array. 351 /// 352 /// @param Array The device array to free. 353 void createCallFreeDeviceMemory(Value *Array); 354 355 /// Create a call to copy data from host to device. 356 /// 357 /// @param HostPtr A pointer to the host data that should be copied. 358 /// @param DevicePtr A device pointer specifying the location to copy to. 359 void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr, 360 Value *Size); 361 362 /// Create a call to copy data from device to host. 363 /// 364 /// @param DevicePtr A pointer to the device data that should be copied. 365 /// @param HostPtr A host pointer specifying the location to copy to. 366 void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, 367 Value *Size); 368 369 /// Create a call to get a kernel from an assembly string. 370 /// 371 /// @param Buffer The string describing the kernel. 372 /// @param Entry The name of the kernel function to call. 373 /// 374 /// @returns A pointer to a kernel object 375 Value *createCallGetKernel(Value *Buffer, Value *Entry); 376 377 /// Create a call to free a GPU kernel. 378 /// 379 /// @param GPUKernel THe kernel to free. 380 void createCallFreeKernel(Value *GPUKernel); 381 382 /// Create a call to launch a GPU kernel. 383 /// 384 /// @param GPUKernel The kernel to launch. 385 /// @param GridDimX The size of the first grid dimension. 386 /// @param GridDimY The size of the second grid dimension. 387 /// @param GridBlockX The size of the first block dimension. 388 /// @param GridBlockY The size of the second block dimension. 389 /// @param GridBlockZ The size of the third block dimension. 390 /// @param Paramters A pointer to an array that contains itself pointers to 391 /// the parameter values passed for each kernel argument. 392 void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 393 Value *GridDimY, Value *BlockDimX, 394 Value *BlockDimY, Value *BlockDimZ, 395 Value *Parameters); 396 }; 397 398 void GPUNodeBuilder::initializeAfterRTH() { 399 GPUContext = createCallInitContext(); 400 allocateDeviceArrays(); 401 } 402 403 void GPUNodeBuilder::finalize() { 404 freeDeviceArrays(); 405 createCallFreeContext(GPUContext); 406 IslNodeBuilder::finalize(); 407 } 408 409 void GPUNodeBuilder::allocateDeviceArrays() { 410 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 411 412 for (int i = 0; i < Prog->n_array; ++i) { 413 gpu_array_info *Array = &Prog->array[i]; 414 auto *ScopArray = (ScopArrayInfo *)Array->user; 415 std::string DevArrayName("p_dev_array_"); 416 DevArrayName.append(Array->name); 417 418 Value *ArraySize = getArraySize(Array); 419 Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 420 DevArray->setName(DevArrayName); 421 DeviceAllocations[ScopArray] = DevArray; 422 } 423 424 isl_ast_build_free(Build); 425 } 426 427 void GPUNodeBuilder::freeDeviceArrays() { 428 for (auto &Array : DeviceAllocations) 429 createCallFreeDeviceMemory(Array.second); 430 } 431 432 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { 433 const char *Name = "polly_getKernel"; 434 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 435 Function *F = M->getFunction(Name); 436 437 // If F is not available, declare it. 438 if (!F) { 439 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 440 std::vector<Type *> Args; 441 Args.push_back(Builder.getInt8PtrTy()); 442 Args.push_back(Builder.getInt8PtrTy()); 443 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 444 F = Function::Create(Ty, Linkage, Name, M); 445 } 446 447 return Builder.CreateCall(F, {Buffer, Entry}); 448 } 449 450 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { 451 const char *Name = "polly_getDevicePtr"; 452 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 453 Function *F = M->getFunction(Name); 454 455 // If F is not available, declare it. 456 if (!F) { 457 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 458 std::vector<Type *> Args; 459 Args.push_back(Builder.getInt8PtrTy()); 460 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 461 F = Function::Create(Ty, Linkage, Name, M); 462 } 463 464 return Builder.CreateCall(F, {Allocation}); 465 } 466 467 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 468 Value *GridDimY, Value *BlockDimX, 469 Value *BlockDimY, Value *BlockDimZ, 470 Value *Parameters) { 471 const char *Name = "polly_launchKernel"; 472 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 473 Function *F = M->getFunction(Name); 474 475 // If F is not available, declare it. 476 if (!F) { 477 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 478 std::vector<Type *> Args; 479 Args.push_back(Builder.getInt8PtrTy()); 480 Args.push_back(Builder.getInt32Ty()); 481 Args.push_back(Builder.getInt32Ty()); 482 Args.push_back(Builder.getInt32Ty()); 483 Args.push_back(Builder.getInt32Ty()); 484 Args.push_back(Builder.getInt32Ty()); 485 Args.push_back(Builder.getInt8PtrTy()); 486 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 487 F = Function::Create(Ty, Linkage, Name, M); 488 } 489 490 Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 491 BlockDimZ, Parameters}); 492 } 493 494 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { 495 const char *Name = "polly_freeKernel"; 496 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 497 Function *F = M->getFunction(Name); 498 499 // If F is not available, declare it. 500 if (!F) { 501 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 502 std::vector<Type *> Args; 503 Args.push_back(Builder.getInt8PtrTy()); 504 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 505 F = Function::Create(Ty, Linkage, Name, M); 506 } 507 508 Builder.CreateCall(F, {GPUKernel}); 509 } 510 511 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 512 const char *Name = "polly_freeDeviceMemory"; 513 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 514 Function *F = M->getFunction(Name); 515 516 // If F is not available, declare it. 517 if (!F) { 518 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 519 std::vector<Type *> Args; 520 Args.push_back(Builder.getInt8PtrTy()); 521 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 522 F = Function::Create(Ty, Linkage, Name, M); 523 } 524 525 Builder.CreateCall(F, {Array}); 526 } 527 528 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 529 const char *Name = "polly_allocateMemoryForDevice"; 530 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 531 Function *F = M->getFunction(Name); 532 533 // If F is not available, declare it. 534 if (!F) { 535 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 536 std::vector<Type *> Args; 537 Args.push_back(Builder.getInt64Ty()); 538 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 539 F = Function::Create(Ty, Linkage, Name, M); 540 } 541 542 return Builder.CreateCall(F, {Size}); 543 } 544 545 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, 546 Value *DeviceData, 547 Value *Size) { 548 const char *Name = "polly_copyFromHostToDevice"; 549 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 550 Function *F = M->getFunction(Name); 551 552 // If F is not available, declare it. 553 if (!F) { 554 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 555 std::vector<Type *> Args; 556 Args.push_back(Builder.getInt8PtrTy()); 557 Args.push_back(Builder.getInt8PtrTy()); 558 Args.push_back(Builder.getInt64Ty()); 559 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 560 F = Function::Create(Ty, Linkage, Name, M); 561 } 562 563 Builder.CreateCall(F, {HostData, DeviceData, Size}); 564 } 565 566 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, 567 Value *HostData, 568 Value *Size) { 569 const char *Name = "polly_copyFromDeviceToHost"; 570 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 571 Function *F = M->getFunction(Name); 572 573 // If F is not available, declare it. 574 if (!F) { 575 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 576 std::vector<Type *> Args; 577 Args.push_back(Builder.getInt8PtrTy()); 578 Args.push_back(Builder.getInt8PtrTy()); 579 Args.push_back(Builder.getInt64Ty()); 580 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 581 F = Function::Create(Ty, Linkage, Name, M); 582 } 583 584 Builder.CreateCall(F, {DeviceData, HostData, Size}); 585 } 586 587 Value *GPUNodeBuilder::createCallInitContext() { 588 const char *Name = "polly_initContext"; 589 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 590 Function *F = M->getFunction(Name); 591 592 // If F is not available, declare it. 593 if (!F) { 594 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 595 std::vector<Type *> Args; 596 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 597 F = Function::Create(Ty, Linkage, Name, M); 598 } 599 600 return Builder.CreateCall(F, {}); 601 } 602 603 void GPUNodeBuilder::createCallFreeContext(Value *Context) { 604 const char *Name = "polly_freeContext"; 605 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 606 Function *F = M->getFunction(Name); 607 608 // If F is not available, declare it. 609 if (!F) { 610 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 611 std::vector<Type *> Args; 612 Args.push_back(Builder.getInt8PtrTy()); 613 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 614 F = Function::Create(Ty, Linkage, Name, M); 615 } 616 617 Builder.CreateCall(F, {Context}); 618 } 619 620 /// Check if one string is a prefix of another. 621 /// 622 /// @param String The string in which to look for the prefix. 623 /// @param Prefix The prefix to look for. 624 static bool isPrefix(std::string String, std::string Prefix) { 625 return String.find(Prefix) == 0; 626 } 627 628 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 629 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 630 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 631 632 if (!gpu_array_is_scalar(Array)) { 633 auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 634 isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 635 636 for (unsigned int i = 1; i < Array->n_index; i++) { 637 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 638 isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 639 Res = isl_ast_expr_mul(Res, Expr); 640 } 641 642 Value *NumElements = ExprBuilder.create(Res); 643 ArraySize = Builder.CreateMul(ArraySize, NumElements); 644 } 645 isl_ast_build_free(Build); 646 return ArraySize; 647 } 648 649 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 650 enum DataDirection Direction) { 651 isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); 652 isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 653 isl_id *Id = isl_ast_expr_get_id(Arg); 654 auto Array = (gpu_array_info *)isl_id_get_user(Id); 655 auto ScopArray = (ScopArrayInfo *)(Array->user); 656 657 Value *Size = getArraySize(Array); 658 Value *HostPtr = ScopArray->getBasePtr(); 659 660 Value *DevPtr = DeviceAllocations[ScopArray]; 661 662 if (gpu_array_is_scalar(Array)) { 663 HostPtr = Builder.CreateAlloca(ScopArray->getElementType()); 664 Builder.CreateStore(ScopArray->getBasePtr(), HostPtr); 665 } 666 667 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 668 669 if (Direction == HOST_TO_DEVICE) 670 createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 671 else 672 createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 673 674 isl_id_free(Id); 675 isl_ast_expr_free(Arg); 676 isl_ast_expr_free(Expr); 677 isl_ast_node_free(TransferStmt); 678 } 679 680 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 681 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 682 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 683 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 684 isl_id_free(Id); 685 isl_ast_expr_free(StmtExpr); 686 687 const char *Str = isl_id_get_name(Id); 688 if (!strcmp(Str, "kernel")) { 689 createKernel(UserStmt); 690 isl_ast_expr_free(Expr); 691 return; 692 } 693 694 if (isPrefix(Str, "to_device")) { 695 createDataTransfer(UserStmt, HOST_TO_DEVICE); 696 isl_ast_expr_free(Expr); 697 return; 698 } 699 700 if (isPrefix(Str, "from_device")) { 701 createDataTransfer(UserStmt, DEVICE_TO_HOST); 702 isl_ast_expr_free(Expr); 703 return; 704 } 705 706 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 707 struct ppcg_kernel_stmt *KernelStmt = 708 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 709 isl_id_free(Anno); 710 711 switch (KernelStmt->type) { 712 case ppcg_kernel_domain: 713 createScopStmt(Expr, KernelStmt); 714 isl_ast_node_free(UserStmt); 715 return; 716 case ppcg_kernel_copy: 717 // TODO: Create kernel copy stmt 718 isl_ast_expr_free(Expr); 719 isl_ast_node_free(UserStmt); 720 return; 721 case ppcg_kernel_sync: 722 createKernelSync(); 723 isl_ast_expr_free(Expr); 724 isl_ast_node_free(UserStmt); 725 return; 726 } 727 728 isl_ast_expr_free(Expr); 729 isl_ast_node_free(UserStmt); 730 return; 731 } 732 733 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 734 ppcg_kernel_stmt *KernelStmt) { 735 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 736 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 737 738 LoopToScevMapT LTS; 739 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 740 741 createSubstitutions(Expr, Stmt, LTS); 742 743 if (Stmt->isBlockStmt()) 744 BlockGen.copyStmt(*Stmt, LTS, Indexes); 745 else 746 assert(0 && "Region statement not supported\n"); 747 } 748 749 void GPUNodeBuilder::createKernelSync() { 750 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 751 auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 752 Builder.CreateCall(Sync, {}); 753 } 754 755 /// Collect llvm::Values referenced from @p Node 756 /// 757 /// This function only applies to isl_ast_nodes that are user_nodes referring 758 /// to a ScopStmt. All other node types are ignore. 759 /// 760 /// @param Node The node to collect references for. 761 /// @param User A user pointer used as storage for the data that is collected. 762 /// 763 /// @returns isl_bool_true if data could be collected successfully. 764 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 765 if (isl_ast_node_get_type(Node) != isl_ast_node_user) 766 return isl_bool_true; 767 768 isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 769 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 770 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 771 const char *Str = isl_id_get_name(Id); 772 isl_id_free(Id); 773 isl_ast_expr_free(StmtExpr); 774 isl_ast_expr_free(Expr); 775 776 if (!isPrefix(Str, "Stmt")) 777 return isl_bool_true; 778 779 Id = isl_ast_node_get_annotation(Node); 780 auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 781 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 782 isl_id_free(Id); 783 784 addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); 785 786 return isl_bool_true; 787 } 788 789 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 790 SetVector<Value *> SubtreeValues; 791 SetVector<const SCEV *> SCEVs; 792 SetVector<const Loop *> Loops; 793 SubtreeReferences References = { 794 LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 795 796 for (const auto &I : IDToValue) 797 SubtreeValues.insert(I.second); 798 799 isl_ast_node_foreach_descendant_top_down( 800 Kernel->tree, collectReferencesInGPUStmt, &References); 801 802 for (const SCEV *Expr : SCEVs) 803 findValues(Expr, SE, SubtreeValues); 804 805 for (auto &SAI : S.arrays()) 806 SubtreeValues.remove(SAI->getBasePtr()); 807 808 isl_space *Space = S.getParamSpace(); 809 for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 810 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 811 assert(IDToValue.count(Id)); 812 Value *Val = IDToValue[Id]; 813 SubtreeValues.remove(Val); 814 isl_id_free(Id); 815 } 816 isl_space_free(Space); 817 818 for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 819 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 820 assert(IDToValue.count(Id)); 821 Value *Val = IDToValue[Id]; 822 SubtreeValues.remove(Val); 823 isl_id_free(Id); 824 } 825 826 return SubtreeValues; 827 } 828 829 void GPUNodeBuilder::clearDominators(Function *F) { 830 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 831 std::vector<BasicBlock *> Nodes; 832 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 833 Nodes.push_back(I->getBlock()); 834 835 for (BasicBlock *BB : Nodes) 836 DT.eraseNode(BB); 837 } 838 839 void GPUNodeBuilder::clearScalarEvolution(Function *F) { 840 for (BasicBlock &BB : *F) { 841 Loop *L = LI.getLoopFor(&BB); 842 if (L) 843 SE.forgetLoop(L); 844 } 845 } 846 847 void GPUNodeBuilder::clearLoops(Function *F) { 848 for (BasicBlock &BB : *F) { 849 Loop *L = LI.getLoopFor(&BB); 850 if (L) 851 SE.forgetLoop(L); 852 LI.removeBlock(&BB); 853 } 854 } 855 856 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { 857 std::vector<Value *> Sizes; 858 isl_ast_build *Context = isl_ast_build_from_context(S.getContext()); 859 860 for (long i = 0; i < Kernel->n_grid; i++) { 861 isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i); 862 isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size); 863 Value *Res = ExprBuilder.create(GridSize); 864 Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 865 Sizes.push_back(Res); 866 } 867 isl_ast_build_free(Context); 868 869 for (long i = Kernel->n_grid; i < 3; i++) 870 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 871 872 return std::make_tuple(Sizes[0], Sizes[1]); 873 } 874 875 std::tuple<Value *, Value *, Value *> 876 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { 877 std::vector<Value *> Sizes; 878 879 for (long i = 0; i < Kernel->n_block; i++) { 880 Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); 881 Sizes.push_back(Res); 882 } 883 884 for (long i = Kernel->n_block; i < 3; i++) 885 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 886 887 return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); 888 } 889 890 Value * 891 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, 892 SetVector<Value *> SubtreeValues) { 893 Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 894 std::distance(F->arg_begin(), F->arg_end())); 895 896 BasicBlock *EntryBlock = 897 &Builder.GetInsertBlock()->getParent()->getEntryBlock(); 898 std::string Launch = "polly_launch_" + std::to_string(Kernel->id); 899 Instruction *Parameters = 900 new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator()); 901 902 int Index = 0; 903 for (long i = 0; i < Prog->n_array; i++) { 904 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 905 continue; 906 907 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 908 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 909 910 Value *DevArray = DeviceAllocations[(ScopArrayInfo *)SAI]; 911 DevArray = createCallGetDevicePtr(DevArray); 912 Instruction *Param = new AllocaInst( 913 Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index), 914 EntryBlock->getTerminator()); 915 Builder.CreateStore(DevArray, Param); 916 Value *Slot = Builder.CreateGEP( 917 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 918 Value *ParamTyped = 919 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 920 Builder.CreateStore(ParamTyped, Slot); 921 Index++; 922 } 923 924 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 925 926 for (long i = 0; i < NumHostIters; i++) { 927 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 928 Value *Val = IDToValue[Id]; 929 isl_id_free(Id); 930 Instruction *Param = new AllocaInst( 931 Val->getType(), Launch + "_param_" + std::to_string(Index), 932 EntryBlock->getTerminator()); 933 Builder.CreateStore(Val, Param); 934 Value *Slot = Builder.CreateGEP( 935 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 936 Value *ParamTyped = 937 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 938 Builder.CreateStore(ParamTyped, Slot); 939 Index++; 940 } 941 942 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 943 944 for (long i = 0; i < NumVars; i++) { 945 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 946 Value *Val = IDToValue[Id]; 947 isl_id_free(Id); 948 Instruction *Param = new AllocaInst( 949 Val->getType(), Launch + "_param_" + std::to_string(Index), 950 EntryBlock->getTerminator()); 951 Builder.CreateStore(Val, Param); 952 Value *Slot = Builder.CreateGEP( 953 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 954 Value *ParamTyped = 955 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 956 Builder.CreateStore(ParamTyped, Slot); 957 Index++; 958 } 959 960 for (auto Val : SubtreeValues) { 961 Instruction *Param = new AllocaInst( 962 Val->getType(), Launch + "_param_" + std::to_string(Index), 963 EntryBlock->getTerminator()); 964 Builder.CreateStore(Val, Param); 965 Value *Slot = Builder.CreateGEP( 966 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 967 Value *ParamTyped = 968 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 969 Builder.CreateStore(ParamTyped, Slot); 970 Index++; 971 } 972 973 auto Location = EntryBlock->getTerminator(); 974 return new BitCastInst(Parameters, Builder.getInt8PtrTy(), 975 Launch + "_params_i8ptr", Location); 976 } 977 978 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 979 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 980 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 981 isl_id_free(Id); 982 isl_ast_node_free(KernelStmt); 983 984 SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); 985 986 assert(Kernel->tree && "Device AST of kernel node is empty"); 987 988 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 989 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 990 ValueMapT HostValueMap = ValueMap; 991 992 SetVector<const Loop *> Loops; 993 994 // Create for all loops we depend on values that contain the current loop 995 // iteration. These values are necessary to generate code for SCEVs that 996 // depend on such loops. As a result we need to pass them to the subfunction. 997 for (const Loop *L : Loops) { 998 const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 999 SE.getUnknown(Builder.getInt64(1)), 1000 L, SCEV::FlagAnyWrap); 1001 Value *V = generateSCEV(OuterLIV); 1002 OutsideLoopIterations[L] = SE.getUnknown(V); 1003 SubtreeValues.insert(V); 1004 } 1005 1006 createKernelFunction(Kernel, SubtreeValues); 1007 1008 create(isl_ast_node_copy(Kernel->tree)); 1009 1010 Function *F = Builder.GetInsertBlock()->getParent(); 1011 clearDominators(F); 1012 clearScalarEvolution(F); 1013 clearLoops(F); 1014 1015 Builder.SetInsertPoint(&HostInsertPoint); 1016 IDToValue = HostIDs; 1017 1018 ValueMap = HostValueMap; 1019 ScalarMap.clear(); 1020 PHIOpMap.clear(); 1021 EscapeMap.clear(); 1022 IDToSAI.clear(); 1023 Annotator.resetAlternativeAliasBases(); 1024 for (auto &BasePtr : LocalArrays) 1025 S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); 1026 LocalArrays.clear(); 1027 1028 Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); 1029 1030 std::string ASMString = finalizeKernelFunction(); 1031 std::string Name = "kernel_" + std::to_string(Kernel->id); 1032 Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); 1033 Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); 1034 Value *GPUKernel = createCallGetKernel(KernelString, NameString); 1035 1036 Value *GridDimX, *GridDimY; 1037 std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); 1038 1039 Value *BlockDimX, *BlockDimY, *BlockDimZ; 1040 std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); 1041 1042 createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 1043 BlockDimZ, Parameters); 1044 createCallFreeKernel(GPUKernel); 1045 } 1046 1047 /// Compute the DataLayout string for the NVPTX backend. 1048 /// 1049 /// @param is64Bit Are we looking for a 64 bit architecture? 1050 static std::string computeNVPTXDataLayout(bool is64Bit) { 1051 std::string Ret = "e"; 1052 1053 if (!is64Bit) 1054 Ret += "-p:32:32"; 1055 1056 Ret += "-i64:64-v16:16-v32:32-n16:32:64"; 1057 1058 return Ret; 1059 } 1060 1061 Function * 1062 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 1063 SetVector<Value *> &SubtreeValues) { 1064 std::vector<Type *> Args; 1065 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 1066 1067 for (long i = 0; i < Prog->n_array; i++) { 1068 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1069 continue; 1070 1071 Args.push_back(Builder.getInt8PtrTy()); 1072 } 1073 1074 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1075 1076 for (long i = 0; i < NumHostIters; i++) 1077 Args.push_back(Builder.getInt64Ty()); 1078 1079 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1080 1081 for (long i = 0; i < NumVars; i++) 1082 Args.push_back(Builder.getInt64Ty()); 1083 1084 for (auto *V : SubtreeValues) 1085 Args.push_back(V->getType()); 1086 1087 auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 1088 auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 1089 GPUModule.get()); 1090 FN->setCallingConv(CallingConv::PTX_Kernel); 1091 1092 auto Arg = FN->arg_begin(); 1093 for (long i = 0; i < Kernel->n_array; i++) { 1094 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1095 continue; 1096 1097 Arg->setName(Kernel->array[i].array->name); 1098 1099 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1100 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1101 Type *EleTy = SAI->getElementType(); 1102 Value *Val = &*Arg; 1103 SmallVector<const SCEV *, 4> Sizes; 1104 isl_ast_build *Build = 1105 isl_ast_build_from_context(isl_set_copy(Prog->context)); 1106 for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 1107 isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 1108 Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); 1109 auto V = ExprBuilder.create(DimSize); 1110 Sizes.push_back(SE.getSCEV(V)); 1111 } 1112 const ScopArrayInfo *SAIRep = 1113 S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array); 1114 LocalArrays.push_back(Val); 1115 1116 isl_ast_build_free(Build); 1117 isl_id_free(Id); 1118 IDToSAI[Id] = SAIRep; 1119 Arg++; 1120 } 1121 1122 for (long i = 0; i < NumHostIters; i++) { 1123 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1124 Arg->setName(isl_id_get_name(Id)); 1125 IDToValue[Id] = &*Arg; 1126 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1127 Arg++; 1128 } 1129 1130 for (long i = 0; i < NumVars; i++) { 1131 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1132 Arg->setName(isl_id_get_name(Id)); 1133 IDToValue[Id] = &*Arg; 1134 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1135 Arg++; 1136 } 1137 1138 for (auto *V : SubtreeValues) { 1139 Arg->setName(V->getName()); 1140 ValueMap[V] = &*Arg; 1141 Arg++; 1142 } 1143 1144 return FN; 1145 } 1146 1147 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 1148 Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, 1149 Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; 1150 1151 Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, 1152 Intrinsic::nvvm_read_ptx_sreg_tid_y, 1153 Intrinsic::nvvm_read_ptx_sreg_tid_z}; 1154 1155 auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 1156 std::string Name = isl_id_get_name(Id); 1157 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1158 Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 1159 Value *Val = Builder.CreateCall(IntrinsicFn, {}); 1160 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 1161 IDToValue[Id] = Val; 1162 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1163 }; 1164 1165 for (int i = 0; i < Kernel->n_grid; ++i) { 1166 isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 1167 addId(Id, IntrinsicsBID[i]); 1168 } 1169 1170 for (int i = 0; i < Kernel->n_block; ++i) { 1171 isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 1172 addId(Id, IntrinsicsTID[i]); 1173 } 1174 } 1175 1176 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { 1177 auto Arg = FN->arg_begin(); 1178 for (long i = 0; i < Kernel->n_array; i++) { 1179 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1180 continue; 1181 1182 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1183 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1184 isl_id_free(Id); 1185 1186 if (SAI->getNumberOfDimensions() > 0) { 1187 Arg++; 1188 continue; 1189 } 1190 1191 Value *Alloca = BlockGen.getOrCreateScalarAlloca(SAI->getBasePtr()); 1192 Value *ArgPtr = &*Arg; 1193 Type *TypePtr = SAI->getElementType()->getPointerTo(); 1194 Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); 1195 Value *Val = Builder.CreateLoad(TypedArgPtr); 1196 Builder.CreateStore(Val, Alloca); 1197 1198 Arg++; 1199 } 1200 } 1201 1202 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, 1203 SetVector<Value *> &SubtreeValues) { 1204 1205 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 1206 GPUModule.reset(new Module(Identifier, Builder.getContext())); 1207 GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 1208 GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); 1209 1210 Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); 1211 1212 BasicBlock *PrevBlock = Builder.GetInsertBlock(); 1213 auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); 1214 1215 DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1216 DT.addNewBlock(EntryBlock, PrevBlock); 1217 1218 Builder.SetInsertPoint(EntryBlock); 1219 Builder.CreateRetVoid(); 1220 Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); 1221 1222 ScopDetection::markFunctionAsInvalid(FN); 1223 1224 prepareKernelArguments(Kernel, FN); 1225 insertKernelIntrinsics(Kernel); 1226 } 1227 1228 std::string GPUNodeBuilder::createKernelASM() { 1229 llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); 1230 std::string ErrMsg; 1231 auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); 1232 1233 if (!GPUTarget) { 1234 errs() << ErrMsg << "\n"; 1235 return ""; 1236 } 1237 1238 TargetOptions Options; 1239 Options.UnsafeFPMath = FastMath; 1240 std::unique_ptr<TargetMachine> TargetM( 1241 GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", 1242 Options, Optional<Reloc::Model>())); 1243 1244 SmallString<0> ASMString; 1245 raw_svector_ostream ASMStream(ASMString); 1246 llvm::legacy::PassManager PM; 1247 1248 PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); 1249 1250 if (TargetM->addPassesToEmitFile( 1251 PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { 1252 errs() << "The target does not support generation of this file type!\n"; 1253 return ""; 1254 } 1255 1256 PM.run(*GPUModule); 1257 1258 return ASMStream.str(); 1259 } 1260 1261 std::string GPUNodeBuilder::finalizeKernelFunction() { 1262 // Verify module. 1263 llvm::legacy::PassManager Passes; 1264 Passes.add(createVerifierPass()); 1265 Passes.run(*GPUModule); 1266 1267 if (DumpKernelIR) 1268 outs() << *GPUModule << "\n"; 1269 1270 // Optimize module. 1271 llvm::legacy::PassManager OptPasses; 1272 PassManagerBuilder PassBuilder; 1273 PassBuilder.OptLevel = 3; 1274 PassBuilder.SizeLevel = 0; 1275 PassBuilder.populateModulePassManager(OptPasses); 1276 OptPasses.run(*GPUModule); 1277 1278 std::string Assembly = createKernelASM(); 1279 1280 if (DumpKernelASM) 1281 outs() << Assembly << "\n"; 1282 1283 GPUModule.release(); 1284 KernelIDs.clear(); 1285 1286 return Assembly; 1287 } 1288 1289 namespace { 1290 class PPCGCodeGeneration : public ScopPass { 1291 public: 1292 static char ID; 1293 1294 /// The scop that is currently processed. 1295 Scop *S; 1296 1297 LoopInfo *LI; 1298 DominatorTree *DT; 1299 ScalarEvolution *SE; 1300 const DataLayout *DL; 1301 RegionInfo *RI; 1302 1303 PPCGCodeGeneration() : ScopPass(ID) {} 1304 1305 /// Construct compilation options for PPCG. 1306 /// 1307 /// @returns The compilation options. 1308 ppcg_options *createPPCGOptions() { 1309 auto DebugOptions = 1310 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 1311 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 1312 1313 DebugOptions->dump_schedule_constraints = false; 1314 DebugOptions->dump_schedule = false; 1315 DebugOptions->dump_final_schedule = false; 1316 DebugOptions->dump_sizes = false; 1317 1318 Options->debug = DebugOptions; 1319 1320 Options->reschedule = true; 1321 Options->scale_tile_loops = false; 1322 Options->wrap = false; 1323 1324 Options->non_negative_parameters = false; 1325 Options->ctx = nullptr; 1326 Options->sizes = nullptr; 1327 1328 Options->tile_size = 32; 1329 1330 Options->use_private_memory = false; 1331 Options->use_shared_memory = false; 1332 Options->max_shared_memory = 0; 1333 1334 Options->target = PPCG_TARGET_CUDA; 1335 Options->openmp = false; 1336 Options->linearize_device_arrays = true; 1337 Options->live_range_reordering = false; 1338 1339 Options->opencl_compiler_options = nullptr; 1340 Options->opencl_use_gpu = false; 1341 Options->opencl_n_include_file = 0; 1342 Options->opencl_include_files = nullptr; 1343 Options->opencl_print_kernel_types = false; 1344 Options->opencl_embed_kernel_code = false; 1345 1346 Options->save_schedule_file = nullptr; 1347 Options->load_schedule_file = nullptr; 1348 1349 return Options; 1350 } 1351 1352 /// Get a tagged access relation containing all accesses of type @p AccessTy. 1353 /// 1354 /// Instead of a normal access of the form: 1355 /// 1356 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 1357 /// 1358 /// a tagged access has the form 1359 /// 1360 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 1361 /// 1362 /// where 'id' is an additional space that references the memory access that 1363 /// triggered the access. 1364 /// 1365 /// @param AccessTy The type of the memory accesses to collect. 1366 /// 1367 /// @return The relation describing all tagged memory accesses. 1368 isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 1369 isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 1370 1371 for (auto &Stmt : *S) 1372 for (auto &Acc : Stmt) 1373 if (Acc->getType() == AccessTy) { 1374 isl_map *Relation = Acc->getAccessRelation(); 1375 Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 1376 1377 isl_space *Space = isl_map_get_space(Relation); 1378 Space = isl_space_range(Space); 1379 Space = isl_space_from_range(Space); 1380 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1381 isl_map *Universe = isl_map_universe(Space); 1382 Relation = isl_map_domain_product(Relation, Universe); 1383 Accesses = isl_union_map_add_map(Accesses, Relation); 1384 } 1385 1386 return Accesses; 1387 } 1388 1389 /// Get the set of all read accesses, tagged with the access id. 1390 /// 1391 /// @see getTaggedAccesses 1392 isl_union_map *getTaggedReads() { 1393 return getTaggedAccesses(MemoryAccess::READ); 1394 } 1395 1396 /// Get the set of all may (and must) accesses, tagged with the access id. 1397 /// 1398 /// @see getTaggedAccesses 1399 isl_union_map *getTaggedMayWrites() { 1400 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 1401 getTaggedAccesses(MemoryAccess::MUST_WRITE)); 1402 } 1403 1404 /// Get the set of all must accesses, tagged with the access id. 1405 /// 1406 /// @see getTaggedAccesses 1407 isl_union_map *getTaggedMustWrites() { 1408 return getTaggedAccesses(MemoryAccess::MUST_WRITE); 1409 } 1410 1411 /// Collect parameter and array names as isl_ids. 1412 /// 1413 /// To reason about the different parameters and arrays used, ppcg requires 1414 /// a list of all isl_ids in use. As PPCG traditionally performs 1415 /// source-to-source compilation each of these isl_ids is mapped to the 1416 /// expression that represents it. As we do not have a corresponding 1417 /// expression in Polly, we just map each id to a 'zero' expression to match 1418 /// the data format that ppcg expects. 1419 /// 1420 /// @returns Retun a map from collected ids to 'zero' ast expressions. 1421 __isl_give isl_id_to_ast_expr *getNames() { 1422 auto *Names = isl_id_to_ast_expr_alloc( 1423 S->getIslCtx(), 1424 S->getNumParams() + std::distance(S->array_begin(), S->array_end())); 1425 auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx())); 1426 auto *Space = S->getParamSpace(); 1427 1428 for (int I = 0, E = S->getNumParams(); I < E; ++I) { 1429 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I); 1430 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1431 } 1432 1433 for (auto &Array : S->arrays()) { 1434 auto Id = Array->getBasePtrId(); 1435 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1436 } 1437 1438 isl_space_free(Space); 1439 isl_ast_expr_free(Zero); 1440 1441 return Names; 1442 } 1443 1444 /// Create a new PPCG scop from the current scop. 1445 /// 1446 /// The PPCG scop is initialized with data from the current polly::Scop. From 1447 /// this initial data, the data-dependences in the PPCG scop are initialized. 1448 /// We do not use Polly's dependence analysis for now, to ensure we match 1449 /// the PPCG default behaviour more closely. 1450 /// 1451 /// @returns A new ppcg scop. 1452 ppcg_scop *createPPCGScop() { 1453 auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); 1454 1455 PPCGScop->options = createPPCGOptions(); 1456 1457 PPCGScop->start = 0; 1458 PPCGScop->end = 0; 1459 1460 PPCGScop->context = S->getContext(); 1461 PPCGScop->domain = S->getDomains(); 1462 PPCGScop->call = nullptr; 1463 PPCGScop->tagged_reads = getTaggedReads(); 1464 PPCGScop->reads = S->getReads(); 1465 PPCGScop->live_in = nullptr; 1466 PPCGScop->tagged_may_writes = getTaggedMayWrites(); 1467 PPCGScop->may_writes = S->getWrites(); 1468 PPCGScop->tagged_must_writes = getTaggedMustWrites(); 1469 PPCGScop->must_writes = S->getMustWrites(); 1470 PPCGScop->live_out = nullptr; 1471 PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace()); 1472 PPCGScop->tagger = nullptr; 1473 1474 PPCGScop->independence = nullptr; 1475 PPCGScop->dep_flow = nullptr; 1476 PPCGScop->tagged_dep_flow = nullptr; 1477 PPCGScop->dep_false = nullptr; 1478 PPCGScop->dep_forced = nullptr; 1479 PPCGScop->dep_order = nullptr; 1480 PPCGScop->tagged_dep_order = nullptr; 1481 1482 PPCGScop->schedule = S->getScheduleTree(); 1483 PPCGScop->names = getNames(); 1484 1485 PPCGScop->pet = nullptr; 1486 1487 compute_tagger(PPCGScop); 1488 compute_dependences(PPCGScop); 1489 1490 return PPCGScop; 1491 } 1492 1493 /// Collect the array acesses in a statement. 1494 /// 1495 /// @param Stmt The statement for which to collect the accesses. 1496 /// 1497 /// @returns A list of array accesses. 1498 gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { 1499 gpu_stmt_access *Accesses = nullptr; 1500 1501 for (MemoryAccess *Acc : Stmt) { 1502 auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access); 1503 Access->read = Acc->isRead(); 1504 Access->write = Acc->isWrite(); 1505 Access->access = Acc->getAccessRelation(); 1506 isl_space *Space = isl_map_get_space(Access->access); 1507 Space = isl_space_range(Space); 1508 Space = isl_space_from_range(Space); 1509 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1510 isl_map *Universe = isl_map_universe(Space); 1511 Access->tagged_access = 1512 isl_map_domain_product(Acc->getAccessRelation(), Universe); 1513 Access->exact_write = Acc->isWrite(); 1514 Access->ref_id = Acc->getId(); 1515 Access->next = Accesses; 1516 Accesses = Access; 1517 } 1518 1519 return Accesses; 1520 } 1521 1522 /// Collect the list of GPU statements. 1523 /// 1524 /// Each statement has an id, a pointer to the underlying data structure, 1525 /// as well as a list with all memory accesses. 1526 /// 1527 /// TODO: Initialize the list of memory accesses. 1528 /// 1529 /// @returns A linked-list of statements. 1530 gpu_stmt *getStatements() { 1531 gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, 1532 std::distance(S->begin(), S->end())); 1533 1534 int i = 0; 1535 for (auto &Stmt : *S) { 1536 gpu_stmt *GPUStmt = &Stmts[i]; 1537 1538 GPUStmt->id = Stmt.getDomainId(); 1539 1540 // We use the pet stmt pointer to keep track of the Polly statements. 1541 GPUStmt->stmt = (pet_stmt *)&Stmt; 1542 GPUStmt->accesses = getStmtAccesses(Stmt); 1543 i++; 1544 } 1545 1546 return Stmts; 1547 } 1548 1549 /// Derive the extent of an array. 1550 /// 1551 /// The extent of an array is defined by the set of memory locations for 1552 /// which a memory access in the iteration domain exists. 1553 /// 1554 /// @param Array The array to derive the extent for. 1555 /// 1556 /// @returns An isl_set describing the extent of the array. 1557 __isl_give isl_set *getExtent(ScopArrayInfo *Array) { 1558 isl_union_map *Accesses = S->getAccesses(); 1559 Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains()); 1560 isl_union_set *AccessUSet = isl_union_map_range(Accesses); 1561 isl_set *AccessSet = 1562 isl_union_set_extract_set(AccessUSet, Array->getSpace()); 1563 isl_union_set_free(AccessUSet); 1564 1565 return AccessSet; 1566 } 1567 1568 /// Derive the bounds of an array. 1569 /// 1570 /// For the first dimension we derive the bound of the array from the extent 1571 /// of this dimension. For inner dimensions we obtain their size directly from 1572 /// ScopArrayInfo. 1573 /// 1574 /// @param PPCGArray The array to compute bounds for. 1575 /// @param Array The polly array from which to take the information. 1576 void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) { 1577 if (PPCGArray.n_index > 0) { 1578 isl_set *Dom = isl_set_copy(PPCGArray.extent); 1579 Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1); 1580 isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0); 1581 isl_set_free(Dom); 1582 Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound)); 1583 isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom)); 1584 isl_aff *One = isl_aff_zero_on_domain(LS); 1585 One = isl_aff_add_constant_si(One, 1); 1586 Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One)); 1587 Bound = isl_pw_aff_gist(Bound, S->getContext()); 1588 PPCGArray.bound[0] = Bound; 1589 } 1590 1591 for (unsigned i = 1; i < PPCGArray.n_index; ++i) { 1592 isl_pw_aff *Bound = Array->getDimensionSizePw(i); 1593 auto LS = isl_pw_aff_get_domain_space(Bound); 1594 auto Aff = isl_multi_aff_zero(LS); 1595 Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff); 1596 PPCGArray.bound[i] = Bound; 1597 } 1598 } 1599 1600 /// Create the arrays for @p PPCGProg. 1601 /// 1602 /// @param PPCGProg The program to compute the arrays for. 1603 void createArrays(gpu_prog *PPCGProg) { 1604 int i = 0; 1605 for (auto &Array : S->arrays()) { 1606 std::string TypeName; 1607 raw_string_ostream OS(TypeName); 1608 1609 OS << *Array->getElementType(); 1610 TypeName = OS.str(); 1611 1612 gpu_array_info &PPCGArray = PPCGProg->array[i]; 1613 1614 PPCGArray.space = Array->getSpace(); 1615 PPCGArray.type = strdup(TypeName.c_str()); 1616 PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8; 1617 PPCGArray.name = strdup(Array->getName().c_str()); 1618 PPCGArray.extent = nullptr; 1619 PPCGArray.n_index = Array->getNumberOfDimensions(); 1620 PPCGArray.bound = 1621 isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index); 1622 PPCGArray.extent = getExtent(Array); 1623 PPCGArray.n_ref = 0; 1624 PPCGArray.refs = nullptr; 1625 PPCGArray.accessed = true; 1626 PPCGArray.read_only_scalar = false; 1627 PPCGArray.has_compound_element = false; 1628 PPCGArray.local = false; 1629 PPCGArray.declare_local = false; 1630 PPCGArray.global = false; 1631 PPCGArray.linearize = false; 1632 PPCGArray.dep_order = nullptr; 1633 PPCGArray.user = Array; 1634 1635 setArrayBounds(PPCGArray, Array); 1636 i++; 1637 1638 collect_references(PPCGProg, &PPCGArray); 1639 } 1640 } 1641 1642 /// Create an identity map between the arrays in the scop. 1643 /// 1644 /// @returns An identity map between the arrays in the scop. 1645 isl_union_map *getArrayIdentity() { 1646 isl_union_map *Maps = isl_union_map_empty(S->getParamSpace()); 1647 1648 for (auto &Array : S->arrays()) { 1649 isl_space *Space = Array->getSpace(); 1650 Space = isl_space_map_from_set(Space); 1651 isl_map *Identity = isl_map_identity(Space); 1652 Maps = isl_union_map_add_map(Maps, Identity); 1653 } 1654 1655 return Maps; 1656 } 1657 1658 /// Create a default-initialized PPCG GPU program. 1659 /// 1660 /// @returns A new gpu grogram description. 1661 gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { 1662 1663 if (!PPCGScop) 1664 return nullptr; 1665 1666 auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog); 1667 1668 PPCGProg->ctx = S->getIslCtx(); 1669 PPCGProg->scop = PPCGScop; 1670 PPCGProg->context = isl_set_copy(PPCGScop->context); 1671 PPCGProg->read = isl_union_map_copy(PPCGScop->reads); 1672 PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); 1673 PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); 1674 PPCGProg->tagged_must_kill = 1675 isl_union_map_copy(PPCGScop->tagged_must_kills); 1676 PPCGProg->to_inner = getArrayIdentity(); 1677 PPCGProg->to_outer = getArrayIdentity(); 1678 PPCGProg->may_persist = compute_may_persist(PPCGProg); 1679 PPCGProg->any_to_outer = nullptr; 1680 PPCGProg->array_order = nullptr; 1681 PPCGProg->n_stmts = std::distance(S->begin(), S->end()); 1682 PPCGProg->stmts = getStatements(); 1683 PPCGProg->n_array = std::distance(S->array_begin(), S->array_end()); 1684 PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info, 1685 PPCGProg->n_array); 1686 1687 createArrays(PPCGProg); 1688 1689 return PPCGProg; 1690 } 1691 1692 struct PrintGPUUserData { 1693 struct cuda_info *CudaInfo; 1694 struct gpu_prog *PPCGProg; 1695 std::vector<ppcg_kernel *> Kernels; 1696 }; 1697 1698 /// Print a user statement node in the host code. 1699 /// 1700 /// We use ppcg's printing facilities to print the actual statement and 1701 /// additionally build up a list of all kernels that are encountered in the 1702 /// host ast. 1703 /// 1704 /// @param P The printer to print to 1705 /// @param Options The printing options to use 1706 /// @param Node The node to print 1707 /// @param User A user pointer to carry additional data. This pointer is 1708 /// expected to be of type PrintGPUUserData. 1709 /// 1710 /// @returns A printer to which the output has been printed. 1711 static __isl_give isl_printer * 1712 printHostUser(__isl_take isl_printer *P, 1713 __isl_take isl_ast_print_options *Options, 1714 __isl_take isl_ast_node *Node, void *User) { 1715 auto Data = (struct PrintGPUUserData *)User; 1716 auto Id = isl_ast_node_get_annotation(Node); 1717 1718 if (Id) { 1719 bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 1720 1721 // If this is a user statement, format it ourselves as ppcg would 1722 // otherwise try to call pet functionality that is not available in 1723 // Polly. 1724 if (IsUser) { 1725 P = isl_printer_start_line(P); 1726 P = isl_printer_print_ast_node(P, Node); 1727 P = isl_printer_end_line(P); 1728 isl_id_free(Id); 1729 isl_ast_print_options_free(Options); 1730 return P; 1731 } 1732 1733 auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 1734 isl_id_free(Id); 1735 Data->Kernels.push_back(Kernel); 1736 } 1737 1738 return print_host_user(P, Options, Node, User); 1739 } 1740 1741 /// Print C code corresponding to the control flow in @p Kernel. 1742 /// 1743 /// @param Kernel The kernel to print 1744 void printKernel(ppcg_kernel *Kernel) { 1745 auto *P = isl_printer_to_str(S->getIslCtx()); 1746 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1747 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1748 P = isl_ast_node_print(Kernel->tree, P, Options); 1749 char *String = isl_printer_get_str(P); 1750 printf("%s\n", String); 1751 free(String); 1752 isl_printer_free(P); 1753 } 1754 1755 /// Print C code corresponding to the GPU code described by @p Tree. 1756 /// 1757 /// @param Tree An AST describing GPU code 1758 /// @param PPCGProg The PPCG program from which @Tree has been constructed. 1759 void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 1760 auto *P = isl_printer_to_str(S->getIslCtx()); 1761 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1762 1763 PrintGPUUserData Data; 1764 Data.PPCGProg = PPCGProg; 1765 1766 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1767 Options = 1768 isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 1769 P = isl_ast_node_print(Tree, P, Options); 1770 char *String = isl_printer_get_str(P); 1771 printf("# host\n"); 1772 printf("%s\n", String); 1773 free(String); 1774 isl_printer_free(P); 1775 1776 for (auto Kernel : Data.Kernels) { 1777 printf("# kernel%d\n", Kernel->id); 1778 printKernel(Kernel); 1779 } 1780 } 1781 1782 // Generate a GPU program using PPCG. 1783 // 1784 // GPU mapping consists of multiple steps: 1785 // 1786 // 1) Compute new schedule for the program. 1787 // 2) Map schedule to GPU (TODO) 1788 // 3) Generate code for new schedule (TODO) 1789 // 1790 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 1791 // is mostly CPU specific. Instead, we use PPCG's GPU code generation 1792 // strategy directly from this pass. 1793 gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 1794 1795 auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 1796 1797 PPCGGen->ctx = S->getIslCtx(); 1798 PPCGGen->options = PPCGScop->options; 1799 PPCGGen->print = nullptr; 1800 PPCGGen->print_user = nullptr; 1801 PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 1802 PPCGGen->prog = PPCGProg; 1803 PPCGGen->tree = nullptr; 1804 PPCGGen->types.n = 0; 1805 PPCGGen->types.name = nullptr; 1806 PPCGGen->sizes = nullptr; 1807 PPCGGen->used_sizes = nullptr; 1808 PPCGGen->kernel_id = 0; 1809 1810 // Set scheduling strategy to same strategy PPCG is using. 1811 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 1812 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 1813 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 1814 1815 isl_schedule *Schedule = get_schedule(PPCGGen); 1816 1817 int has_permutable = has_any_permutable_node(Schedule); 1818 1819 if (!has_permutable || has_permutable < 0) { 1820 Schedule = isl_schedule_free(Schedule); 1821 } else { 1822 Schedule = map_to_device(PPCGGen, Schedule); 1823 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 1824 } 1825 1826 if (DumpSchedule) { 1827 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 1828 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 1829 P = isl_printer_print_str(P, "Schedule\n"); 1830 P = isl_printer_print_str(P, "========\n"); 1831 if (Schedule) 1832 P = isl_printer_print_schedule(P, Schedule); 1833 else 1834 P = isl_printer_print_str(P, "No schedule found\n"); 1835 1836 printf("%s\n", isl_printer_get_str(P)); 1837 isl_printer_free(P); 1838 } 1839 1840 if (DumpCode) { 1841 printf("Code\n"); 1842 printf("====\n"); 1843 if (PPCGGen->tree) 1844 printGPUTree(PPCGGen->tree, PPCGProg); 1845 else 1846 printf("No code generated\n"); 1847 } 1848 1849 isl_schedule_free(Schedule); 1850 1851 return PPCGGen; 1852 } 1853 1854 /// Free gpu_gen structure. 1855 /// 1856 /// @param PPCGGen The ppcg_gen object to free. 1857 void freePPCGGen(gpu_gen *PPCGGen) { 1858 isl_ast_node_free(PPCGGen->tree); 1859 isl_union_map_free(PPCGGen->sizes); 1860 isl_union_map_free(PPCGGen->used_sizes); 1861 free(PPCGGen); 1862 } 1863 1864 /// Free the options in the ppcg scop structure. 1865 /// 1866 /// ppcg is not freeing these options for us. To avoid leaks we do this 1867 /// ourselves. 1868 /// 1869 /// @param PPCGScop The scop referencing the options to free. 1870 void freeOptions(ppcg_scop *PPCGScop) { 1871 free(PPCGScop->options->debug); 1872 PPCGScop->options->debug = nullptr; 1873 free(PPCGScop->options); 1874 PPCGScop->options = nullptr; 1875 } 1876 1877 /// Generate code for a given GPU AST described by @p Root. 1878 /// 1879 /// @param Root An isl_ast_node pointing to the root of the GPU AST. 1880 /// @param Prog The GPU Program to generate code for. 1881 void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { 1882 ScopAnnotator Annotator; 1883 Annotator.buildAliasScopes(*S); 1884 1885 Region *R = &S->getRegion(); 1886 1887 simplifyRegion(R, DT, LI, RI); 1888 1889 BasicBlock *EnteringBB = R->getEnteringBlock(); 1890 1891 PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator); 1892 1893 GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S, 1894 Prog); 1895 1896 // Only build the run-time condition and parameters _after_ having 1897 // introduced the conditional branch. This is important as the conditional 1898 // branch will guard the original scop from new induction variables that 1899 // the SCEVExpander may introduce while code generating the parameters and 1900 // which may introduce scalar dependences that prevent us from correctly 1901 // code generating this scop. 1902 BasicBlock *StartBlock = 1903 executeScopConditionally(*S, this, Builder.getTrue()); 1904 1905 // TODO: Handle LICM 1906 // TODO: Verify run-time checks 1907 auto SplitBlock = StartBlock->getSinglePredecessor(); 1908 Builder.SetInsertPoint(SplitBlock->getTerminator()); 1909 NodeBuilder.addParameters(S->getContext()); 1910 Builder.SetInsertPoint(&*StartBlock->begin()); 1911 1912 NodeBuilder.initializeAfterRTH(); 1913 NodeBuilder.create(Root); 1914 NodeBuilder.finalize(); 1915 } 1916 1917 bool runOnScop(Scop &CurrentScop) override { 1918 S = &CurrentScop; 1919 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1920 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1921 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1922 DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout(); 1923 RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); 1924 1925 // We currently do not support scops with invariant loads. 1926 if (S->hasInvariantAccesses()) 1927 return false; 1928 1929 auto PPCGScop = createPPCGScop(); 1930 auto PPCGProg = createPPCGProg(PPCGScop); 1931 auto PPCGGen = generateGPU(PPCGScop, PPCGProg); 1932 1933 if (PPCGGen->tree) 1934 generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); 1935 1936 freeOptions(PPCGScop); 1937 freePPCGGen(PPCGGen); 1938 gpu_prog_free(PPCGProg); 1939 ppcg_scop_free(PPCGScop); 1940 1941 return true; 1942 } 1943 1944 void printScop(raw_ostream &, Scop &) const override {} 1945 1946 void getAnalysisUsage(AnalysisUsage &AU) const override { 1947 AU.addRequired<DominatorTreeWrapperPass>(); 1948 AU.addRequired<RegionInfoPass>(); 1949 AU.addRequired<ScalarEvolutionWrapperPass>(); 1950 AU.addRequired<ScopDetection>(); 1951 AU.addRequired<ScopInfoRegionPass>(); 1952 AU.addRequired<LoopInfoWrapperPass>(); 1953 1954 AU.addPreserved<AAResultsWrapperPass>(); 1955 AU.addPreserved<BasicAAWrapperPass>(); 1956 AU.addPreserved<LoopInfoWrapperPass>(); 1957 AU.addPreserved<DominatorTreeWrapperPass>(); 1958 AU.addPreserved<GlobalsAAWrapperPass>(); 1959 AU.addPreserved<PostDominatorTreeWrapperPass>(); 1960 AU.addPreserved<ScopDetection>(); 1961 AU.addPreserved<ScalarEvolutionWrapperPass>(); 1962 AU.addPreserved<SCEVAAWrapperPass>(); 1963 1964 // FIXME: We do not yet add regions for the newly generated code to the 1965 // region tree. 1966 AU.addPreserved<RegionInfoPass>(); 1967 AU.addPreserved<ScopInfoRegionPass>(); 1968 } 1969 }; 1970 } 1971 1972 char PPCGCodeGeneration::ID = 1; 1973 1974 Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } 1975 1976 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", 1977 "Polly - Apply PPCG translation to SCOP", false, false) 1978 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 1979 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 1980 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 1981 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); 1982 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); 1983 INITIALIZE_PASS_DEPENDENCY(ScopDetection); 1984 INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", 1985 "Polly - Apply PPCG translation to SCOP", false, false) 1986