//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

#include "isl/union_map.h"

// The ppcg headers are plain C and therefore need C linkage.
extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

// Hidden command line flag (off by default): dump the computed GPU schedule.
static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

// Hidden command line flag (off by default): dump C code describing the GPU
// mapping.
static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden command line flag (off by default): dump the LLVM-IR of the kernel.
static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

// Hidden command line flag (off by default): dump the kernel assembly code.
static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

// Hidden command line flag (off by default): allow unsafe math optimizations.
static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

// Hidden command line option selecting the CUDA version to compile for;
// defaults to "sm_30".
static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback used to generate the ast expressions for each
/// of the scheduled ScopStmts.
89 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 90 void *StmtT, isl_ast_build *Build, 91 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 92 isl_id *Id, void *User), 93 void *UserIndex, 94 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 95 void *UserExpr) { 96 97 ScopStmt *Stmt = (ScopStmt *)StmtT; 98 99 isl_ctx *Ctx; 100 101 if (!Stmt || !Build) 102 return NULL; 103 104 Ctx = isl_ast_build_get_ctx(Build); 105 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 106 107 for (MemoryAccess *Acc : *Stmt) { 108 isl_map *AddrFunc = Acc->getAddressFunction(); 109 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 110 isl_id *RefId = Acc->getId(); 111 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 112 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 113 MPA = isl_multi_pw_aff_coalesce(MPA); 114 MPA = FunctionIndex(MPA, RefId, UserIndex); 115 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 116 Access = FunctionExpr(Access, RefId, UserExpr); 117 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 118 } 119 120 return RefToExpr; 121 } 122 123 /// Generate code for a GPU specific isl AST. 124 /// 125 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 126 /// generates code for general-prupose AST nodes, with special functionality 127 /// for generating GPU specific user nodes. 128 /// 129 /// @see GPUNodeBuilder::createUser 130 class GPUNodeBuilder : public IslNodeBuilder { 131 public: 132 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P, 133 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 134 DominatorTree &DT, Scop &S, gpu_prog *Prog) 135 : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) { 136 getExprBuilder().setIDToSAI(&IDToSAI); 137 } 138 139 /// Create after-run-time-check initialization code. 
140 void initializeAfterRTH(); 141 142 /// Finalize the generated scop. 143 virtual void finalize(); 144 145 private: 146 /// A vector of array base pointers for which a new ScopArrayInfo was created. 147 /// 148 /// This vector is used to delete the ScopArrayInfo when it is not needed any 149 /// more. 150 std::vector<Value *> LocalArrays; 151 152 /// A map from ScopArrays to their corresponding device allocations. 153 std::map<ScopArrayInfo *, Value *> DeviceAllocations; 154 155 /// The current GPU context. 156 Value *GPUContext; 157 158 /// A module containing GPU code. 159 /// 160 /// This pointer is only set in case we are currently generating GPU code. 161 std::unique_ptr<Module> GPUModule; 162 163 /// The GPU program we generate code for. 164 gpu_prog *Prog; 165 166 /// Class to free isl_ids. 167 class IslIdDeleter { 168 public: 169 void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 170 }; 171 172 /// A set containing all isl_ids allocated in a GPU kernel. 173 /// 174 /// By releasing this set all isl_ids will be freed. 175 std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 176 177 IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 178 179 /// Create code for user-defined AST nodes. 180 /// 181 /// These AST nodes can be of type: 182 /// 183 /// - ScopStmt: A computational statement (TODO) 184 /// - Kernel: A GPU kernel call (TODO) 185 /// - Data-Transfer: A GPU <-> CPU data-transfer 186 /// - In-kernel synchronization 187 /// - In-kernel memory copy statement 188 /// 189 /// @param UserStmt The ast node to generate code for. 190 virtual void createUser(__isl_take isl_ast_node *UserStmt); 191 192 enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; 193 194 /// Create code for a data transfer statement 195 /// 196 /// @param TransferStmt The data transfer statement. 197 /// @param Direction The direction in which to transfer data. 
198 void createDataTransfer(__isl_take isl_ast_node *TransferStmt, 199 enum DataDirection Direction); 200 201 /// Find llvm::Values referenced in GPU kernel. 202 /// 203 /// @param Kernel The kernel to scan for llvm::Values 204 /// 205 /// @returns A set of values referenced by the kernel. 206 SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); 207 208 /// Compute the sizes of the execution grid for a given kernel. 209 /// 210 /// @param Kernel The kernel to compute grid sizes for. 211 /// 212 /// @returns A tuple with grid sizes for X and Y dimension 213 std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel); 214 215 /// Compute the sizes of the thread blocks for a given kernel. 216 /// 217 /// @param Kernel The kernel to compute thread block sizes for. 218 /// 219 /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. 220 std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel); 221 222 /// Create kernel launch parameters. 223 /// 224 /// @param Kernel The kernel to create parameters for. 225 /// @param F The kernel function that has been created. 226 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 227 /// 228 /// @returns A stack allocated array with pointers to the parameter 229 /// values that are passed to the kernel. 230 Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, 231 SetVector<Value *> SubtreeValues); 232 233 /// Create GPU kernel. 234 /// 235 /// Code generate the kernel described by @p KernelStmt. 236 /// 237 /// @param KernelStmt The ast node to generate kernel code for. 238 void createKernel(__isl_take isl_ast_node *KernelStmt); 239 240 /// Generate code that computes the size of an array. 241 /// 242 /// @param Array The array for which to compute a size. 243 Value *getArraySize(gpu_array_info *Array); 244 245 /// Create kernel function. 
246 /// 247 /// Create a kernel function located in a newly created module that can serve 248 /// as target for device code generation. Set the Builder to point to the 249 /// start block of this newly created function. 250 /// 251 /// @param Kernel The kernel to generate code for. 252 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 253 void createKernelFunction(ppcg_kernel *Kernel, 254 SetVector<Value *> &SubtreeValues); 255 256 /// Create the declaration of a kernel function. 257 /// 258 /// The kernel function takes as arguments: 259 /// 260 /// - One i8 pointer for each external array reference used in the kernel. 261 /// - Host iterators 262 /// - Parameters 263 /// - Other LLVM Value references (TODO) 264 /// 265 /// @param Kernel The kernel to generate the function declaration for. 266 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 267 /// 268 /// @returns The newly declared function. 269 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 270 SetVector<Value *> &SubtreeValues); 271 272 /// Insert intrinsic functions to obtain thread and block ids. 273 /// 274 /// @param The kernel to generate the intrinsic functions for. 275 void insertKernelIntrinsics(ppcg_kernel *Kernel); 276 277 /// Create code for a ScopStmt called in @p Expr. 278 /// 279 /// @param Expr The expression containing the call. 280 /// @param KernelStmt The kernel statement referenced in the call. 281 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 282 283 /// Create an in-kernel synchronization call. 284 void createKernelSync(); 285 286 /// Create a PTX assembly string for the current GPU kernel. 287 /// 288 /// @returns A string containing the corresponding PTX assembly code. 289 std::string createKernelASM(); 290 291 /// Remove references from the dominator tree to the kernel function @p F. 292 /// 293 /// @param F The function to remove references to. 
294 void clearDominators(Function *F); 295 296 /// Remove references from scalar evolution to the kernel function @p F. 297 /// 298 /// @param F The function to remove references to. 299 void clearScalarEvolution(Function *F); 300 301 /// Remove references from loop info to the kernel function @p F. 302 /// 303 /// @param F The function to remove references to. 304 void clearLoops(Function *F); 305 306 /// Finalize the generation of the kernel function. 307 /// 308 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 309 /// dump its IR to stderr. 310 /// 311 /// @returns The Assembly string of the kernel. 312 std::string finalizeKernelFunction(); 313 314 /// Create code that allocates memory to store arrays on device. 315 void allocateDeviceArrays(); 316 317 /// Free all allocated device arrays. 318 void freeDeviceArrays(); 319 320 /// Create a call to initialize the GPU context. 321 /// 322 /// @returns A pointer to the newly initialized context. 323 Value *createCallInitContext(); 324 325 /// Create a call to get the device pointer for a kernel allocation. 326 /// 327 /// @param Allocation The Polly GPU allocation 328 /// 329 /// @returns The device parameter corresponding to this allocation. 330 Value *createCallGetDevicePtr(Value *Allocation); 331 332 /// Create a call to free the GPU context. 333 /// 334 /// @param Context A pointer to an initialized GPU context. 335 void createCallFreeContext(Value *Context); 336 337 /// Create a call to allocate memory on the device. 338 /// 339 /// @param Size The size of memory to allocate 340 /// 341 /// @returns A pointer that identifies this allocation. 342 Value *createCallAllocateMemoryForDevice(Value *Size); 343 344 /// Create a call to free a device array. 345 /// 346 /// @param Array The device array to free. 347 void createCallFreeDeviceMemory(Value *Array); 348 349 /// Create a call to copy data from host to device. 
350 /// 351 /// @param HostPtr A pointer to the host data that should be copied. 352 /// @param DevicePtr A device pointer specifying the location to copy to. 353 void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr, 354 Value *Size); 355 356 /// Create a call to copy data from device to host. 357 /// 358 /// @param DevicePtr A pointer to the device data that should be copied. 359 /// @param HostPtr A host pointer specifying the location to copy to. 360 void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, 361 Value *Size); 362 363 /// Create a call to get a kernel from an assembly string. 364 /// 365 /// @param Buffer The string describing the kernel. 366 /// @param Entry The name of the kernel function to call. 367 /// 368 /// @returns A pointer to a kernel object 369 Value *createCallGetKernel(Value *Buffer, Value *Entry); 370 371 /// Create a call to free a GPU kernel. 372 /// 373 /// @param GPUKernel THe kernel to free. 374 void createCallFreeKernel(Value *GPUKernel); 375 376 /// Create a call to launch a GPU kernel. 377 /// 378 /// @param GPUKernel The kernel to launch. 379 /// @param GridDimX The size of the first grid dimension. 380 /// @param GridDimY The size of the second grid dimension. 381 /// @param GridBlockX The size of the first block dimension. 382 /// @param GridBlockY The size of the second block dimension. 383 /// @param GridBlockZ The size of the third block dimension. 384 /// @param Paramters A pointer to an array that contains itself pointers to 385 /// the parameter values passed for each kernel argument. 
386 void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 387 Value *GridDimY, Value *BlockDimX, 388 Value *BlockDimY, Value *BlockDimZ, 389 Value *Parameters); 390 }; 391 392 void GPUNodeBuilder::initializeAfterRTH() { 393 GPUContext = createCallInitContext(); 394 allocateDeviceArrays(); 395 } 396 397 void GPUNodeBuilder::finalize() { 398 freeDeviceArrays(); 399 createCallFreeContext(GPUContext); 400 IslNodeBuilder::finalize(); 401 } 402 403 void GPUNodeBuilder::allocateDeviceArrays() { 404 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 405 406 for (int i = 0; i < Prog->n_array; ++i) { 407 gpu_array_info *Array = &Prog->array[i]; 408 auto *ScopArray = (ScopArrayInfo *)Array->user; 409 std::string DevArrayName("p_dev_array_"); 410 DevArrayName.append(Array->name); 411 412 Value *ArraySize = getArraySize(Array); 413 Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 414 DevArray->setName(DevArrayName); 415 DeviceAllocations[ScopArray] = DevArray; 416 } 417 418 isl_ast_build_free(Build); 419 } 420 421 void GPUNodeBuilder::freeDeviceArrays() { 422 for (auto &Array : DeviceAllocations) 423 createCallFreeDeviceMemory(Array.second); 424 } 425 426 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { 427 const char *Name = "polly_getKernel"; 428 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 429 Function *F = M->getFunction(Name); 430 431 // If F is not available, declare it. 
432 if (!F) { 433 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 434 std::vector<Type *> Args; 435 Args.push_back(Builder.getInt8PtrTy()); 436 Args.push_back(Builder.getInt8PtrTy()); 437 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 438 F = Function::Create(Ty, Linkage, Name, M); 439 } 440 441 return Builder.CreateCall(F, {Buffer, Entry}); 442 } 443 444 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { 445 const char *Name = "polly_getDevicePtr"; 446 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 447 Function *F = M->getFunction(Name); 448 449 // If F is not available, declare it. 450 if (!F) { 451 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 452 std::vector<Type *> Args; 453 Args.push_back(Builder.getInt8PtrTy()); 454 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 455 F = Function::Create(Ty, Linkage, Name, M); 456 } 457 458 return Builder.CreateCall(F, {Allocation}); 459 } 460 461 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 462 Value *GridDimY, Value *BlockDimX, 463 Value *BlockDimY, Value *BlockDimZ, 464 Value *Parameters) { 465 const char *Name = "polly_launchKernel"; 466 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 467 Function *F = M->getFunction(Name); 468 469 // If F is not available, declare it. 
470 if (!F) { 471 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 472 std::vector<Type *> Args; 473 Args.push_back(Builder.getInt8PtrTy()); 474 Args.push_back(Builder.getInt32Ty()); 475 Args.push_back(Builder.getInt32Ty()); 476 Args.push_back(Builder.getInt32Ty()); 477 Args.push_back(Builder.getInt32Ty()); 478 Args.push_back(Builder.getInt32Ty()); 479 Args.push_back(Builder.getInt8PtrTy()); 480 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 481 F = Function::Create(Ty, Linkage, Name, M); 482 } 483 484 Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 485 BlockDimZ, Parameters}); 486 } 487 488 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { 489 const char *Name = "polly_freeKernel"; 490 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 491 Function *F = M->getFunction(Name); 492 493 // If F is not available, declare it. 494 if (!F) { 495 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 496 std::vector<Type *> Args; 497 Args.push_back(Builder.getInt8PtrTy()); 498 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 499 F = Function::Create(Ty, Linkage, Name, M); 500 } 501 502 Builder.CreateCall(F, {GPUKernel}); 503 } 504 505 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 506 const char *Name = "polly_freeDeviceMemory"; 507 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 508 Function *F = M->getFunction(Name); 509 510 // If F is not available, declare it. 
511 if (!F) { 512 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 513 std::vector<Type *> Args; 514 Args.push_back(Builder.getInt8PtrTy()); 515 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 516 F = Function::Create(Ty, Linkage, Name, M); 517 } 518 519 Builder.CreateCall(F, {Array}); 520 } 521 522 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 523 const char *Name = "polly_allocateMemoryForDevice"; 524 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 525 Function *F = M->getFunction(Name); 526 527 // If F is not available, declare it. 528 if (!F) { 529 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 530 std::vector<Type *> Args; 531 Args.push_back(Builder.getInt64Ty()); 532 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 533 F = Function::Create(Ty, Linkage, Name, M); 534 } 535 536 return Builder.CreateCall(F, {Size}); 537 } 538 539 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, 540 Value *DeviceData, 541 Value *Size) { 542 const char *Name = "polly_copyFromHostToDevice"; 543 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 544 Function *F = M->getFunction(Name); 545 546 // If F is not available, declare it. 
547 if (!F) { 548 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 549 std::vector<Type *> Args; 550 Args.push_back(Builder.getInt8PtrTy()); 551 Args.push_back(Builder.getInt8PtrTy()); 552 Args.push_back(Builder.getInt64Ty()); 553 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 554 F = Function::Create(Ty, Linkage, Name, M); 555 } 556 557 Builder.CreateCall(F, {HostData, DeviceData, Size}); 558 } 559 560 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, 561 Value *HostData, 562 Value *Size) { 563 const char *Name = "polly_copyFromDeviceToHost"; 564 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 565 Function *F = M->getFunction(Name); 566 567 // If F is not available, declare it. 568 if (!F) { 569 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 570 std::vector<Type *> Args; 571 Args.push_back(Builder.getInt8PtrTy()); 572 Args.push_back(Builder.getInt8PtrTy()); 573 Args.push_back(Builder.getInt64Ty()); 574 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 575 F = Function::Create(Ty, Linkage, Name, M); 576 } 577 578 Builder.CreateCall(F, {DeviceData, HostData, Size}); 579 } 580 581 Value *GPUNodeBuilder::createCallInitContext() { 582 const char *Name = "polly_initContext"; 583 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 584 Function *F = M->getFunction(Name); 585 586 // If F is not available, declare it. 
587 if (!F) { 588 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 589 std::vector<Type *> Args; 590 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 591 F = Function::Create(Ty, Linkage, Name, M); 592 } 593 594 return Builder.CreateCall(F, {}); 595 } 596 597 void GPUNodeBuilder::createCallFreeContext(Value *Context) { 598 const char *Name = "polly_freeContext"; 599 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 600 Function *F = M->getFunction(Name); 601 602 // If F is not available, declare it. 603 if (!F) { 604 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 605 std::vector<Type *> Args; 606 Args.push_back(Builder.getInt8PtrTy()); 607 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 608 F = Function::Create(Ty, Linkage, Name, M); 609 } 610 611 Builder.CreateCall(F, {Context}); 612 } 613 614 /// Check if one string is a prefix of another. 615 /// 616 /// @param String The string in which to look for the prefix. 617 /// @param Prefix The prefix to look for. 
618 static bool isPrefix(std::string String, std::string Prefix) { 619 return String.find(Prefix) == 0; 620 } 621 622 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 623 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 624 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 625 626 if (!gpu_array_is_scalar(Array)) { 627 auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 628 isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 629 630 for (unsigned int i = 1; i < Array->n_index; i++) { 631 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 632 isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 633 Res = isl_ast_expr_mul(Res, Expr); 634 } 635 636 Value *NumElements = ExprBuilder.create(Res); 637 ArraySize = Builder.CreateMul(ArraySize, NumElements); 638 } 639 isl_ast_build_free(Build); 640 return ArraySize; 641 } 642 643 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 644 enum DataDirection Direction) { 645 isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); 646 isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 647 isl_id *Id = isl_ast_expr_get_id(Arg); 648 auto Array = (gpu_array_info *)isl_id_get_user(Id); 649 auto ScopArray = (ScopArrayInfo *)(Array->user); 650 651 Value *Size = getArraySize(Array); 652 Value *HostPtr = ScopArray->getBasePtr(); 653 654 Value *DevPtr = DeviceAllocations[ScopArray]; 655 656 if (gpu_array_is_scalar(Array)) { 657 HostPtr = Builder.CreateAlloca(ScopArray->getElementType()); 658 Builder.CreateStore(ScopArray->getBasePtr(), HostPtr); 659 } 660 661 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 662 663 if (Direction == HOST_TO_DEVICE) 664 createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 665 else 666 createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 667 668 isl_id_free(Id); 669 isl_ast_expr_free(Arg); 670 isl_ast_expr_free(Expr); 671 
isl_ast_node_free(TransferStmt); 672 } 673 674 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 675 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 676 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 677 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 678 isl_id_free(Id); 679 isl_ast_expr_free(StmtExpr); 680 681 const char *Str = isl_id_get_name(Id); 682 if (!strcmp(Str, "kernel")) { 683 createKernel(UserStmt); 684 isl_ast_expr_free(Expr); 685 return; 686 } 687 688 if (isPrefix(Str, "to_device")) { 689 createDataTransfer(UserStmt, HOST_TO_DEVICE); 690 isl_ast_expr_free(Expr); 691 return; 692 } 693 694 if (isPrefix(Str, "from_device")) { 695 createDataTransfer(UserStmt, DEVICE_TO_HOST); 696 isl_ast_expr_free(Expr); 697 return; 698 } 699 700 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 701 struct ppcg_kernel_stmt *KernelStmt = 702 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 703 isl_id_free(Anno); 704 705 switch (KernelStmt->type) { 706 case ppcg_kernel_domain: 707 createScopStmt(Expr, KernelStmt); 708 isl_ast_node_free(UserStmt); 709 return; 710 case ppcg_kernel_copy: 711 // TODO: Create kernel copy stmt 712 isl_ast_expr_free(Expr); 713 isl_ast_node_free(UserStmt); 714 return; 715 case ppcg_kernel_sync: 716 createKernelSync(); 717 isl_ast_expr_free(Expr); 718 isl_ast_node_free(UserStmt); 719 return; 720 } 721 722 isl_ast_expr_free(Expr); 723 isl_ast_node_free(UserStmt); 724 return; 725 } 726 727 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 728 ppcg_kernel_stmt *KernelStmt) { 729 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 730 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 731 732 LoopToScevMapT LTS; 733 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 734 735 createSubstitutions(Expr, Stmt, LTS); 736 737 if (Stmt->isBlockStmt()) 738 BlockGen.copyStmt(*Stmt, LTS, Indexes); 739 else 740 assert(0 && "Region statement not supported\n"); 741 } 742 743 void 
GPUNodeBuilder::createKernelSync() { 744 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 745 auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 746 Builder.CreateCall(Sync, {}); 747 } 748 749 /// Collect llvm::Values referenced from @p Node 750 /// 751 /// This function only applies to isl_ast_nodes that are user_nodes referring 752 /// to a ScopStmt. All other node types are ignore. 753 /// 754 /// @param Node The node to collect references for. 755 /// @param User A user pointer used as storage for the data that is collected. 756 /// 757 /// @returns isl_bool_true if data could be collected successfully. 758 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 759 if (isl_ast_node_get_type(Node) != isl_ast_node_user) 760 return isl_bool_true; 761 762 isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 763 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 764 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 765 const char *Str = isl_id_get_name(Id); 766 isl_id_free(Id); 767 isl_ast_expr_free(StmtExpr); 768 isl_ast_expr_free(Expr); 769 770 if (!isPrefix(Str, "Stmt")) 771 return isl_bool_true; 772 773 Id = isl_ast_node_get_annotation(Node); 774 auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 775 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 776 isl_id_free(Id); 777 778 addReferencesFromStmt(Stmt, User); 779 780 return isl_bool_true; 781 } 782 783 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 784 SetVector<Value *> SubtreeValues; 785 SetVector<const SCEV *> SCEVs; 786 SetVector<const Loop *> Loops; 787 SubtreeReferences References = { 788 LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 789 790 for (const auto &I : IDToValue) 791 SubtreeValues.insert(I.second); 792 793 isl_ast_node_foreach_descendant_top_down( 794 Kernel->tree, collectReferencesInGPUStmt, &References); 795 796 for (const SCEV *Expr : SCEVs) 797 findValues(Expr, SE, 
SubtreeValues); 798 799 for (auto &SAI : S.arrays()) 800 SubtreeValues.remove(SAI->getBasePtr()); 801 802 isl_space *Space = S.getParamSpace(); 803 for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 804 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 805 assert(IDToValue.count(Id)); 806 Value *Val = IDToValue[Id]; 807 SubtreeValues.remove(Val); 808 isl_id_free(Id); 809 } 810 isl_space_free(Space); 811 812 for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 813 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 814 assert(IDToValue.count(Id)); 815 Value *Val = IDToValue[Id]; 816 SubtreeValues.remove(Val); 817 isl_id_free(Id); 818 } 819 820 return SubtreeValues; 821 } 822 823 void GPUNodeBuilder::clearDominators(Function *F) { 824 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 825 std::vector<BasicBlock *> Nodes; 826 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 827 Nodes.push_back(I->getBlock()); 828 829 for (BasicBlock *BB : Nodes) 830 DT.eraseNode(BB); 831 } 832 833 void GPUNodeBuilder::clearScalarEvolution(Function *F) { 834 for (BasicBlock &BB : *F) { 835 Loop *L = LI.getLoopFor(&BB); 836 if (L) 837 SE.forgetLoop(L); 838 } 839 } 840 841 void GPUNodeBuilder::clearLoops(Function *F) { 842 for (BasicBlock &BB : *F) { 843 Loop *L = LI.getLoopFor(&BB); 844 if (L) 845 SE.forgetLoop(L); 846 LI.removeBlock(&BB); 847 } 848 } 849 850 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { 851 std::vector<Value *> Sizes; 852 isl_ast_build *Context = isl_ast_build_from_context(S.getContext()); 853 854 for (long i = 0; i < Kernel->n_grid; i++) { 855 isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i); 856 isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size); 857 Value *Res = ExprBuilder.create(GridSize); 858 Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 859 Sizes.push_back(Res); 860 } 861 
isl_ast_build_free(Context); 862 863 for (long i = Kernel->n_grid; i < 3; i++) 864 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 865 866 return std::make_tuple(Sizes[0], Sizes[1]); 867 } 868 869 std::tuple<Value *, Value *, Value *> 870 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { 871 std::vector<Value *> Sizes; 872 873 for (long i = 0; i < Kernel->n_block; i++) { 874 Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); 875 Sizes.push_back(Res); 876 } 877 878 for (long i = Kernel->n_block; i < 3; i++) 879 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 880 881 return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); 882 } 883 884 Value * 885 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, 886 SetVector<Value *> SubtreeValues) { 887 Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 888 std::distance(F->arg_begin(), F->arg_end())); 889 890 BasicBlock *EntryBlock = 891 &Builder.GetInsertBlock()->getParent()->getEntryBlock(); 892 std::string Launch = "polly_launch_" + std::to_string(Kernel->id); 893 Instruction *Parameters = 894 new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator()); 895 896 int Index = 0; 897 for (long i = 0; i < Prog->n_array; i++) { 898 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 899 continue; 900 901 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 902 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 903 904 Value *DevArray = DeviceAllocations[(ScopArrayInfo *)SAI]; 905 DevArray = createCallGetDevicePtr(DevArray); 906 Instruction *Param = new AllocaInst( 907 Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index), 908 EntryBlock->getTerminator()); 909 Builder.CreateStore(DevArray, Param); 910 Value *Slot = Builder.CreateGEP( 911 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 912 Value *ParamTyped = 913 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 914 
Builder.CreateStore(ParamTyped, Slot); 915 Index++; 916 } 917 918 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 919 920 for (long i = 0; i < NumHostIters; i++) { 921 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 922 Value *Val = IDToValue[Id]; 923 isl_id_free(Id); 924 Instruction *Param = new AllocaInst( 925 Val->getType(), Launch + "_param_" + std::to_string(Index), 926 EntryBlock->getTerminator()); 927 Builder.CreateStore(Val, Param); 928 Value *Slot = Builder.CreateGEP( 929 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 930 Value *ParamTyped = 931 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 932 Builder.CreateStore(ParamTyped, Slot); 933 Index++; 934 } 935 936 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 937 938 for (long i = 0; i < NumVars; i++) { 939 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 940 Value *Val = IDToValue[Id]; 941 isl_id_free(Id); 942 Instruction *Param = new AllocaInst( 943 Val->getType(), Launch + "_param_" + std::to_string(Index), 944 EntryBlock->getTerminator()); 945 Builder.CreateStore(Val, Param); 946 Value *Slot = Builder.CreateGEP( 947 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 948 Value *ParamTyped = 949 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 950 Builder.CreateStore(ParamTyped, Slot); 951 Index++; 952 } 953 954 for (auto Val : SubtreeValues) { 955 Instruction *Param = new AllocaInst( 956 Val->getType(), Launch + "_param_" + std::to_string(Index), 957 EntryBlock->getTerminator()); 958 Builder.CreateStore(Val, Param); 959 Value *Slot = Builder.CreateGEP( 960 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 961 Value *ParamTyped = 962 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 963 Builder.CreateStore(ParamTyped, Slot); 964 Index++; 965 } 966 967 auto Location = EntryBlock->getTerminator(); 968 return new BitCastInst(Parameters, Builder.getInt8PtrTy(), 969 Launch + 
"_params_i8ptr", Location); 970 } 971 972 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 973 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 974 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 975 isl_id_free(Id); 976 isl_ast_node_free(KernelStmt); 977 978 SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); 979 980 assert(Kernel->tree && "Device AST of kernel node is empty"); 981 982 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 983 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 984 ValueMapT HostValueMap = ValueMap; 985 986 SetVector<const Loop *> Loops; 987 988 // Create for all loops we depend on values that contain the current loop 989 // iteration. These values are necessary to generate code for SCEVs that 990 // depend on such loops. As a result we need to pass them to the subfunction. 991 for (const Loop *L : Loops) { 992 const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 993 SE.getUnknown(Builder.getInt64(1)), 994 L, SCEV::FlagAnyWrap); 995 Value *V = generateSCEV(OuterLIV); 996 OutsideLoopIterations[L] = SE.getUnknown(V); 997 SubtreeValues.insert(V); 998 } 999 1000 createKernelFunction(Kernel, SubtreeValues); 1001 1002 create(isl_ast_node_copy(Kernel->tree)); 1003 1004 Function *F = Builder.GetInsertBlock()->getParent(); 1005 clearDominators(F); 1006 clearScalarEvolution(F); 1007 clearLoops(F); 1008 1009 Builder.SetInsertPoint(&HostInsertPoint); 1010 IDToValue = HostIDs; 1011 1012 ValueMap = HostValueMap; 1013 ScalarMap.clear(); 1014 PHIOpMap.clear(); 1015 EscapeMap.clear(); 1016 IDToSAI.clear(); 1017 Annotator.resetAlternativeAliasBases(); 1018 for (auto &BasePtr : LocalArrays) 1019 S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); 1020 LocalArrays.clear(); 1021 1022 Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); 1023 1024 std::string ASMString = finalizeKernelFunction(); 1025 std::string Name = "kernel_" + 
std::to_string(Kernel->id);
  // Embed the kernel assembly and the kernel name as global strings and
  // obtain a kernel handle from them.
  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  Value *GPUKernel = createCallGetKernel(KernelString, NameString);

  Value *GridDimX, *GridDimY;
  std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);

  Value *BlockDimX, *BlockDimY, *BlockDimZ;
  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);

  createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters);
  createCallFreeKernel(GPUKernel);
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// The string always starts with "e". For a 32 bit architecture "-p:32:32" is
/// added, and "-i64:64-v16:16-v32:32-n16:32:64" is appended in both cases.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
///
/// @returns The DataLayout string.
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}

/// Create the declaration of the kernel function in the GPU module.
///
/// The function is named "kernel_<id>", returns void and takes, in this
/// order: an i8 pointer for each array the kernel requires, an i64 for each
/// set dimension and for each parameter dimension of Kernel->space, and one
/// argument of matching type for each value in @p SubtreeValues.
///
/// @param Kernel        The kernel to create the declaration for.
/// @param SubtreeValues The additional values passed to the kernel.
///
/// @returns The newly created function.
Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Args.push_back(Builder.getInt8PtrTy());
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++)
    Args.push_back(Builder.getInt64Ty());

  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
FN->setCallingConv(CallingConv::PTX_Kernel);

  // Name the arguments and register them, following the order in which the
  // argument types have been added above.
  auto Arg = FN->arg_begin();
  // NOTE(review): this loop is bounded by Kernel->n_array, whereas the loop
  // that created the argument types is bounded by Prog->n_array; presumably
  // both are equal -- confirm.
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    // Code generate the sizes of all but the outermost dimension.
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    // Register the kernel argument as an array in the scop and remember it
    // in LocalArrays.
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    // NOTE(review): Id is used as a map key after our reference to it has
    // been released. This presumably relies on Prog->array[i].space keeping
    // the id alive -- confirm.
    isl_id_free(Id);
    IDToSAI[Id] = SAIRep;
    Arg++;
  }

  // The ids registered in IDToValue are handed over to KernelIDs.
  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  // Map each additional value to the kernel argument that carries it.
  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}

/// Insert calls to the intrinsics that read the block and thread ids.
///
/// For each block id and each thread id of @p Kernel the corresponding NVVM
/// special register is read, cast to a 64 bit integer and registered in
/// IDToValue under the isl id.
///
/// @param Kernel The kernel to insert the intrinsic calls for.
void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
Intrinsic::nvvm_read_ptx_sreg_ctaid_y};

  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};

  // Call the intrinsic Intr, cast its result to i64 and register the result
  // as the value of Id. Ownership of Id is handed over to KernelIDs.
  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  // NOTE(review): IntrinsicsBID has two entries and IntrinsicsTID three; the
  // loops below assume n_grid <= 2 and n_block <= 3 -- confirm.
  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}

/// Create a new GPU module with an empty kernel function for @p Kernel.
///
/// The module is named "kernel_<id>" and set up for the
/// "nvptx64-nvidia-cuda" triple. After the function declaration has been
/// created, an entry block holding just a 'ret void' is added, the builder is
/// positioned at the beginning of this block and the intrinsics reading the
/// block and thread ids are inserted.
///
/// @param Kernel        The kernel to create the function for.
/// @param SubtreeValues The additional values passed to the kernel.
void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {

  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));
  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  // Register the kernel entry block in the dominator tree, using the block
  // the builder pointed to so far as its dominator.
  DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  ScopDetection::markFunctionAsInvalid(FN);

  insertKernelIntrinsics(Kernel);
}

/// Generate assembly for the current GPU module.
///
/// @returns The assembly emitted for the "nvptx64-nvidia-cuda" triple, or an
///          empty string if the target cannot be looked up or does not
///          support emitting assembly files.
std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;
  // NOTE(review): the result of createTargetMachine is used below without a
  // null check.
  std::unique_ptr<TargetMachine> TargetM(
      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
                                     Options, Optional<Reloc::Model>()));

  // The assembly is written into an in-memory string.
  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str();
}

/// Verify and optimize the GPU module and translate it to assembly.
///
/// If DumpKernelIR is set, the module is printed before it is optimized. If
/// DumpKernelASM is set, the generated assembly is printed.
///
/// @returns The assembly generated for the GPU module.
std::string GPUNodeBuilder::finalizeKernelFunction() {
  // Verify module.
  llvm::legacy::PassManager Passes;
  Passes.add(createVerifierPass());
  Passes.run(*GPUModule);

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  // Optimize module.
  llvm::legacy::PassManager OptPasses;
  PassManagerBuilder PassBuilder;
  PassBuilder.OptLevel = 3;
  PassBuilder.SizeLevel = 0;
  PassBuilder.populateModulePassManager(OptPasses);
  OptPasses.run(*GPUModule);

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  // NOTE(review): release() gives up ownership without destroying the
  // module, so the module is not freed here -- confirm that this is intended
  // and not meant to be reset().
  GPUModule.release();
  KernelIDs.clear();

  return Assembly;
}

namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  /// The scop that is currently processed.
1262 Scop *S; 1263 1264 LoopInfo *LI; 1265 DominatorTree *DT; 1266 ScalarEvolution *SE; 1267 const DataLayout *DL; 1268 RegionInfo *RI; 1269 1270 PPCGCodeGeneration() : ScopPass(ID) {} 1271 1272 /// Construct compilation options for PPCG. 1273 /// 1274 /// @returns The compilation options. 1275 ppcg_options *createPPCGOptions() { 1276 auto DebugOptions = 1277 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 1278 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 1279 1280 DebugOptions->dump_schedule_constraints = false; 1281 DebugOptions->dump_schedule = false; 1282 DebugOptions->dump_final_schedule = false; 1283 DebugOptions->dump_sizes = false; 1284 1285 Options->debug = DebugOptions; 1286 1287 Options->reschedule = true; 1288 Options->scale_tile_loops = false; 1289 Options->wrap = false; 1290 1291 Options->non_negative_parameters = false; 1292 Options->ctx = nullptr; 1293 Options->sizes = nullptr; 1294 1295 Options->tile_size = 32; 1296 1297 Options->use_private_memory = false; 1298 Options->use_shared_memory = false; 1299 Options->max_shared_memory = 0; 1300 1301 Options->target = PPCG_TARGET_CUDA; 1302 Options->openmp = false; 1303 Options->linearize_device_arrays = true; 1304 Options->live_range_reordering = false; 1305 1306 Options->opencl_compiler_options = nullptr; 1307 Options->opencl_use_gpu = false; 1308 Options->opencl_n_include_file = 0; 1309 Options->opencl_include_files = nullptr; 1310 Options->opencl_print_kernel_types = false; 1311 Options->opencl_embed_kernel_code = false; 1312 1313 Options->save_schedule_file = nullptr; 1314 Options->load_schedule_file = nullptr; 1315 1316 return Options; 1317 } 1318 1319 /// Get a tagged access relation containing all accesses of type @p AccessTy. 
1320 /// 1321 /// Instead of a normal access of the form: 1322 /// 1323 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 1324 /// 1325 /// a tagged access has the form 1326 /// 1327 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 1328 /// 1329 /// where 'id' is an additional space that references the memory access that 1330 /// triggered the access. 1331 /// 1332 /// @param AccessTy The type of the memory accesses to collect. 1333 /// 1334 /// @return The relation describing all tagged memory accesses. 1335 isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 1336 isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 1337 1338 for (auto &Stmt : *S) 1339 for (auto &Acc : Stmt) 1340 if (Acc->getType() == AccessTy) { 1341 isl_map *Relation = Acc->getAccessRelation(); 1342 Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 1343 1344 isl_space *Space = isl_map_get_space(Relation); 1345 Space = isl_space_range(Space); 1346 Space = isl_space_from_range(Space); 1347 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1348 isl_map *Universe = isl_map_universe(Space); 1349 Relation = isl_map_domain_product(Relation, Universe); 1350 Accesses = isl_union_map_add_map(Accesses, Relation); 1351 } 1352 1353 return Accesses; 1354 } 1355 1356 /// Get the set of all read accesses, tagged with the access id. 1357 /// 1358 /// @see getTaggedAccesses 1359 isl_union_map *getTaggedReads() { 1360 return getTaggedAccesses(MemoryAccess::READ); 1361 } 1362 1363 /// Get the set of all may (and must) accesses, tagged with the access id. 1364 /// 1365 /// @see getTaggedAccesses 1366 isl_union_map *getTaggedMayWrites() { 1367 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 1368 getTaggedAccesses(MemoryAccess::MUST_WRITE)); 1369 } 1370 1371 /// Get the set of all must accesses, tagged with the access id. 
1372 /// 1373 /// @see getTaggedAccesses 1374 isl_union_map *getTaggedMustWrites() { 1375 return getTaggedAccesses(MemoryAccess::MUST_WRITE); 1376 } 1377 1378 /// Collect parameter and array names as isl_ids. 1379 /// 1380 /// To reason about the different parameters and arrays used, ppcg requires 1381 /// a list of all isl_ids in use. As PPCG traditionally performs 1382 /// source-to-source compilation each of these isl_ids is mapped to the 1383 /// expression that represents it. As we do not have a corresponding 1384 /// expression in Polly, we just map each id to a 'zero' expression to match 1385 /// the data format that ppcg expects. 1386 /// 1387 /// @returns Retun a map from collected ids to 'zero' ast expressions. 1388 __isl_give isl_id_to_ast_expr *getNames() { 1389 auto *Names = isl_id_to_ast_expr_alloc( 1390 S->getIslCtx(), 1391 S->getNumParams() + std::distance(S->array_begin(), S->array_end())); 1392 auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx())); 1393 auto *Space = S->getParamSpace(); 1394 1395 for (int I = 0, E = S->getNumParams(); I < E; ++I) { 1396 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I); 1397 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1398 } 1399 1400 for (auto &Array : S->arrays()) { 1401 auto Id = Array->getBasePtrId(); 1402 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1403 } 1404 1405 isl_space_free(Space); 1406 isl_ast_expr_free(Zero); 1407 1408 return Names; 1409 } 1410 1411 /// Create a new PPCG scop from the current scop. 1412 /// 1413 /// The PPCG scop is initialized with data from the current polly::Scop. From 1414 /// this initial data, the data-dependences in the PPCG scop are initialized. 1415 /// We do not use Polly's dependence analysis for now, to ensure we match 1416 /// the PPCG default behaviour more closely. 1417 /// 1418 /// @returns A new ppcg scop. 
ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    // Context, iteration domains and access relations are taken from the
    // polly::Scop.
    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    // The dependence fields start out empty; compute_tagger() and
    // compute_dependences() are called on the scop below.
    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
1465 gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { 1466 gpu_stmt_access *Accesses = nullptr; 1467 1468 for (MemoryAccess *Acc : Stmt) { 1469 auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access); 1470 Access->read = Acc->isRead(); 1471 Access->write = Acc->isWrite(); 1472 Access->access = Acc->getAccessRelation(); 1473 isl_space *Space = isl_map_get_space(Access->access); 1474 Space = isl_space_range(Space); 1475 Space = isl_space_from_range(Space); 1476 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1477 isl_map *Universe = isl_map_universe(Space); 1478 Access->tagged_access = 1479 isl_map_domain_product(Acc->getAccessRelation(), Universe); 1480 Access->exact_write = Acc->isWrite(); 1481 Access->ref_id = Acc->getId(); 1482 Access->next = Accesses; 1483 Accesses = Access; 1484 } 1485 1486 return Accesses; 1487 } 1488 1489 /// Collect the list of GPU statements. 1490 /// 1491 /// Each statement has an id, a pointer to the underlying data structure, 1492 /// as well as a list with all memory accesses. 1493 /// 1494 /// TODO: Initialize the list of memory accesses. 1495 /// 1496 /// @returns A linked-list of statements. 1497 gpu_stmt *getStatements() { 1498 gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, 1499 std::distance(S->begin(), S->end())); 1500 1501 int i = 0; 1502 for (auto &Stmt : *S) { 1503 gpu_stmt *GPUStmt = &Stmts[i]; 1504 1505 GPUStmt->id = Stmt.getDomainId(); 1506 1507 // We use the pet stmt pointer to keep track of the Polly statements. 1508 GPUStmt->stmt = (pet_stmt *)&Stmt; 1509 GPUStmt->accesses = getStmtAccesses(Stmt); 1510 i++; 1511 } 1512 1513 return Stmts; 1514 } 1515 1516 /// Derive the extent of an array. 1517 /// 1518 /// The extent of an array is defined by the set of memory locations for 1519 /// which a memory access in the iteration domain exists. 1520 /// 1521 /// @param Array The array to derive the extent for. 
///
  /// @returns An isl_set describing the extent of the array.
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    // Restrict all accesses of the scop to the iteration domains and take the
    // range, i.e. the set of accessed memory locations.
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    // Extract the part that lives in the space of this array.
    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());
    isl_union_set_free(AccessUSet);

    return AccessSet;
  }

  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly from
  /// ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array The polly array from which to take the information.
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      // Project out all but the first dimension of the extent and take the
      // maximal value of the remaining dimension.
      isl_set *Dom = isl_set_copy(PPCGArray.extent);
      Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
      isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
      isl_set_free(Dom);
      // The bound is the maximal accessed index plus one. The constant 'one'
      // is defined on the domain of the maximum.
      Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
      isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
      isl_aff *One = isl_aff_zero_on_domain(LS);
      One = isl_aff_add_constant_si(One, 1);
      Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
      // Simplify the bound with respect to the context of the scop.
      Bound = isl_pw_aff_gist(Bound, S->getContext());
      PPCGArray.bound[0] = Bound;
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      // Pull the size back through a zero multi-affine expression built from
      // the size's domain space.
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Array : S->arrays()) {
      // Print the element type into a string, which serves as type name.
      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      // NOTE(review): the size is computed from getPrimitiveSizeInBits();
      // confirm this is correct for element types without a primitive size.
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar = false;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      PPCGArray.user = Array;

      // The bounds are derived from the extent and the number of dimensions
      // that have been set above.
      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Array : S->arrays()) {
      // Turn the set space of the array into a map space from the array to
      // itself and add the identity map on it.
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new gpu program description.
gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    // The program gets its own copies of the context and the access relations
    // of the ppcg scop.
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    // compute_may_persist() is called after the fields above have been set.
    PPCGProg->may_persist = compute_may_persist(PPCGProg);
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    return PPCGProg;
  }

  /// User data that is passed to printHostUser.
  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    /// The kernels printHostUser encountered in the host AST.
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P The printer to print to
  /// @param Options The printing options to use
  /// @param Node The node to print
  /// @param User A user pointer to carry additional data. This pointer is
  ///             expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
1678 static __isl_give isl_printer * 1679 printHostUser(__isl_take isl_printer *P, 1680 __isl_take isl_ast_print_options *Options, 1681 __isl_take isl_ast_node *Node, void *User) { 1682 auto Data = (struct PrintGPUUserData *)User; 1683 auto Id = isl_ast_node_get_annotation(Node); 1684 1685 if (Id) { 1686 bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 1687 1688 // If this is a user statement, format it ourselves as ppcg would 1689 // otherwise try to call pet functionality that is not available in 1690 // Polly. 1691 if (IsUser) { 1692 P = isl_printer_start_line(P); 1693 P = isl_printer_print_ast_node(P, Node); 1694 P = isl_printer_end_line(P); 1695 isl_id_free(Id); 1696 isl_ast_print_options_free(Options); 1697 return P; 1698 } 1699 1700 auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 1701 isl_id_free(Id); 1702 Data->Kernels.push_back(Kernel); 1703 } 1704 1705 return print_host_user(P, Options, Node, User); 1706 } 1707 1708 /// Print C code corresponding to the control flow in @p Kernel. 1709 /// 1710 /// @param Kernel The kernel to print 1711 void printKernel(ppcg_kernel *Kernel) { 1712 auto *P = isl_printer_to_str(S->getIslCtx()); 1713 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1714 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1715 P = isl_ast_node_print(Kernel->tree, P, Options); 1716 char *String = isl_printer_get_str(P); 1717 printf("%s\n", String); 1718 free(String); 1719 isl_printer_free(P); 1720 } 1721 1722 /// Print C code corresponding to the GPU code described by @p Tree. 1723 /// 1724 /// @param Tree An AST describing GPU code 1725 /// @param PPCGProg The PPCG program from which @Tree has been constructed. 
void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    // NOTE(review): Data.CudaInfo is not initialized here -- confirm that
    // print_host_user does not read it.
    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    // printHostUser is installed as user-node callback; it adds the kernels it
    // encounters to Data.Kernels.
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    // Print the kernels that have been collected while printing the host code.
    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  // 1) Compute new schedule for the program (get_schedule).
  // 2) Map schedule to GPU (map_to_device).
  // 3) Generate code for new schedule (generate_code).
  //
  // Steps 2) and 3) are only performed if the schedule contains a permutable
  // node.
  //
  // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer
  // is mostly CPU specific. Instead, we use PPCG's GPU code generation
  // strategy directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set scheduling strategy to same strategy PPCG is using.
1778 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 1779 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 1780 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 1781 1782 isl_schedule *Schedule = get_schedule(PPCGGen); 1783 1784 int has_permutable = has_any_permutable_node(Schedule); 1785 1786 if (!has_permutable || has_permutable < 0) { 1787 Schedule = isl_schedule_free(Schedule); 1788 } else { 1789 Schedule = map_to_device(PPCGGen, Schedule); 1790 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 1791 } 1792 1793 if (DumpSchedule) { 1794 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 1795 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 1796 P = isl_printer_print_str(P, "Schedule\n"); 1797 P = isl_printer_print_str(P, "========\n"); 1798 if (Schedule) 1799 P = isl_printer_print_schedule(P, Schedule); 1800 else 1801 P = isl_printer_print_str(P, "No schedule found\n"); 1802 1803 printf("%s\n", isl_printer_get_str(P)); 1804 isl_printer_free(P); 1805 } 1806 1807 if (DumpCode) { 1808 printf("Code\n"); 1809 printf("====\n"); 1810 if (PPCGGen->tree) 1811 printGPUTree(PPCGGen->tree, PPCGProg); 1812 else 1813 printf("No code generated\n"); 1814 } 1815 1816 isl_schedule_free(Schedule); 1817 1818 return PPCGGen; 1819 } 1820 1821 /// Free gpu_gen structure. 1822 /// 1823 /// @param PPCGGen The ppcg_gen object to free. 1824 void freePPCGGen(gpu_gen *PPCGGen) { 1825 isl_ast_node_free(PPCGGen->tree); 1826 isl_union_map_free(PPCGGen->sizes); 1827 isl_union_map_free(PPCGGen->used_sizes); 1828 free(PPCGGen); 1829 } 1830 1831 /// Free the options in the ppcg scop structure. 1832 /// 1833 /// ppcg is not freeing these options for us. To avoid leaks we do this 1834 /// ourselves. 1835 /// 1836 /// @param PPCGScop The scop referencing the options to free. 
1837 void freeOptions(ppcg_scop *PPCGScop) { 1838 free(PPCGScop->options->debug); 1839 PPCGScop->options->debug = nullptr; 1840 free(PPCGScop->options); 1841 PPCGScop->options = nullptr; 1842 } 1843 1844 /// Generate code for a given GPU AST described by @p Root. 1845 /// 1846 /// @param Root An isl_ast_node pointing to the root of the GPU AST. 1847 /// @param Prog The GPU Program to generate code for. 1848 void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { 1849 ScopAnnotator Annotator; 1850 Annotator.buildAliasScopes(*S); 1851 1852 Region *R = &S->getRegion(); 1853 1854 simplifyRegion(R, DT, LI, RI); 1855 1856 BasicBlock *EnteringBB = R->getEnteringBlock(); 1857 1858 PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator); 1859 1860 GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S, 1861 Prog); 1862 1863 // Only build the run-time condition and parameters _after_ having 1864 // introduced the conditional branch. This is important as the conditional 1865 // branch will guard the original scop from new induction variables that 1866 // the SCEVExpander may introduce while code generating the parameters and 1867 // which may introduce scalar dependences that prevent us from correctly 1868 // code generating this scop. 
1869 BasicBlock *StartBlock = 1870 executeScopConditionally(*S, this, Builder.getTrue()); 1871 1872 // TODO: Handle LICM 1873 // TODO: Verify run-time checks 1874 auto SplitBlock = StartBlock->getSinglePredecessor(); 1875 Builder.SetInsertPoint(SplitBlock->getTerminator()); 1876 NodeBuilder.addParameters(S->getContext()); 1877 Builder.SetInsertPoint(&*StartBlock->begin()); 1878 1879 NodeBuilder.initializeAfterRTH(); 1880 NodeBuilder.create(Root); 1881 NodeBuilder.finalize(); 1882 } 1883 1884 bool runOnScop(Scop &CurrentScop) override { 1885 S = &CurrentScop; 1886 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1887 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1888 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1889 DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout(); 1890 RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); 1891 1892 // We currently do not support scops with invariant loads. 1893 if (S->hasInvariantAccesses()) 1894 return false; 1895 1896 auto PPCGScop = createPPCGScop(); 1897 auto PPCGProg = createPPCGProg(PPCGScop); 1898 auto PPCGGen = generateGPU(PPCGScop, PPCGProg); 1899 1900 if (PPCGGen->tree) 1901 generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); 1902 1903 freeOptions(PPCGScop); 1904 freePPCGGen(PPCGGen); 1905 gpu_prog_free(PPCGProg); 1906 ppcg_scop_free(PPCGScop); 1907 1908 return true; 1909 } 1910 1911 void printScop(raw_ostream &, Scop &) const override {} 1912 1913 void getAnalysisUsage(AnalysisUsage &AU) const override { 1914 AU.addRequired<DominatorTreeWrapperPass>(); 1915 AU.addRequired<RegionInfoPass>(); 1916 AU.addRequired<ScalarEvolutionWrapperPass>(); 1917 AU.addRequired<ScopDetection>(); 1918 AU.addRequired<ScopInfoRegionPass>(); 1919 AU.addRequired<LoopInfoWrapperPass>(); 1920 1921 AU.addPreserved<AAResultsWrapperPass>(); 1922 AU.addPreserved<BasicAAWrapperPass>(); 1923 AU.addPreserved<LoopInfoWrapperPass>(); 1924 AU.addPreserved<DominatorTreeWrapperPass>(); 1925 
AU.addPreserved<GlobalsAAWrapperPass>(); 1926 AU.addPreserved<PostDominatorTreeWrapperPass>(); 1927 AU.addPreserved<ScopDetection>(); 1928 AU.addPreserved<ScalarEvolutionWrapperPass>(); 1929 AU.addPreserved<SCEVAAWrapperPass>(); 1930 1931 // FIXME: We do not yet add regions for the newly generated code to the 1932 // region tree. 1933 AU.addPreserved<RegionInfoPass>(); 1934 AU.addPreserved<ScopInfoRegionPass>(); 1935 } 1936 }; 1937 } 1938 1939 char PPCGCodeGeneration::ID = 1; 1940 1941 Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } 1942 1943 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", 1944 "Polly - Apply PPCG translation to SCOP", false, false) 1945 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 1946 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 1947 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 1948 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); 1949 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); 1950 INITIALIZE_PASS_DEPENDENCY(ScopDetection); 1951 INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", 1952 "Polly - Apply PPCG translation to SCOP", false, false) 1953