//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

#include "isl/union_map.h"

// ppcg is a plain C project; import its headers with C linkage.
extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

// Debugging aids: dump intermediate results of the individual stages of the
// GPU code generation pipeline (schedule, mapped C code, kernel IR and ASM).
static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

// Target architecture string handed to the NVPTX backend (e.g. "sm_30").
static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback used to generate the ast expressions for each
/// of the scheduled ScopStmts.
static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
    void *StmtT, isl_ast_build *Build,
    isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                       isl_id *Id, void *User),
    void *UserIndex,
    isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
    void *UserExpr) {

  ScopStmt *Stmt = (ScopStmt *)StmtT;

  isl_ctx *Ctx;

  if (!Stmt || !Build)
    return NULL;

  Ctx = isl_ast_build_get_ctx(Build);
  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);

  // For each memory access of the statement build an AST expression that
  // computes the accessed address, and register it under the access's id.
  for (MemoryAccess *Acc : *Stmt) {
    isl_map *AddrFunc = Acc->getAddressFunction();
    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
    isl_id *RefId = Acc->getId();
    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
    MPA = isl_multi_pw_aff_coalesce(MPA);
    // Let ppcg rewrite the index expression (e.g. for tiling / shared-memory
    // mappings) before turning it into an AST expression.
    MPA = FunctionIndex(MPA, RefId, UserIndex);
    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
    Access = FunctionExpr(Access, RefId, UserExpr);
    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
  }

  return RefToExpr;
}

/// Generate code for a GPU specific isl AST.
///
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
/// generates code for general-purpose AST nodes, with special functionality
/// for generating GPU specific user nodes.
///
/// @see GPUNodeBuilder::createUser
class GPUNodeBuilder : public IslNodeBuilder {
public:
  GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                 DominatorTree &DT, Scop &S, gpu_prog *Prog)
      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {
    getExprBuilder().setIDToSAI(&IDToSAI);
  }

  /// Create after-run-time-check initialization code.
  void initializeAfterRTH();

  /// Finalize the generated scop.
  virtual void finalize();

private:
  /// A vector of array base pointers for which a new ScopArrayInfo was created.
  ///
  /// This vector is used to delete the ScopArrayInfo when it is not needed any
  /// more.
  std::vector<Value *> LocalArrays;

  /// A map from ScopArrays to their corresponding device allocations.
  std::map<ScopArrayInfo *, Value *> DeviceAllocations;

  /// The current GPU context.
  Value *GPUContext;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  /// Mapping from isl_ids to the ScopArrayInfo objects they represent; shared
  /// with the expression builder via setIDToSAI() in the constructor.
  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  ///   - ScopStmt:       A computational statement (TODO)
  ///   - Kernel:         A GPU kernel call (TODO)
  ///   - Data-Transfer:  A GPU <-> CPU data-transfer
  ///   - In-kernel synchronization
  ///   - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  /// The direction of a host <-> device memory transfer.
  enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };

  /// Create code for a data transfer statement
  ///
  /// @param TransferStmt The data transfer statement.
  /// @param Direction The direction in which to transfer data.
  void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                          enum DataDirection Direction);

  /// Find llvm::Values referenced in GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values
  ///
  /// @returns A set of values referenced by the kernel.
  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);

  /// Create GPU kernel.
  ///
  /// Code generate the kernel described by @p KernelStmt.
  ///
  /// @param KernelStmt The ast node to generate kernel code for.
  void createKernel(__isl_take isl_ast_node *KernelStmt);

  /// Generate code that computes the size of an array.
  ///
  /// @param Array The array for which to compute a size.
  Value *getArraySize(gpu_array_info *Array);

  /// Create kernel function.
  ///
  /// Create a kernel function located in a newly created module that can serve
  /// as target for device code generation. Set the Builder to point to the
  /// start block of this newly created function.
  ///
  /// @param Kernel The kernel to generate code for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  void createKernelFunction(ppcg_kernel *Kernel,
                            SetVector<Value *> &SubtreeValues);

  /// Create the declaration of a kernel function.
  ///
  /// The kernel function takes as arguments:
  ///
  ///   - One i8 pointer for each external array reference used in the kernel.
  ///   - Host iterators
  ///   - Parameters
  ///   - Other LLVM Value references (TODO)
  ///
  /// @param Kernel The kernel to generate the function declaration for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns The newly declared function.
  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
                                     SetVector<Value *> &SubtreeValues);

  /// Insert intrinsic functions to obtain thread and block ids.
  ///
  /// @param Kernel The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Create code for a ScopStmt called in @p Expr.
  ///
  /// @param Expr The expression containing the call.
  /// @param KernelStmt The kernel statement referenced in the call.
  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);

  /// Create an in-kernel synchronization call.
  void createKernelSync();

  /// Create a PTX assembly string for the current GPU kernel.
  ///
  /// @returns A string containing the corresponding PTX assembly code.
  std::string createKernelASM();

  /// Remove references from the dominator tree to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearDominators(Function *F);

  /// Remove references from scalar evolution to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearScalarEvolution(Function *F);

  /// Remove references from loop info to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearLoops(Function *F);

  /// Finalize the generation of the kernel function.
  ///
  /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
  /// dump its IR to stderr.
  ///
  /// @returns The Assembly string of the kernel.
  std::string finalizeKernelFunction();

  /// Create code that allocates memory to store arrays on device.
  void allocateDeviceArrays();

  /// Free all allocated device arrays.
  void freeDeviceArrays();

  /// Create a call to initialize the GPU context.
  ///
  /// @returns A pointer to the newly initialized context.
  Value *createCallInitContext();

  /// Create a call to free the GPU context.
  ///
  /// @param Context A pointer to an initialized GPU context.
  void createCallFreeContext(Value *Context);

  /// Create a call to allocate memory on the device.
  ///
  /// @param Size The size of memory to allocate
  ///
  /// @returns A pointer that identifies this allocation.
  Value *createCallAllocateMemoryForDevice(Value *Size);

  /// Create a call to free a device array.
  ///
  /// @param Array The device array to free.
  void createCallFreeDeviceMemory(Value *Array);

  /// Create a call to copy data from host to device.
  ///
  /// @param HostPtr A pointer to the host data that should be copied.
  /// @param DevicePtr A device pointer specifying the location to copy to.
  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
                                      Value *Size);

  /// Create a call to copy data from device to host.
  ///
  /// @param DevicePtr A pointer to the device data that should be copied.
  /// @param HostPtr A host pointer specifying the location to copy to.
  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                      Value *Size);

  /// Create a call to get a kernel from an assembly string.
  ///
  /// @param Buffer The string describing the kernel.
  /// @param Entry The name of the kernel function to call.
  ///
  /// @returns A pointer to a kernel object
  Value *createCallGetKernel(Value *Buffer, Value *Entry);

  /// Create a call to free a GPU kernel.
  ///
  /// @param GPUKernel The kernel to free.
  void createCallFreeKernel(Value *GPUKernel);
};

void GPUNodeBuilder::initializeAfterRTH() {
  GPUContext = createCallInitContext();
  allocateDeviceArrays();
}

void GPUNodeBuilder::finalize() {
  freeDeviceArrays();
  createCallFreeContext(GPUContext);
  IslNodeBuilder::finalize();
}

void GPUNodeBuilder::allocateDeviceArrays() {
  // NOTE(review): Build is created and freed but never used in this function;
  // it looks like a leftover and could probably be removed.
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  for (int i = 0; i < Prog->n_array; ++i) {
    gpu_array_info *Array = &Prog->array[i];
    auto *ScopArray = (ScopArrayInfo *)Array->user;
    std::string DevArrayName("p_dev_array_");
    DevArrayName.append(Array->name);

    Value *ArraySize = getArraySize(Array);
    Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
    DevArray->setName(DevArrayName);
    DeviceAllocations[ScopArray] = DevArray;
  }

  isl_ast_build_free(Build);
}

void GPUNodeBuilder::freeDeviceArrays() {
  for (auto &Array : DeviceAllocations)
    createCallFreeDeviceMemory(Array.second);
}

Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *Name = "polly_getKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Buffer, Entry});
}

void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel});
}

void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  const char *Name = "polly_freeDeviceMemory";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Array});
}

Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  const char *Name = "polly_allocateMemoryForDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Size});
}

void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                    Value *DeviceData,
                                                    Value *Size) {
  const char *Name = "polly_copyFromHostToDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {HostData, DeviceData, Size});
}

void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                    Value *HostData,
                                                    Value *Size) {
  const char *Name = "polly_copyFromDeviceToHost";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {DeviceData, HostData, Size});
}

Value *GPUNodeBuilder::createCallInitContext() {
  const char *Name = "polly_initContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {});
}

void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  const char *Name = "polly_freeContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Context});
}

/// Check if one string is a prefix of another.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
526 static bool isPrefix(std::string String, std::string Prefix) { 527 return String.find(Prefix) == 0; 528 } 529 530 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 531 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 532 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 533 534 if (!gpu_array_is_scalar(Array)) { 535 auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 536 isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 537 538 for (unsigned int i = 1; i < Array->n_index; i++) { 539 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 540 isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 541 Res = isl_ast_expr_mul(Res, Expr); 542 } 543 544 Value *NumElements = ExprBuilder.create(Res); 545 ArraySize = Builder.CreateMul(ArraySize, NumElements); 546 } 547 isl_ast_build_free(Build); 548 return ArraySize; 549 } 550 551 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 552 enum DataDirection Direction) { 553 isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); 554 isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 555 isl_id *Id = isl_ast_expr_get_id(Arg); 556 auto Array = (gpu_array_info *)isl_id_get_user(Id); 557 auto ScopArray = (ScopArrayInfo *)(Array->user); 558 559 Value *Size = getArraySize(Array); 560 Value *HostPtr = ScopArray->getBasePtr(); 561 562 Value *DevPtr = DeviceAllocations[ScopArray]; 563 564 if (gpu_array_is_scalar(Array)) { 565 HostPtr = Builder.CreateAlloca(ScopArray->getElementType()); 566 Builder.CreateStore(ScopArray->getBasePtr(), HostPtr); 567 } 568 569 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 570 571 if (Direction == HOST_TO_DEVICE) 572 createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 573 else 574 createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 575 576 isl_id_free(Id); 577 isl_ast_expr_free(Arg); 578 isl_ast_expr_free(Expr); 579 
isl_ast_node_free(TransferStmt); 580 } 581 582 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 583 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 584 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 585 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 586 isl_id_free(Id); 587 isl_ast_expr_free(StmtExpr); 588 589 const char *Str = isl_id_get_name(Id); 590 if (!strcmp(Str, "kernel")) { 591 createKernel(UserStmt); 592 isl_ast_expr_free(Expr); 593 return; 594 } 595 596 if (isPrefix(Str, "to_device")) { 597 createDataTransfer(UserStmt, HOST_TO_DEVICE); 598 isl_ast_expr_free(Expr); 599 return; 600 } 601 602 if (isPrefix(Str, "from_device")) { 603 createDataTransfer(UserStmt, DEVICE_TO_HOST); 604 isl_ast_expr_free(Expr); 605 return; 606 } 607 608 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 609 struct ppcg_kernel_stmt *KernelStmt = 610 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 611 isl_id_free(Anno); 612 613 switch (KernelStmt->type) { 614 case ppcg_kernel_domain: 615 createScopStmt(Expr, KernelStmt); 616 isl_ast_node_free(UserStmt); 617 return; 618 case ppcg_kernel_copy: 619 // TODO: Create kernel copy stmt 620 isl_ast_expr_free(Expr); 621 isl_ast_node_free(UserStmt); 622 return; 623 case ppcg_kernel_sync: 624 createKernelSync(); 625 isl_ast_expr_free(Expr); 626 isl_ast_node_free(UserStmt); 627 return; 628 } 629 630 isl_ast_expr_free(Expr); 631 isl_ast_node_free(UserStmt); 632 return; 633 } 634 635 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 636 ppcg_kernel_stmt *KernelStmt) { 637 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 638 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 639 640 LoopToScevMapT LTS; 641 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 642 643 createSubstitutions(Expr, Stmt, LTS); 644 645 if (Stmt->isBlockStmt()) 646 BlockGen.copyStmt(*Stmt, LTS, Indexes); 647 else 648 assert(0 && "Region statement not supported\n"); 649 } 650 651 void 
GPUNodeBuilder::createKernelSync() { 652 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 653 auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 654 Builder.CreateCall(Sync, {}); 655 } 656 657 /// Collect llvm::Values referenced from @p Node 658 /// 659 /// This function only applies to isl_ast_nodes that are user_nodes referring 660 /// to a ScopStmt. All other node types are ignore. 661 /// 662 /// @param Node The node to collect references for. 663 /// @param User A user pointer used as storage for the data that is collected. 664 /// 665 /// @returns isl_bool_true if data could be collected successfully. 666 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 667 if (isl_ast_node_get_type(Node) != isl_ast_node_user) 668 return isl_bool_true; 669 670 isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 671 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 672 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 673 const char *Str = isl_id_get_name(Id); 674 isl_id_free(Id); 675 isl_ast_expr_free(StmtExpr); 676 isl_ast_expr_free(Expr); 677 678 if (!isPrefix(Str, "Stmt")) 679 return isl_bool_true; 680 681 Id = isl_ast_node_get_annotation(Node); 682 auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 683 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 684 isl_id_free(Id); 685 686 addReferencesFromStmt(Stmt, User); 687 688 return isl_bool_true; 689 } 690 691 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 692 SetVector<Value *> SubtreeValues; 693 SetVector<const SCEV *> SCEVs; 694 SetVector<const Loop *> Loops; 695 SubtreeReferences References = { 696 LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 697 698 for (const auto &I : IDToValue) 699 SubtreeValues.insert(I.second); 700 701 isl_ast_node_foreach_descendant_top_down( 702 Kernel->tree, collectReferencesInGPUStmt, &References); 703 704 for (const SCEV *Expr : SCEVs) 705 findValues(Expr, SE, 
SubtreeValues); 706 707 for (auto &SAI : S.arrays()) 708 SubtreeValues.remove(SAI.second->getBasePtr()); 709 710 isl_space *Space = S.getParamSpace(); 711 for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 712 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 713 assert(IDToValue.count(Id)); 714 Value *Val = IDToValue[Id]; 715 SubtreeValues.remove(Val); 716 isl_id_free(Id); 717 } 718 isl_space_free(Space); 719 720 for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 721 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 722 assert(IDToValue.count(Id)); 723 Value *Val = IDToValue[Id]; 724 SubtreeValues.remove(Val); 725 isl_id_free(Id); 726 } 727 728 return SubtreeValues; 729 } 730 731 void GPUNodeBuilder::clearDominators(Function *F) { 732 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 733 std::vector<BasicBlock *> Nodes; 734 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 735 Nodes.push_back(I->getBlock()); 736 737 for (BasicBlock *BB : Nodes) 738 DT.eraseNode(BB); 739 } 740 741 void GPUNodeBuilder::clearScalarEvolution(Function *F) { 742 for (BasicBlock &BB : *F) { 743 Loop *L = LI.getLoopFor(&BB); 744 if (L) 745 SE.forgetLoop(L); 746 } 747 } 748 749 void GPUNodeBuilder::clearLoops(Function *F) { 750 for (BasicBlock &BB : *F) { 751 Loop *L = LI.getLoopFor(&BB); 752 if (L) 753 SE.forgetLoop(L); 754 LI.removeBlock(&BB); 755 } 756 } 757 758 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 759 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 760 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 761 isl_id_free(Id); 762 isl_ast_node_free(KernelStmt); 763 764 SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); 765 766 assert(Kernel->tree && "Device AST of kernel node is empty"); 767 768 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 769 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 770 ValueMapT HostValueMap = 
ValueMap; 771 772 SetVector<const Loop *> Loops; 773 774 // Create for all loops we depend on values that contain the current loop 775 // iteration. These values are necessary to generate code for SCEVs that 776 // depend on such loops. As a result we need to pass them to the subfunction. 777 for (const Loop *L : Loops) { 778 const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 779 SE.getUnknown(Builder.getInt64(1)), 780 L, SCEV::FlagAnyWrap); 781 Value *V = generateSCEV(OuterLIV); 782 OutsideLoopIterations[L] = SE.getUnknown(V); 783 SubtreeValues.insert(V); 784 } 785 786 createKernelFunction(Kernel, SubtreeValues); 787 788 create(isl_ast_node_copy(Kernel->tree)); 789 790 Function *F = Builder.GetInsertBlock()->getParent(); 791 clearDominators(F); 792 clearScalarEvolution(F); 793 clearLoops(F); 794 795 Builder.SetInsertPoint(&HostInsertPoint); 796 IDToValue = HostIDs; 797 798 ValueMap = HostValueMap; 799 ScalarMap.clear(); 800 PHIOpMap.clear(); 801 EscapeMap.clear(); 802 IDToSAI.clear(); 803 Annotator.resetAlternativeAliasBases(); 804 for (auto &BasePtr : LocalArrays) 805 S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); 806 LocalArrays.clear(); 807 808 std::string ASMString = finalizeKernelFunction(); 809 std::string Name = "kernel_" + std::to_string(Kernel->id); 810 Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); 811 Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); 812 Value *GPUKernel = createCallGetKernel(KernelString, NameString); 813 createCallFreeKernel(GPUKernel); 814 } 815 816 /// Compute the DataLayout string for the NVPTX backend. 817 /// 818 /// @param is64Bit Are we looking for a 64 bit architecture? 
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "e";

  // 32-bit targets additionally need 32-bit pointers.
  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  // One i8* argument per array the kernel actually references.
  // NOTE(review): this loop runs over Prog->n_array while the naming loop
  // below runs over Kernel->n_array -- presumably these agree; verify.
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Args.push_back(Builder.getInt8PtrTy());
  }

  // One i64 argument per host iterator and per scop parameter.
  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++)
    Args.push_back(Builder.getInt64Ty());

  // Remaining referenced llvm::Values are passed through with their own type.
  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
  FN->setCallingConv(CallingConv::PTX_Kernel);

  // Name the arguments and register the mappings code generation relies on.
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    // Dimension 0 has no size bound; collect sizes of the inner dimensions.
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    isl_id_free(Id);
    // NOTE(review): Id is used as a map key *after* isl_id_free above, and is
    // not added to KernelIDs for ownership. This relies on Prog->array[i].space
    // still holding a reference that keeps the pointer valid -- consider
    // assigning into IDToSAI before freeing (or keeping the reference).
    IDToSAI[Id] = SAIRep;
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    // KernelIDs takes ownership of Id and frees it when the set is cleared.
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}

void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  // NOTE(review): only x/y block-id and x/y/z thread-id intrinsics are listed;
  // this implicitly assumes Kernel->n_grid <= 2 and Kernel->n_block <= 3 --
  // larger values would read past these tables.
  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};

  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};

  // Emit the intrinsic call, widen the i32 result to i64, and make the value
  // available under the kernel's block/thread isl_id.
  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}

void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {

  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));
  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  DT.addNewBlock(EntryBlock, PrevBlock);

  // Terminate the entry block with ret void, then position the builder in
  // front of the terminator so subsequent code lands inside the kernel.
  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;
  std::unique_ptr<TargetMachine> TargetM(
      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
                                     Options, Optional<Reloc::Model>()));

  // Run the NVPTX code generator over the kernel module, collecting the
  // emitted PTX in an in-memory buffer.
  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);
998 return ASMStream.str(); 999 } 1000 1001 std::string GPUNodeBuilder::finalizeKernelFunction() { 1002 // Verify module. 1003 llvm::legacy::PassManager Passes; 1004 Passes.add(createVerifierPass()); 1005 Passes.run(*GPUModule); 1006 1007 if (DumpKernelIR) 1008 outs() << *GPUModule << "\n"; 1009 1010 // Optimize module. 1011 llvm::legacy::PassManager OptPasses; 1012 PassManagerBuilder PassBuilder; 1013 PassBuilder.OptLevel = 3; 1014 PassBuilder.SizeLevel = 0; 1015 PassBuilder.populateModulePassManager(OptPasses); 1016 OptPasses.run(*GPUModule); 1017 1018 std::string Assembly = createKernelASM(); 1019 1020 if (DumpKernelASM) 1021 outs() << Assembly << "\n"; 1022 1023 GPUModule.release(); 1024 KernelIDs.clear(); 1025 1026 return Assembly; 1027 } 1028 1029 namespace { 1030 class PPCGCodeGeneration : public ScopPass { 1031 public: 1032 static char ID; 1033 1034 /// The scop that is currently processed. 1035 Scop *S; 1036 1037 LoopInfo *LI; 1038 DominatorTree *DT; 1039 ScalarEvolution *SE; 1040 const DataLayout *DL; 1041 RegionInfo *RI; 1042 1043 PPCGCodeGeneration() : ScopPass(ID) {} 1044 1045 /// Construct compilation options for PPCG. 1046 /// 1047 /// @returns The compilation options. 
1048 ppcg_options *createPPCGOptions() { 1049 auto DebugOptions = 1050 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 1051 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 1052 1053 DebugOptions->dump_schedule_constraints = false; 1054 DebugOptions->dump_schedule = false; 1055 DebugOptions->dump_final_schedule = false; 1056 DebugOptions->dump_sizes = false; 1057 1058 Options->debug = DebugOptions; 1059 1060 Options->reschedule = true; 1061 Options->scale_tile_loops = false; 1062 Options->wrap = false; 1063 1064 Options->non_negative_parameters = false; 1065 Options->ctx = nullptr; 1066 Options->sizes = nullptr; 1067 1068 Options->tile_size = 32; 1069 1070 Options->use_private_memory = false; 1071 Options->use_shared_memory = false; 1072 Options->max_shared_memory = 0; 1073 1074 Options->target = PPCG_TARGET_CUDA; 1075 Options->openmp = false; 1076 Options->linearize_device_arrays = true; 1077 Options->live_range_reordering = false; 1078 1079 Options->opencl_compiler_options = nullptr; 1080 Options->opencl_use_gpu = false; 1081 Options->opencl_n_include_file = 0; 1082 Options->opencl_include_files = nullptr; 1083 Options->opencl_print_kernel_types = false; 1084 Options->opencl_embed_kernel_code = false; 1085 1086 Options->save_schedule_file = nullptr; 1087 Options->load_schedule_file = nullptr; 1088 1089 return Options; 1090 } 1091 1092 /// Get a tagged access relation containing all accesses of type @p AccessTy. 1093 /// 1094 /// Instead of a normal access of the form: 1095 /// 1096 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 1097 /// 1098 /// a tagged access has the form 1099 /// 1100 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 1101 /// 1102 /// where 'id' is an additional space that references the memory access that 1103 /// triggered the access. 1104 /// 1105 /// @param AccessTy The type of the memory accesses to collect. 1106 /// 1107 /// @return The relation describing all tagged memory accesses. 
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          // Build the universe map '[id[]] -> []' and attach it as a second
          // domain factor, which turns 'Stmt -> Array' into
          // '[Stmt -> id[]] -> Array'.
          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }

  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }

  /// Get the set of all must accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns Return a map from collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      // isl_id_to_ast_expr_set takes ownership of Id and the expression copy.
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array.second->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop. From
  /// this initial data, the data-dependences in the PPCG scop are initialized.
  /// We do not use Polly's dependence analysis for now, to ensure we match
  /// the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    // No source-location information is available; ppcg only uses start/end
    // for printing purposes.
    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    // Polly does not model kill statements, hence the empty map.
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    // Let ppcg compute the tagger and the dependences from the access
    // information filled in above.
    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    // Build a singly-linked list (newest entry first) of gpu_stmt_access
    // records, one per memory access of the statement.
    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      // Tag the access relation with the access id, mirroring
      // getTaggedAccesses().
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = Acc->isWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// TODO: Initialize the list of memory accesses.
  ///
  /// @returns A linked-list of statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

  /// Derive the extent of an array.
  ///
  /// The extent of an array is defined by the set of memory locations for
  /// which a memory access in the iteration domain exists.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    // Extract only the accesses that live in this array's space.
    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());
    isl_union_set_free(AccessUSet);

    return AccessSet;
  }

  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly from
  /// ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array The polly array from which to take the information.
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      // Project the extent down to its first dimension and compute
      // 'max(dim 0) + 1' as the size of the outermost dimension.
      isl_set *Dom = isl_set_copy(PPCGArray.extent);
      Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
      isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
      isl_set_free(Dom);
      Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
      isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
      isl_aff *One = isl_aff_zero_on_domain(LS);
      One = isl_aff_add_constant_si(One, 1);
      Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
      Bound = isl_pw_aff_gist(Bound, S->getContext());
      PPCGArray.bound[0] = Bound;
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      // Pull the size back to a parameter-only (zero-dimensional) domain as
      // ppcg expects bounds without set dimensions.
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Element : S->arrays()) {
      ScopArrayInfo *Array = Element.second.get();

      // Render the LLVM element type as a string, as ppcg stores array
      // element types textually.
      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      // strdup'ed strings are owned by ppcg and freed by gpu_prog_free().
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar = false;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      // Keep a back-pointer to the Polly array for later lookups.
      PPCGArray.user = Array;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Item : S->arrays()) {
      ScopArrayInfo *Array = Item.second.get();
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new gpu program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    // Polly arrays are not nested, so inner and outer views are identical.
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->may_persist = compute_may_persist(PPCGProg);
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    return PPCGProg;
  }

  /// User data passed through the isl AST printing callbacks.
  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    // Kernels encountered while printing the host AST.
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P The printer to print to
  /// @param Options The printing options to use
  /// @param Node The node to print
  /// @param User A user pointer to carry additional data. This pointer is
  ///             expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      // Non-user annotations carry the kernel launched at this point; record
      // it so printGPUTree can print the kernel bodies afterwards.
      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    // Delegate everything else to ppcg's default host printer.
    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    // isl_printer_get_str returns a malloc'd string owned by the caller.
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

  /// Print C code corresponding to the GPU code described by @p Tree.
  ///
  /// @param Tree     An AST describing GPU code
  /// @param PPCGProg The PPCG program from which @Tree has been constructed.
1502 void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 1503 auto *P = isl_printer_to_str(S->getIslCtx()); 1504 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1505 1506 PrintGPUUserData Data; 1507 Data.PPCGProg = PPCGProg; 1508 1509 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1510 Options = 1511 isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 1512 P = isl_ast_node_print(Tree, P, Options); 1513 char *String = isl_printer_get_str(P); 1514 printf("# host\n"); 1515 printf("%s\n", String); 1516 free(String); 1517 isl_printer_free(P); 1518 1519 for (auto Kernel : Data.Kernels) { 1520 printf("# kernel%d\n", Kernel->id); 1521 printKernel(Kernel); 1522 } 1523 } 1524 1525 // Generate a GPU program using PPCG. 1526 // 1527 // GPU mapping consists of multiple steps: 1528 // 1529 // 1) Compute new schedule for the program. 1530 // 2) Map schedule to GPU (TODO) 1531 // 3) Generate code for new schedule (TODO) 1532 // 1533 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 1534 // is mostly CPU specific. Instead, we use PPCG's GPU code generation 1535 // strategy directly from this pass. 1536 gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 1537 1538 auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 1539 1540 PPCGGen->ctx = S->getIslCtx(); 1541 PPCGGen->options = PPCGScop->options; 1542 PPCGGen->print = nullptr; 1543 PPCGGen->print_user = nullptr; 1544 PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 1545 PPCGGen->prog = PPCGProg; 1546 PPCGGen->tree = nullptr; 1547 PPCGGen->types.n = 0; 1548 PPCGGen->types.name = nullptr; 1549 PPCGGen->sizes = nullptr; 1550 PPCGGen->used_sizes = nullptr; 1551 PPCGGen->kernel_id = 0; 1552 1553 // Set scheduling strategy to same strategy PPCG is using. 
1554 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 1555 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 1556 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 1557 1558 isl_schedule *Schedule = get_schedule(PPCGGen); 1559 1560 int has_permutable = has_any_permutable_node(Schedule); 1561 1562 if (!has_permutable || has_permutable < 0) { 1563 Schedule = isl_schedule_free(Schedule); 1564 } else { 1565 Schedule = map_to_device(PPCGGen, Schedule); 1566 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 1567 } 1568 1569 if (DumpSchedule) { 1570 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 1571 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 1572 P = isl_printer_print_str(P, "Schedule\n"); 1573 P = isl_printer_print_str(P, "========\n"); 1574 if (Schedule) 1575 P = isl_printer_print_schedule(P, Schedule); 1576 else 1577 P = isl_printer_print_str(P, "No schedule found\n"); 1578 1579 printf("%s\n", isl_printer_get_str(P)); 1580 isl_printer_free(P); 1581 } 1582 1583 if (DumpCode) { 1584 printf("Code\n"); 1585 printf("====\n"); 1586 if (PPCGGen->tree) 1587 printGPUTree(PPCGGen->tree, PPCGProg); 1588 else 1589 printf("No code generated\n"); 1590 } 1591 1592 isl_schedule_free(Schedule); 1593 1594 return PPCGGen; 1595 } 1596 1597 /// Free gpu_gen structure. 1598 /// 1599 /// @param PPCGGen The ppcg_gen object to free. 1600 void freePPCGGen(gpu_gen *PPCGGen) { 1601 isl_ast_node_free(PPCGGen->tree); 1602 isl_union_map_free(PPCGGen->sizes); 1603 isl_union_map_free(PPCGGen->used_sizes); 1604 free(PPCGGen); 1605 } 1606 1607 /// Free the options in the ppcg scop structure. 1608 /// 1609 /// ppcg is not freeing these options for us. To avoid leaks we do this 1610 /// ourselves. 1611 /// 1612 /// @param PPCGScop The scop referencing the options to free. 
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU Program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S,
                               Prog);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important as the conditional
    // branch will guard the original scop from new induction variables that
    // the SCEVExpander may introduce while code generating the parameters and
    // which may introduce scalar dependences that prevent us from correctly
    // code generating this scop.
    BasicBlock *StartBlock =
        executeScopConditionally(*S, this, Builder.getTrue());

    // TODO: Handle LICM
    // TODO: Verify run-time checks
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());
    Builder.SetInsertPoint(&*StartBlock->begin());

    NodeBuilder.initializeAfterRTH();
    NodeBuilder.create(Root);
    NodeBuilder.finalize();
  }

  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support scops with invariant loads.
    if (S->hasInvariantAccesses())
      return false;

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    // Only emit host/kernel IR if ppcg actually produced a device mapping.
    if (PPCGGen->tree)
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    //        region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
}

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)