1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg 11 // GPU mapping strategy. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "polly/CodeGen/PPCGCodeGeneration.h" 16 #include "polly/CodeGen/IslAst.h" 17 #include "polly/CodeGen/IslNodeBuilder.h" 18 #include "polly/CodeGen/Utils.h" 19 #include "polly/DependenceInfo.h" 20 #include "polly/LinkAllPasses.h" 21 #include "polly/Options.h" 22 #include "polly/ScopDetection.h" 23 #include "polly/ScopInfo.h" 24 #include "polly/Support/SCEVValidator.h" 25 #include "llvm/ADT/PostOrderIterator.h" 26 #include "llvm/Analysis/AliasAnalysis.h" 27 #include "llvm/Analysis/BasicAliasAnalysis.h" 28 #include "llvm/Analysis/GlobalsModRef.h" 29 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 30 #include "llvm/Analysis/TargetLibraryInfo.h" 31 #include "llvm/Analysis/TargetTransformInfo.h" 32 #include "llvm/IR/LegacyPassManager.h" 33 #include "llvm/IR/Verifier.h" 34 #include "llvm/IRReader/IRReader.h" 35 #include "llvm/Linker/Linker.h" 36 #include "llvm/Support/TargetRegistry.h" 37 #include "llvm/Support/TargetSelect.h" 38 #include "llvm/Target/TargetMachine.h" 39 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 40 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 41 42 #include "isl/union_map.h" 43 44 extern "C" { 45 #include "ppcg/cuda.h" 46 #include "ppcg/gpu.h" 47 #include "ppcg/gpu_print.h" 48 #include "ppcg/ppcg.h" 49 #include "ppcg/schedule.h" 50 } 51 52 #include "llvm/Support/Debug.h" 53 54 using namespace polly; 55 using namespace llvm; 56 57 #define DEBUG_TYPE "polly-codegen-ppcg" 58 59 static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule", 60 cl::desc("Dump the computed GPU Schedule"), 61 cl::Hidden, cl::init(false), cl::ZeroOrMore, 62 cl::cat(PollyCategory)); 63 64 static cl::opt<bool> 65 DumpCode("polly-acc-dump-code", 66 cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, 67 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 68 69 static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir", 70 cl::desc("Dump the kernel LLVM-IR"), 71 cl::Hidden, cl::init(false), cl::ZeroOrMore, 72 cl::cat(PollyCategory)); 73 74 static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm", 75 cl::desc("Dump the kernel assembly code"), 76 cl::Hidden, cl::init(false), cl::ZeroOrMore, 77 cl::cat(PollyCategory)); 78 79 static cl::opt<bool> FastMath("polly-acc-fastmath", 80 cl::desc("Allow unsafe math optimizations"), 81 cl::Hidden, cl::init(false), cl::ZeroOrMore, 82 cl::cat(PollyCategory)); 83 static cl::opt<bool> SharedMemory("polly-acc-use-shared", 84 cl::desc("Use shared memory"), cl::Hidden, 85 cl::init(false), cl::ZeroOrMore, 86 cl::cat(PollyCategory)); 87 static cl::opt<bool> PrivateMemory("polly-acc-use-private", 88 cl::desc("Use private memory"), cl::Hidden, 89 cl::init(false), cl::ZeroOrMore, 90 cl::cat(PollyCategory)); 91 92 static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory", 93 cl::desc("Generate Host kernel code assuming" 94 " that all memory has been" 95 " declared as managed memory"), 96 cl::Hidden, cl::init(false), cl::ZeroOrMore, 97 cl::cat(PollyCategory)); 98 99 static cl::opt<bool> 100 FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure", 101 cl::desc("Fail and generate a backtrace if" 102 " verifyModule fails on the GPU " 103 " kernel module."), 104 cl::Hidden, cl::init(false), cl::ZeroOrMore, 105 cl::cat(PollyCategory)); 106 107 static cl::opt<std::string> CUDALibDevice( 108 "polly-acc-libdevice", cl::desc("Path to CUDA libdevice"), cl::Hidden, 109 cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"), 110 cl::ZeroOrMore, cl::cat(PollyCategory)); 111 112 static cl::opt<std::string> 113 CudaVersion("polly-acc-cuda-version", 114 cl::desc("The CUDA version to compile for"), cl::Hidden, 115 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); 116 117 static cl::opt<int> 118 MinCompute("polly-acc-mincompute", 119 cl::desc("Minimal number of compute statements to run on GPU."), 120 cl::Hidden, cl::init(10 * 512 * 512)); 121 122 /// Return a unique name for a Scop, which is the scop region with the 123 /// function name. 124 std::string getUniqueScopName(const Scop *S) { 125 return "Scop Region: " + S->getNameStr() + 126 " | Function: " + std::string(S->getFunction().getName()); 127 } 128 129 /// Used to store information PPCG wants for kills. This information is 130 /// used by live range reordering. 131 /// 132 /// @see computeLiveRangeReordering 133 /// @see GPUNodeBuilder::createPPCGScop 134 /// @see GPUNodeBuilder::createPPCGProg 135 struct MustKillsInfo { 136 /// Collection of all kill statements that will be sequenced at the end of 137 /// PPCGScop->schedule. 138 /// 139 /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` 140 /// which merges schedules in *arbitrary* order. 141 /// (we don't care about the order of the kills anyway). 142 isl::schedule KillsSchedule; 143 /// Map from kill statement instances to scalars that need to be 144 /// killed. 145 /// 146 /// We currently derive kill information for: 147 /// 1. phi nodes. PHI nodes are not alive outside the scop and can 148 /// consequently all be killed. 149 /// 2. Scalar arrays that are not used outside the Scop. This is 150 /// checked by `isScalarUsesContainedInScop`. 151 /// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } 152 isl::union_map TaggedMustKills; 153 154 /// Tagged must kills stripped of the tags. 155 /// [params] -> { Stmt_phantom[] -> scalar_to_kill[] } 156 isl::union_map MustKills; 157 158 MustKillsInfo() : KillsSchedule(nullptr) {} 159 }; 160 161 /// Check if SAI's uses are entirely contained within Scop S. 162 /// If a scalar is used only with a Scop, we are free to kill it, as no data 163 /// can flow in/out of the value any more. 164 /// @see computeMustKillsInfo 165 static bool isScalarUsesContainedInScop(const Scop &S, 166 const ScopArrayInfo *SAI) { 167 assert(SAI->isValueKind() && "this function only deals with scalars." 168 " Dealing with arrays required alias analysis"); 169 170 const Region &R = S.getRegion(); 171 for (User *U : SAI->getBasePtr()->users()) { 172 Instruction *I = dyn_cast<Instruction>(U); 173 assert(I && "invalid user of scop array info"); 174 if (!R.contains(I)) 175 return false; 176 } 177 return true; 178 } 179 180 /// Compute must-kills needed to enable live range reordering with PPCG. 181 /// 182 /// @params S The Scop to compute live range reordering information 183 /// @returns live range reordering information that can be used to setup 184 /// PPCG. 185 static MustKillsInfo computeMustKillsInfo(const Scop &S) { 186 const isl::space ParamSpace(isl::manage(S.getParamSpace())); 187 MustKillsInfo Info; 188 189 // 1. Collect all ScopArrayInfo that satisfy *any* of the criteria: 190 // 1.1 phi nodes in scop. 191 // 1.2 scalars that are only used within the scop 192 SmallVector<isl::id, 4> KillMemIds; 193 for (ScopArrayInfo *SAI : S.arrays()) { 194 if (SAI->isPHIKind() || 195 (SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI))) 196 KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release())); 197 } 198 199 Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace)); 200 Info.MustKills = isl::union_map::empty(isl::space(ParamSpace)); 201 202 // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the 203 // schedule: 204 // - filter: "[control] -> { }" 205 // So, we choose to not create this to keep the output a little nicer, 206 // at the cost of some code complexity. 207 Info.KillsSchedule = nullptr; 208 209 for (isl::id &ToKillId : KillMemIds) { 210 isl::id KillStmtId = isl::id::alloc( 211 S.getIslCtx(), 212 std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr); 213 214 // NOTE: construction of tagged_must_kill: 215 // 2. We need to construct a map: 216 // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] } 217 // To construct this, we use `isl_map_domain_product` on 2 maps`: 218 // 2a. StmtToScalar: 219 // [param] -> { Stmt_phantom[] -> scalar_to_kill[] } 220 // 2b. PhantomRefToScalar: 221 // [param] -> { ref_phantom[] -> scalar_to_kill[] } 222 // 223 // Combining these with `isl_map_domain_product` gives us 224 // TaggedMustKill: 225 // [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } 226 227 // 2a. [param] -> { Stmt[] -> scalar_to_kill[] } 228 isl::map StmtToScalar = isl::map::universe(isl::space(ParamSpace)); 229 StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId)); 230 StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId)); 231 232 isl::id PhantomRefId = isl::id::alloc( 233 S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(), 234 nullptr); 235 236 // 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] } 237 isl::map PhantomRefToScalar = isl::map::universe(isl::space(ParamSpace)); 238 PhantomRefToScalar = 239 PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId); 240 PhantomRefToScalar = 241 PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId); 242 243 // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] } 244 isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar); 245 Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill); 246 247 // 2. [param] -> { Stmt[] -> scalar_to_kill[] } 248 Info.MustKills = Info.TaggedMustKills.domain_factor_domain(); 249 250 // 3. Create the kill schedule of the form: 251 // "[param] -> { Stmt_phantom[] }" 252 // Then add this to Info.KillsSchedule. 253 isl::space KillStmtSpace = ParamSpace; 254 KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId); 255 isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace); 256 257 isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain); 258 if (Info.KillsSchedule) 259 Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule); 260 else 261 Info.KillsSchedule = KillSchedule; 262 } 263 264 return Info; 265 } 266 267 /// Create the ast expressions for a ScopStmt. 268 /// 269 /// This function is a callback for to generate the ast expressions for each 270 /// of the scheduled ScopStmts. 271 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 272 void *StmtT, __isl_take isl_ast_build *Build_C, 273 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 274 isl_id *Id, void *User), 275 void *UserIndex, 276 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 277 void *UserExpr) { 278 279 ScopStmt *Stmt = (ScopStmt *)StmtT; 280 281 if (!Stmt || !Build_C) 282 return NULL; 283 284 isl::ast_build Build = isl::manage(isl_ast_build_copy(Build_C)); 285 isl::ctx Ctx = Build.get_ctx(); 286 isl::id_to_ast_expr RefToExpr = isl::id_to_ast_expr::alloc(Ctx, 0); 287 288 for (MemoryAccess *Acc : *Stmt) { 289 isl::map AddrFunc = Acc->getAddressFunction(); 290 AddrFunc = AddrFunc.intersect_domain(isl::manage(Stmt->getDomain())); 291 292 isl::id RefId = Acc->getId(); 293 isl::pw_multi_aff PMA = isl::pw_multi_aff::from_map(AddrFunc); 294 295 isl::multi_pw_aff MPA = isl::multi_pw_aff(PMA); 296 MPA = MPA.coalesce(); 297 MPA = isl::manage(FunctionIndex(MPA.release(), RefId.get(), UserIndex)); 298 299 isl::ast_expr Access = Build.access_from(MPA); 300 Access = isl::manage(FunctionExpr(Access.release(), RefId.get(), UserExpr)); 301 RefToExpr = RefToExpr.set(RefId, Access); 302 } 303 304 return RefToExpr.release(); 305 } 306 307 /// Given a LLVM Type, compute its size in bytes, 308 static int computeSizeInBytes(const Type *T) { 309 int bytes = T->getPrimitiveSizeInBits() / 8; 310 if (bytes == 0) 311 bytes = T->getScalarSizeInBits() / 8; 312 return bytes; 313 } 314 315 /// Generate code for a GPU specific isl AST. 316 /// 317 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 318 /// generates code for general-purpose AST nodes, with special functionality 319 /// for generating GPU specific user nodes. 320 /// 321 /// @see GPUNodeBuilder::createUser 322 class GPUNodeBuilder : public IslNodeBuilder { 323 public: 324 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, 325 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 326 DominatorTree &DT, Scop &S, BasicBlock *StartBlock, 327 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch) 328 : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock), 329 Prog(Prog), Runtime(Runtime), Arch(Arch) { 330 getExprBuilder().setIDToSAI(&IDToSAI); 331 } 332 333 /// Create after-run-time-check initialization code. 334 void initializeAfterRTH(); 335 336 /// Finalize the generated scop. 337 virtual void finalize(); 338 339 /// Track if the full build process was successful. 340 /// 341 /// This value is set to false, if throughout the build process an error 342 /// occurred which prevents us from generating valid GPU code. 343 bool BuildSuccessful = true; 344 345 /// The maximal number of loops surrounding a sequential kernel. 346 unsigned DeepestSequential = 0; 347 348 /// The maximal number of loops surrounding a parallel kernel. 349 unsigned DeepestParallel = 0; 350 351 /// Return the name to set for the ptx_kernel. 352 std::string getKernelFuncName(int Kernel_id); 353 354 private: 355 /// A vector of array base pointers for which a new ScopArrayInfo was created. 356 /// 357 /// This vector is used to delete the ScopArrayInfo when it is not needed any 358 /// more. 359 std::vector<Value *> LocalArrays; 360 361 /// A map from ScopArrays to their corresponding device allocations. 362 std::map<ScopArrayInfo *, Value *> DeviceAllocations; 363 364 /// The current GPU context. 365 Value *GPUContext; 366 367 /// The set of isl_ids allocated in the kernel 368 std::vector<isl_id *> KernelIds; 369 370 /// A module containing GPU code. 371 /// 372 /// This pointer is only set in case we are currently generating GPU code. 373 std::unique_ptr<Module> GPUModule; 374 375 /// The GPU program we generate code for. 376 gpu_prog *Prog; 377 378 /// The GPU Runtime implementation to use (OpenCL or CUDA). 379 GPURuntime Runtime; 380 381 /// The GPU Architecture to target. 382 GPUArch Arch; 383 384 /// Class to free isl_ids. 385 class IslIdDeleter { 386 public: 387 void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 388 }; 389 390 /// A set containing all isl_ids allocated in a GPU kernel. 391 /// 392 /// By releasing this set all isl_ids will be freed. 393 std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 394 395 IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 396 397 /// Create code for user-defined AST nodes. 398 /// 399 /// These AST nodes can be of type: 400 /// 401 /// - ScopStmt: A computational statement (TODO) 402 /// - Kernel: A GPU kernel call (TODO) 403 /// - Data-Transfer: A GPU <-> CPU data-transfer 404 /// - In-kernel synchronization 405 /// - In-kernel memory copy statement 406 /// 407 /// @param UserStmt The ast node to generate code for. 408 virtual void createUser(__isl_take isl_ast_node *UserStmt); 409 410 enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; 411 412 /// Create code for a data transfer statement 413 /// 414 /// @param TransferStmt The data transfer statement. 415 /// @param Direction The direction in which to transfer data. 416 void createDataTransfer(__isl_take isl_ast_node *TransferStmt, 417 enum DataDirection Direction); 418 419 /// Find llvm::Values referenced in GPU kernel. 420 /// 421 /// @param Kernel The kernel to scan for llvm::Values 422 /// 423 /// @returns A pair, whose first element contains the set of values 424 /// referenced by the kernel, and whose second element contains the 425 /// set of functions referenced by the kernel. All functions in the 426 /// second set satisfy isValidFunctionInKernel. 427 std::pair<SetVector<Value *>, SetVector<Function *>> 428 getReferencesInKernel(ppcg_kernel *Kernel); 429 430 /// Compute the sizes of the execution grid for a given kernel. 431 /// 432 /// @param Kernel The kernel to compute grid sizes for. 433 /// 434 /// @returns A tuple with grid sizes for X and Y dimension 435 std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel); 436 437 /// Creates a array that can be sent to the kernel on the device using a 438 /// host pointer. This is required for managed memory, when we directly send 439 /// host pointers to the device. 440 /// \note 441 /// This is to be used only with managed memory 442 Value *getOrCreateManagedDeviceArray(gpu_array_info *Array, 443 ScopArrayInfo *ArrayInfo); 444 445 /// Compute the sizes of the thread blocks for a given kernel. 446 /// 447 /// @param Kernel The kernel to compute thread block sizes for. 448 /// 449 /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. 450 std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel); 451 452 /// Store a specific kernel launch parameter in the array of kernel launch 453 /// parameters. 454 /// 455 /// @param Parameters The list of parameters in which to store. 456 /// @param Param The kernel launch parameter to store. 457 /// @param Index The index in the parameter list, at which to store the 458 /// parameter. 459 void insertStoreParameter(Instruction *Parameters, Instruction *Param, 460 int Index); 461 462 /// Create kernel launch parameters. 463 /// 464 /// @param Kernel The kernel to create parameters for. 465 /// @param F The kernel function that has been created. 466 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 467 /// 468 /// @returns A stack allocated array with pointers to the parameter 469 /// values that are passed to the kernel. 470 Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, 471 SetVector<Value *> SubtreeValues); 472 473 /// Create declarations for kernel variable. 474 /// 475 /// This includes shared memory declarations. 476 /// 477 /// @param Kernel The kernel definition to create variables for. 478 /// @param FN The function into which to generate the variables. 479 void createKernelVariables(ppcg_kernel *Kernel, Function *FN); 480 481 /// Add CUDA annotations to module. 482 /// 483 /// Add a set of CUDA annotations that declares the maximal block dimensions 484 /// that will be used to execute the CUDA kernel. This allows the NVIDIA 485 /// PTX compiler to bound the number of allocated registers to ensure the 486 /// resulting kernel is known to run with up to as many block dimensions 487 /// as specified here. 488 /// 489 /// @param M The module to add the annotations to. 490 /// @param BlockDimX The size of block dimension X. 491 /// @param BlockDimY The size of block dimension Y. 492 /// @param BlockDimZ The size of block dimension Z. 493 void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, 494 Value *BlockDimZ); 495 496 /// Create GPU kernel. 497 /// 498 /// Code generate the kernel described by @p KernelStmt. 499 /// 500 /// @param KernelStmt The ast node to generate kernel code for. 501 void createKernel(__isl_take isl_ast_node *KernelStmt); 502 503 /// Generate code that computes the size of an array. 504 /// 505 /// @param Array The array for which to compute a size. 506 Value *getArraySize(gpu_array_info *Array); 507 508 /// Generate code to compute the minimal offset at which an array is accessed. 509 /// 510 /// The offset of an array is the minimal array location accessed in a scop. 511 /// 512 /// Example: 513 /// 514 /// for (long i = 0; i < 100; i++) 515 /// A[i + 42] += ... 516 /// 517 /// getArrayOffset(A) results in 42. 518 /// 519 /// @param Array The array for which to compute the offset. 520 /// @returns An llvm::Value that contains the offset of the array. 521 Value *getArrayOffset(gpu_array_info *Array); 522 523 /// Prepare the kernel arguments for kernel code generation 524 /// 525 /// @param Kernel The kernel to generate code for. 526 /// @param FN The function created for the kernel. 527 void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); 528 529 /// Create kernel function. 530 /// 531 /// Create a kernel function located in a newly created module that can serve 532 /// as target for device code generation. Set the Builder to point to the 533 /// start block of this newly created function. 534 /// 535 /// @param Kernel The kernel to generate code for. 536 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 537 /// @param SubtreeFunctions The set of llvm::Functions referenced by this 538 /// kernel. 539 void createKernelFunction(ppcg_kernel *Kernel, 540 SetVector<Value *> &SubtreeValues, 541 SetVector<Function *> &SubtreeFunctions); 542 543 /// Create the declaration of a kernel function. 544 /// 545 /// The kernel function takes as arguments: 546 /// 547 /// - One i8 pointer for each external array reference used in the kernel. 548 /// - Host iterators 549 /// - Parameters 550 /// - Other LLVM Value references (TODO) 551 /// 552 /// @param Kernel The kernel to generate the function declaration for. 553 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 554 /// 555 /// @returns The newly declared function. 556 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 557 SetVector<Value *> &SubtreeValues); 558 559 /// Insert intrinsic functions to obtain thread and block ids. 560 /// 561 /// @param The kernel to generate the intrinsic functions for. 562 void insertKernelIntrinsics(ppcg_kernel *Kernel); 563 564 /// Insert function calls to retrieve the SPIR group/local ids. 565 /// 566 /// @param The kernel to generate the function calls for. 567 void insertKernelCallsSPIR(ppcg_kernel *Kernel); 568 569 /// Setup the creation of functions referenced by the GPU kernel. 570 /// 571 /// 1. Create new function declarations in GPUModule which are the same as 572 /// SubtreeFunctions. 573 /// 574 /// 2. Populate IslNodeBuilder::ValueMap with mappings from 575 /// old functions (that come from the original module) to new functions 576 /// (that are created within GPUModule). That way, we generate references 577 /// to the correct function (in GPUModule) in BlockGenerator. 578 /// 579 /// @see IslNodeBuilder::ValueMap 580 /// @see BlockGenerator::GlobalMap 581 /// @see BlockGenerator::getNewValue 582 /// @see GPUNodeBuilder::getReferencesInKernel. 583 /// 584 /// @param SubtreeFunctions The set of llvm::Functions referenced by 585 /// this kernel. 586 void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions); 587 588 /// Create a global-to-shared or shared-to-global copy statement. 589 /// 590 /// @param CopyStmt The copy statement to generate code for 591 void createKernelCopy(ppcg_kernel_stmt *CopyStmt); 592 593 /// Create code for a ScopStmt called in @p Expr. 594 /// 595 /// @param Expr The expression containing the call. 596 /// @param KernelStmt The kernel statement referenced in the call. 597 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 598 599 /// Create an in-kernel synchronization call. 600 void createKernelSync(); 601 602 /// Create a PTX assembly string for the current GPU kernel. 603 /// 604 /// @returns A string containing the corresponding PTX assembly code. 605 std::string createKernelASM(); 606 607 /// Remove references from the dominator tree to the kernel function @p F. 608 /// 609 /// @param F The function to remove references to. 610 void clearDominators(Function *F); 611 612 /// Remove references from scalar evolution to the kernel function @p F. 613 /// 614 /// @param F The function to remove references to. 615 void clearScalarEvolution(Function *F); 616 617 /// Remove references from loop info to the kernel function @p F. 618 /// 619 /// @param F The function to remove references to. 620 void clearLoops(Function *F); 621 622 /// Check if the scop requires to be linked with CUDA's libdevice. 623 bool requiresCUDALibDevice(); 624 625 /// Link with the NVIDIA libdevice library (if needed and available). 626 void addCUDALibDevice(); 627 628 /// Finalize the generation of the kernel function. 629 /// 630 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 631 /// dump its IR to stderr. 632 /// 633 /// @returns The Assembly string of the kernel. 634 std::string finalizeKernelFunction(); 635 636 /// Finalize the generation of the kernel arguments. 637 /// 638 /// This function ensures that not-read-only scalars used in a kernel are 639 /// stored back to the global memory location they are backed with before 640 /// the kernel terminates. 641 /// 642 /// @params Kernel The kernel to finalize kernel arguments for. 643 void finalizeKernelArguments(ppcg_kernel *Kernel); 644 645 /// Create code that allocates memory to store arrays on device. 646 void allocateDeviceArrays(); 647 648 /// Free all allocated device arrays. 649 void freeDeviceArrays(); 650 651 /// Create a call to initialize the GPU context. 652 /// 653 /// @returns A pointer to the newly initialized context. 654 Value *createCallInitContext(); 655 656 /// Create a call to get the device pointer for a kernel allocation. 657 /// 658 /// @param Allocation The Polly GPU allocation 659 /// 660 /// @returns The device parameter corresponding to this allocation. 661 Value *createCallGetDevicePtr(Value *Allocation); 662 663 /// Create a call to free the GPU context. 664 /// 665 /// @param Context A pointer to an initialized GPU context. 666 void createCallFreeContext(Value *Context); 667 668 /// Create a call to allocate memory on the device. 669 /// 670 /// @param Size The size of memory to allocate 671 /// 672 /// @returns A pointer that identifies this allocation. 673 Value *createCallAllocateMemoryForDevice(Value *Size); 674 675 /// Create a call to free a device array. 676 /// 677 /// @param Array The device array to free. 678 void createCallFreeDeviceMemory(Value *Array); 679 680 /// Create a call to copy data from host to device. 681 /// 682 /// @param HostPtr A pointer to the host data that should be copied. 683 /// @param DevicePtr A device pointer specifying the location to copy to. 684 void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr, 685 Value *Size); 686 687 /// Create a call to copy data from device to host. 688 /// 689 /// @param DevicePtr A pointer to the device data that should be copied. 690 /// @param HostPtr A host pointer specifying the location to copy to. 691 void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, 692 Value *Size); 693 694 /// Create a call to synchronize Host & Device. 695 /// \note 696 /// This is to be used only with managed memory. 697 void createCallSynchronizeDevice(); 698 699 /// Create a call to get a kernel from an assembly string. 700 /// 701 /// @param Buffer The string describing the kernel. 702 /// @param Entry The name of the kernel function to call. 703 /// 704 /// @returns A pointer to a kernel object 705 Value *createCallGetKernel(Value *Buffer, Value *Entry); 706 707 /// Create a call to free a GPU kernel. 708 /// 709 /// @param GPUKernel THe kernel to free. 710 void createCallFreeKernel(Value *GPUKernel); 711 712 /// Create a call to launch a GPU kernel. 713 /// 714 /// @param GPUKernel The kernel to launch. 715 /// @param GridDimX The size of the first grid dimension. 716 /// @param GridDimY The size of the second grid dimension. 717 /// @param GridBlockX The size of the first block dimension. 718 /// @param GridBlockY The size of the second block dimension. 719 /// @param GridBlockZ The size of the third block dimension. 720 /// @param Parameters A pointer to an array that contains itself pointers to 721 /// the parameter values passed for each kernel argument. 722 void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 723 Value *GridDimY, Value *BlockDimX, 724 Value *BlockDimY, Value *BlockDimZ, 725 Value *Parameters); 726 }; 727 728 std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) { 729 return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" + 730 std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id); 731 } 732 733 void GPUNodeBuilder::initializeAfterRTH() { 734 BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(), 735 &*Builder.GetInsertPoint(), &DT, &LI); 736 NewBB->setName("polly.acc.initialize"); 737 Builder.SetInsertPoint(&NewBB->front()); 738 739 GPUContext = createCallInitContext(); 740 741 if (!ManagedMemory) 742 allocateDeviceArrays(); 743 } 744 745 void GPUNodeBuilder::finalize() { 746 if (!ManagedMemory) 747 freeDeviceArrays(); 748 749 createCallFreeContext(GPUContext); 750 IslNodeBuilder::finalize(); 751 } 752 753 void GPUNodeBuilder::allocateDeviceArrays() { 754 assert(!ManagedMemory && "Managed memory will directly send host pointers " 755 "to the kernel. There is no need for device arrays"); 756 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 757 758 for (int i = 0; i < Prog->n_array; ++i) { 759 gpu_array_info *Array = &Prog->array[i]; 760 auto *ScopArray = (ScopArrayInfo *)Array->user; 761 std::string DevArrayName("p_dev_array_"); 762 DevArrayName.append(Array->name); 763 764 Value *ArraySize = getArraySize(Array); 765 Value *Offset = getArrayOffset(Array); 766 if (Offset) 767 ArraySize = Builder.CreateSub( 768 ArraySize, 769 Builder.CreateMul(Offset, 770 Builder.getInt64(ScopArray->getElemSizeInBytes()))); 771 Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 772 DevArray->setName(DevArrayName); 773 DeviceAllocations[ScopArray] = DevArray; 774 } 775 776 isl_ast_build_free(Build); 777 } 778 779 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, 780 Value *BlockDimY, Value *BlockDimZ) { 781 auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); 782 783 for (auto &F : *M) { 784 if (F.getCallingConv() != CallingConv::PTX_Kernel) 785 continue; 786 787 Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; 788 789 Metadata *Elements[] = { 790 ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), 791 ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), 792 ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), 793 ValueAsMetadata::get(V[2]), 794 }; 795 MDNode *Node = MDNode::get(M->getContext(), Elements); 796 AnnotationNode->addOperand(Node); 797 } 798 } 799 800 void GPUNodeBuilder::freeDeviceArrays() { 801 assert(!ManagedMemory && "Managed memory does not use device arrays"); 802 for (auto &Array : DeviceAllocations) 803 createCallFreeDeviceMemory(Array.second); 804 } 805 806 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { 807 const char *Name = "polly_getKernel"; 808 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 809 Function *F = M->getFunction(Name); 810 811 // If F is not available, declare it. 812 if (!F) { 813 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 814 std::vector<Type *> Args; 815 Args.push_back(Builder.getInt8PtrTy()); 816 Args.push_back(Builder.getInt8PtrTy()); 817 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 818 F = Function::Create(Ty, Linkage, Name, M); 819 } 820 821 return Builder.CreateCall(F, {Buffer, Entry}); 822 } 823 824 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { 825 const char *Name = "polly_getDevicePtr"; 826 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 827 Function *F = M->getFunction(Name); 828 829 // If F is not available, declare it. 830 if (!F) { 831 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 832 std::vector<Type *> Args; 833 Args.push_back(Builder.getInt8PtrTy()); 834 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 835 F = Function::Create(Ty, Linkage, Name, M); 836 } 837 838 return Builder.CreateCall(F, {Allocation}); 839 } 840 841 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 842 Value *GridDimY, Value *BlockDimX, 843 Value *BlockDimY, Value *BlockDimZ, 844 Value *Parameters) { 845 const char *Name = "polly_launchKernel"; 846 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 847 Function *F = M->getFunction(Name); 848 849 // If F is not available, declare it. 850 if (!F) { 851 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 852 std::vector<Type *> Args; 853 Args.push_back(Builder.getInt8PtrTy()); 854 Args.push_back(Builder.getInt32Ty()); 855 Args.push_back(Builder.getInt32Ty()); 856 Args.push_back(Builder.getInt32Ty()); 857 Args.push_back(Builder.getInt32Ty()); 858 Args.push_back(Builder.getInt32Ty()); 859 Args.push_back(Builder.getInt8PtrTy()); 860 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 861 F = Function::Create(Ty, Linkage, Name, M); 862 } 863 864 Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 865 BlockDimZ, Parameters}); 866 } 867 868 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { 869 const char *Name = "polly_freeKernel"; 870 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 871 Function *F = M->getFunction(Name); 872 873 // If F is not available, declare it. 874 if (!F) { 875 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 876 std::vector<Type *> Args; 877 Args.push_back(Builder.getInt8PtrTy()); 878 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 879 F = Function::Create(Ty, Linkage, Name, M); 880 } 881 882 Builder.CreateCall(F, {GPUKernel}); 883 } 884 885 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 886 assert(!ManagedMemory && "Managed memory does not allocate or free memory " 887 "for device"); 888 const char *Name = "polly_freeDeviceMemory"; 889 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 890 Function *F = M->getFunction(Name); 891 892 // If F is not available, declare it. 893 if (!F) { 894 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 895 std::vector<Type *> Args; 896 Args.push_back(Builder.getInt8PtrTy()); 897 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 898 F = Function::Create(Ty, Linkage, Name, M); 899 } 900 901 Builder.CreateCall(F, {Array}); 902 } 903 904 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 905 assert(!ManagedMemory && "Managed memory does not allocate or free memory " 906 "for device"); 907 const char *Name = "polly_allocateMemoryForDevice"; 908 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 909 Function *F = M->getFunction(Name); 910 911 // If F is not available, declare it. 912 if (!F) { 913 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 914 std::vector<Type *> Args; 915 Args.push_back(Builder.getInt64Ty()); 916 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 917 F = Function::Create(Ty, Linkage, Name, M); 918 } 919 920 return Builder.CreateCall(F, {Size}); 921 } 922 923 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, 924 Value *DeviceData, 925 Value *Size) { 926 assert(!ManagedMemory && "Managed memory does not transfer memory between " 927 "device and host"); 928 const char *Name = "polly_copyFromHostToDevice"; 929 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 930 Function *F = M->getFunction(Name); 931 932 // If F is not available, declare it. 933 if (!F) { 934 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 935 std::vector<Type *> Args; 936 Args.push_back(Builder.getInt8PtrTy()); 937 Args.push_back(Builder.getInt8PtrTy()); 938 Args.push_back(Builder.getInt64Ty()); 939 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 940 F = Function::Create(Ty, Linkage, Name, M); 941 } 942 943 Builder.CreateCall(F, {HostData, DeviceData, Size}); 944 } 945 946 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, 947 Value *HostData, 948 Value *Size) { 949 assert(!ManagedMemory && "Managed memory does not transfer memory between " 950 "device and host"); 951 const char *Name = "polly_copyFromDeviceToHost"; 952 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 953 Function *F = M->getFunction(Name); 954 955 // If F is not available, declare it. 956 if (!F) { 957 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 958 std::vector<Type *> Args; 959 Args.push_back(Builder.getInt8PtrTy()); 960 Args.push_back(Builder.getInt8PtrTy()); 961 Args.push_back(Builder.getInt64Ty()); 962 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 963 F = Function::Create(Ty, Linkage, Name, M); 964 } 965 966 Builder.CreateCall(F, {DeviceData, HostData, Size}); 967 } 968 969 void GPUNodeBuilder::createCallSynchronizeDevice() { 970 assert(ManagedMemory && "explicit synchronization is only necessary for " 971 "managed memory"); 972 const char *Name = "polly_synchronizeDevice"; 973 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 974 Function *F = M->getFunction(Name); 975 976 // If F is not available, declare it. 977 if (!F) { 978 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 979 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); 980 F = Function::Create(Ty, Linkage, Name, M); 981 } 982 983 Builder.CreateCall(F); 984 } 985 986 Value *GPUNodeBuilder::createCallInitContext() { 987 const char *Name; 988 989 switch (Runtime) { 990 case GPURuntime::CUDA: 991 Name = "polly_initContextCUDA"; 992 break; 993 case GPURuntime::OpenCL: 994 Name = "polly_initContextCL"; 995 break; 996 } 997 998 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 999 Function *F = M->getFunction(Name); 1000 1001 // If F is not available, declare it. 1002 if (!F) { 1003 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 1004 std::vector<Type *> Args; 1005 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 1006 F = Function::Create(Ty, Linkage, Name, M); 1007 } 1008 1009 return Builder.CreateCall(F, {}); 1010 } 1011 1012 void GPUNodeBuilder::createCallFreeContext(Value *Context) { 1013 const char *Name = "polly_freeContext"; 1014 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1015 Function *F = M->getFunction(Name); 1016 1017 // If F is not available, declare it. 1018 if (!F) { 1019 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 1020 std::vector<Type *> Args; 1021 Args.push_back(Builder.getInt8PtrTy()); 1022 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 1023 F = Function::Create(Ty, Linkage, Name, M); 1024 } 1025 1026 Builder.CreateCall(F, {Context}); 1027 } 1028 1029 /// Check if one string is a prefix of another. 1030 /// 1031 /// @param String The string in which to look for the prefix. 1032 /// @param Prefix The prefix to look for. 1033 static bool isPrefix(std::string String, std::string Prefix) { 1034 return String.find(Prefix) == 0; 1035 } 1036 1037 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 1038 isl::ast_build Build = 1039 isl::ast_build::from_context(isl::manage(S.getContext())); 1040 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 1041 1042 if (!gpu_array_is_scalar(Array)) { 1043 isl::multi_pw_aff ArrayBound = 1044 isl::manage(isl_multi_pw_aff_copy(Array->bound)); 1045 1046 isl::pw_aff OffsetDimZero = ArrayBound.get_pw_aff(0); 1047 isl::ast_expr Res = Build.expr_from(OffsetDimZero); 1048 1049 for (unsigned int i = 1; i < Array->n_index; i++) { 1050 isl::pw_aff Bound_I = ArrayBound.get_pw_aff(i); 1051 isl::ast_expr Expr = Build.expr_from(Bound_I); 1052 Res = Res.mul(Expr); 1053 } 1054 1055 Value *NumElements = ExprBuilder.create(Res.release()); 1056 if (NumElements->getType() != ArraySize->getType()) 1057 NumElements = Builder.CreateSExt(NumElements, ArraySize->getType()); 1058 ArraySize = Builder.CreateMul(ArraySize, NumElements); 1059 } 1060 return ArraySize; 1061 } 1062 1063 Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { 1064 if (gpu_array_is_scalar(Array)) 1065 return nullptr; 1066 1067 isl::ast_build Build = 1068 isl::ast_build::from_context(isl::manage(S.getContext())); 1069 1070 isl::set Min = isl::manage(isl_set_copy(Array->extent)).lexmin(); 1071 1072 isl::set ZeroSet = isl::set::universe(Min.get_space()); 1073 1074 for (long i = 0; i < Min.dim(isl::dim::set); i++) 1075 ZeroSet = ZeroSet.fix_si(isl::dim::set, i, 0); 1076 1077 if (Min.is_subset(ZeroSet)) { 1078 return nullptr; 1079 } 1080 1081 isl::ast_expr Result = isl::ast_expr::from_val(isl::val(Min.get_ctx(), 0)); 1082 1083 for (long i = 0; i < Min.dim(isl::dim::set); i++) { 1084 if (i > 0) { 1085 isl::pw_aff Bound_I = 1086 isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1)); 1087 isl::ast_expr BExpr = Build.expr_from(Bound_I); 1088 Result = Result.mul(BExpr); 1089 } 1090 isl::pw_aff DimMin = Min.dim_min(i); 1091 isl::ast_expr MExpr = Build.expr_from(DimMin); 1092 Result = Result.add(MExpr); 1093 } 1094 1095 return ExprBuilder.create(Result.release()); 1096 } 1097 1098 Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array, 1099 ScopArrayInfo *ArrayInfo) { 1100 1101 assert(ManagedMemory && "Only used when you wish to get a host " 1102 "pointer for sending data to the kernel, " 1103 "with managed memory"); 1104 std::map<ScopArrayInfo *, Value *>::iterator it; 1105 if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) { 1106 return it->second; 1107 } else { 1108 Value *HostPtr; 1109 1110 if (gpu_array_is_scalar(Array)) 1111 HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo); 1112 else 1113 HostPtr = ArrayInfo->getBasePtr(); 1114 HostPtr = getLatestValue(HostPtr); 1115 1116 Value *Offset = getArrayOffset(Array); 1117 if (Offset) { 1118 HostPtr = Builder.CreatePointerCast( 1119 HostPtr, ArrayInfo->getElementType()->getPointerTo()); 1120 HostPtr = Builder.CreateGEP(HostPtr, Offset); 1121 } 1122 1123 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 1124 DeviceAllocations[ArrayInfo] = HostPtr; 1125 return HostPtr; 1126 } 1127 } 1128 1129 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 1130 enum DataDirection Direction) { 1131 assert(!ManagedMemory && "Managed memory needs no data transfers"); 1132 isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); 1133 isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 1134 isl_id *Id = isl_ast_expr_get_id(Arg); 1135 auto Array = (gpu_array_info *)isl_id_get_user(Id); 1136 auto ScopArray = (ScopArrayInfo *)(Array->user); 1137 1138 Value *Size = getArraySize(Array); 1139 Value *Offset = getArrayOffset(Array); 1140 Value *DevPtr = DeviceAllocations[ScopArray]; 1141 1142 Value *HostPtr; 1143 1144 if (gpu_array_is_scalar(Array)) 1145 HostPtr = BlockGen.getOrCreateAlloca(ScopArray); 1146 else 1147 HostPtr = ScopArray->getBasePtr(); 1148 HostPtr = getLatestValue(HostPtr); 1149 1150 if (Offset) { 1151 HostPtr = Builder.CreatePointerCast( 1152 HostPtr, ScopArray->getElementType()->getPointerTo()); 1153 HostPtr = Builder.CreateGEP(HostPtr, Offset); 1154 } 1155 1156 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 1157 1158 if (Offset) { 1159 Size = Builder.CreateSub( 1160 Size, Builder.CreateMul( 1161 Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()))); 1162 } 1163 1164 if (Direction == HOST_TO_DEVICE) 1165 createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 1166 else 1167 createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 1168 1169 isl_id_free(Id); 1170 isl_ast_expr_free(Arg); 1171 isl_ast_expr_free(Expr); 1172 isl_ast_node_free(TransferStmt); 1173 } 1174 1175 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 1176 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 1177 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 1178 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 1179 isl_id_free(Id); 1180 isl_ast_expr_free(StmtExpr); 1181 1182 const char *Str = isl_id_get_name(Id); 1183 if (!strcmp(Str, "kernel")) { 1184 createKernel(UserStmt); 1185 isl_ast_expr_free(Expr); 1186 return; 1187 } 1188 if (!strcmp(Str, "init_device")) { 1189 initializeAfterRTH(); 1190 isl_ast_node_free(UserStmt); 1191 isl_ast_expr_free(Expr); 1192 return; 1193 } 1194 if (!strcmp(Str, "clear_device")) { 1195 finalize(); 1196 isl_ast_node_free(UserStmt); 1197 isl_ast_expr_free(Expr); 1198 return; 1199 } 1200 if (isPrefix(Str, "to_device")) { 1201 if (!ManagedMemory) 1202 createDataTransfer(UserStmt, HOST_TO_DEVICE); 1203 else 1204 isl_ast_node_free(UserStmt); 1205 1206 isl_ast_expr_free(Expr); 1207 return; 1208 } 1209 1210 if (isPrefix(Str, "from_device")) { 1211 if (!ManagedMemory) { 1212 createDataTransfer(UserStmt, DEVICE_TO_HOST); 1213 } else { 1214 createCallSynchronizeDevice(); 1215 isl_ast_node_free(UserStmt); 1216 } 1217 isl_ast_expr_free(Expr); 1218 return; 1219 } 1220 1221 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 1222 struct ppcg_kernel_stmt *KernelStmt = 1223 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 1224 isl_id_free(Anno); 1225 1226 switch (KernelStmt->type) { 1227 case ppcg_kernel_domain: 1228 createScopStmt(Expr, KernelStmt); 1229 isl_ast_node_free(UserStmt); 1230 return; 1231 case ppcg_kernel_copy: 1232 createKernelCopy(KernelStmt); 1233 isl_ast_expr_free(Expr); 1234 isl_ast_node_free(UserStmt); 1235 return; 1236 case ppcg_kernel_sync: 1237 createKernelSync(); 1238 isl_ast_expr_free(Expr); 1239 isl_ast_node_free(UserStmt); 1240 return; 1241 } 1242 1243 isl_ast_expr_free(Expr); 1244 isl_ast_node_free(UserStmt); 1245 return; 1246 } 1247 void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { 1248 isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); 1249 LocalIndex = isl_ast_expr_address_of(LocalIndex); 1250 Value *LocalAddr = ExprBuilder.create(LocalIndex); 1251 isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index); 1252 Index = isl_ast_expr_address_of(Index); 1253 Value *GlobalAddr = ExprBuilder.create(Index); 1254 1255 if (KernelStmt->u.c.read) { 1256 LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read"); 1257 Builder.CreateStore(Load, LocalAddr); 1258 } else { 1259 LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write"); 1260 Builder.CreateStore(Load, GlobalAddr); 1261 } 1262 } 1263 1264 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 1265 ppcg_kernel_stmt *KernelStmt) { 1266 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1267 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 1268 1269 LoopToScevMapT LTS; 1270 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 1271 1272 createSubstitutions(Expr, Stmt, LTS); 1273 1274 if (Stmt->isBlockStmt()) 1275 BlockGen.copyStmt(*Stmt, LTS, Indexes); 1276 else 1277 RegionGen.copyStmt(*Stmt, LTS, Indexes); 1278 } 1279 1280 void GPUNodeBuilder::createKernelSync() { 1281 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1282 const char *SpirName = "__gen_ocl_barrier_global"; 1283 1284 Function *Sync; 1285 1286 switch (Arch) { 1287 case GPUArch::SPIR64: 1288 case GPUArch::SPIR32: 1289 Sync = M->getFunction(SpirName); 1290 1291 // If Sync is not available, declare it. 1292 if (!Sync) { 1293 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 1294 std::vector<Type *> Args; 1295 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 1296 Sync = Function::Create(Ty, Linkage, SpirName, M); 1297 Sync->setCallingConv(CallingConv::SPIR_FUNC); 1298 } 1299 break; 1300 case GPUArch::NVPTX64: 1301 Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 1302 break; 1303 } 1304 1305 Builder.CreateCall(Sync, {}); 1306 } 1307 1308 /// Collect llvm::Values referenced from @p Node 1309 /// 1310 /// This function only applies to isl_ast_nodes that are user_nodes referring 1311 /// to a ScopStmt. All other node types are ignore. 1312 /// 1313 /// @param Node The node to collect references for. 1314 /// @param User A user pointer used as storage for the data that is collected. 1315 /// 1316 /// @returns isl_bool_true if data could be collected successfully. 1317 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 1318 if (isl_ast_node_get_type(Node) != isl_ast_node_user) 1319 return isl_bool_true; 1320 1321 isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 1322 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 1323 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 1324 const char *Str = isl_id_get_name(Id); 1325 isl_id_free(Id); 1326 isl_ast_expr_free(StmtExpr); 1327 isl_ast_expr_free(Expr); 1328 1329 if (!isPrefix(Str, "Stmt")) 1330 return isl_bool_true; 1331 1332 Id = isl_ast_node_get_annotation(Node); 1333 auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 1334 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 1335 isl_id_free(Id); 1336 1337 addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); 1338 1339 return isl_bool_true; 1340 } 1341 1342 /// A list of functions that are available in NVIDIA's libdevice. 1343 const std::set<std::string> CUDALibDeviceFunctions = { 1344 "exp", "expf", "expl", "cos", "cosf", 1345 "sqrt", "sqrtf", "copysign", "copysignf", "copysignl"}; 1346 1347 /// Return the corresponding CUDA libdevice function name for @p F. 1348 /// 1349 /// Return "" if we are not compiling for CUDA. 1350 std::string getCUDALibDeviceFuntion(Function *F) { 1351 if (CUDALibDeviceFunctions.count(F->getName())) 1352 return std::string("__nv_") + std::string(F->getName()); 1353 1354 return ""; 1355 } 1356 1357 /// Check if F is a function that we can code-generate in a GPU kernel. 1358 static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) { 1359 assert(F && "F is an invalid pointer"); 1360 // We string compare against the name of the function to allow 1361 // all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and 1362 // "llvm.copysign". 1363 const StringRef Name = F->getName(); 1364 1365 if (AllowLibDevice && getCUDALibDeviceFuntion(F).length() > 0) 1366 return true; 1367 1368 return F->isIntrinsic() && 1369 (Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") || 1370 Name.startswith("llvm.copysign")); 1371 } 1372 1373 /// Do not take `Function` as a subtree value. 1374 /// 1375 /// We try to take the reference of all subtree values and pass them along 1376 /// to the kernel from the host. Taking an address of any function and 1377 /// trying to pass along is nonsensical. Only allow `Value`s that are not 1378 /// `Function`s. 1379 static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); } 1380 1381 /// Return `Function`s from `RawSubtreeValues`. 1382 static SetVector<Function *> 1383 getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues, 1384 bool AllowCUDALibDevice) { 1385 SetVector<Function *> SubtreeFunctions; 1386 for (Value *It : RawSubtreeValues) { 1387 Function *F = dyn_cast<Function>(It); 1388 if (F) { 1389 assert(isValidFunctionInKernel(F, AllowCUDALibDevice) && 1390 "Code should have bailed out by " 1391 "this point if an invalid function " 1392 "were present in a kernel."); 1393 SubtreeFunctions.insert(F); 1394 } 1395 } 1396 return SubtreeFunctions; 1397 } 1398 1399 std::pair<SetVector<Value *>, SetVector<Function *>> 1400 GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 1401 SetVector<Value *> SubtreeValues; 1402 SetVector<const SCEV *> SCEVs; 1403 SetVector<const Loop *> Loops; 1404 SubtreeReferences References = { 1405 LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 1406 1407 for (const auto &I : IDToValue) 1408 SubtreeValues.insert(I.second); 1409 1410 isl_ast_node_foreach_descendant_top_down( 1411 Kernel->tree, collectReferencesInGPUStmt, &References); 1412 1413 for (const SCEV *Expr : SCEVs) 1414 findValues(Expr, SE, SubtreeValues); 1415 1416 for (auto &SAI : S.arrays()) 1417 SubtreeValues.remove(SAI->getBasePtr()); 1418 1419 isl_space *Space = S.getParamSpace(); 1420 for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 1421 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 1422 assert(IDToValue.count(Id)); 1423 Value *Val = IDToValue[Id]; 1424 SubtreeValues.remove(Val); 1425 isl_id_free(Id); 1426 } 1427 isl_space_free(Space); 1428 1429 for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 1430 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1431 assert(IDToValue.count(Id)); 1432 Value *Val = IDToValue[Id]; 1433 SubtreeValues.remove(Val); 1434 isl_id_free(Id); 1435 } 1436 1437 // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions 1438 // SubtreeValues. This is important, because we should not lose any 1439 // SubtreeValues in the process of constructing the 1440 // "ValidSubtree{Values, Functions} sets. Nor should the set 1441 // ValidSubtree{Values, Functions} have any common element. 1442 auto ValidSubtreeValuesIt = 1443 make_filter_range(SubtreeValues, isValidSubtreeValue); 1444 SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(), 1445 ValidSubtreeValuesIt.end()); 1446 1447 bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64; 1448 1449 SetVector<Function *> ValidSubtreeFunctions( 1450 getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice)); 1451 1452 // @see IslNodeBuilder::getReferencesInSubtree 1453 SetVector<Value *> ReplacedValues; 1454 for (Value *V : ValidSubtreeValues) { 1455 auto It = ValueMap.find(V); 1456 if (It == ValueMap.end()) 1457 ReplacedValues.insert(V); 1458 else 1459 ReplacedValues.insert(It->second); 1460 } 1461 return std::make_pair(ReplacedValues, ValidSubtreeFunctions); 1462 } 1463 1464 void GPUNodeBuilder::clearDominators(Function *F) { 1465 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 1466 std::vector<BasicBlock *> Nodes; 1467 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 1468 Nodes.push_back(I->getBlock()); 1469 1470 for (BasicBlock *BB : Nodes) 1471 DT.eraseNode(BB); 1472 } 1473 1474 void GPUNodeBuilder::clearScalarEvolution(Function *F) { 1475 for (BasicBlock &BB : *F) { 1476 Loop *L = LI.getLoopFor(&BB); 1477 if (L) 1478 SE.forgetLoop(L); 1479 } 1480 } 1481 1482 void GPUNodeBuilder::clearLoops(Function *F) { 1483 for (BasicBlock &BB : *F) { 1484 Loop *L = LI.getLoopFor(&BB); 1485 if (L) 1486 SE.forgetLoop(L); 1487 LI.removeBlock(&BB); 1488 } 1489 } 1490 1491 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { 1492 std::vector<Value *> Sizes; 1493 isl::ast_build Context = 1494 isl::ast_build::from_context(isl::manage(S.getContext())); 1495 1496 isl::multi_pw_aff GridSizePwAffs = 1497 isl::manage(isl_multi_pw_aff_copy(Kernel->grid_size)); 1498 for (long i = 0; i < Kernel->n_grid; i++) { 1499 isl::pw_aff Size = GridSizePwAffs.get_pw_aff(i); 1500 isl::ast_expr GridSize = Context.expr_from(Size); 1501 Value *Res = ExprBuilder.create(GridSize.release()); 1502 Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 1503 Sizes.push_back(Res); 1504 } 1505 1506 for (long i = Kernel->n_grid; i < 3; i++) 1507 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 1508 1509 return std::make_tuple(Sizes[0], Sizes[1]); 1510 } 1511 1512 std::tuple<Value *, Value *, Value *> 1513 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { 1514 std::vector<Value *> Sizes; 1515 1516 for (long i = 0; i < Kernel->n_block; i++) { 1517 Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); 1518 Sizes.push_back(Res); 1519 } 1520 1521 for (long i = Kernel->n_block; i < 3; i++) 1522 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 1523 1524 return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); 1525 } 1526 1527 void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters, 1528 Instruction *Param, int Index) { 1529 Value *Slot = Builder.CreateGEP( 1530 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1531 Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1532 Builder.CreateStore(ParamTyped, Slot); 1533 } 1534 1535 Value * 1536 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, 1537 SetVector<Value *> SubtreeValues) { 1538 const int NumArgs = F->arg_size(); 1539 std::vector<int> ArgSizes(NumArgs); 1540 1541 Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); 1542 1543 BasicBlock *EntryBlock = 1544 &Builder.GetInsertBlock()->getParent()->getEntryBlock(); 1545 auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace(); 1546 std::string Launch = "polly_launch_" + std::to_string(Kernel->id); 1547 Instruction *Parameters = new AllocaInst( 1548 ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator()); 1549 1550 int Index = 0; 1551 for (long i = 0; i < Prog->n_array; i++) { 1552 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1553 continue; 1554 1555 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1556 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); 1557 1558 ArgSizes[Index] = SAI->getElemSizeInBytes(); 1559 1560 Value *DevArray = nullptr; 1561 if (ManagedMemory) { 1562 DevArray = getOrCreateManagedDeviceArray( 1563 &Prog->array[i], const_cast<ScopArrayInfo *>(SAI)); 1564 } else { 1565 DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)]; 1566 DevArray = createCallGetDevicePtr(DevArray); 1567 } 1568 assert(DevArray != nullptr && "Array to be offloaded to device not " 1569 "initialized"); 1570 Value *Offset = getArrayOffset(&Prog->array[i]); 1571 1572 if (Offset) { 1573 DevArray = Builder.CreatePointerCast( 1574 DevArray, SAI->getElementType()->getPointerTo()); 1575 DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset)); 1576 DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy()); 1577 } 1578 Value *Slot = Builder.CreateGEP( 1579 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1580 1581 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1582 Value *ValPtr = nullptr; 1583 if (ManagedMemory) 1584 ValPtr = DevArray; 1585 else 1586 ValPtr = BlockGen.getOrCreateAlloca(SAI); 1587 1588 assert(ValPtr != nullptr && "ValPtr that should point to a valid object" 1589 " to be stored into Parameters"); 1590 Value *ValPtrCast = 1591 Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy()); 1592 Builder.CreateStore(ValPtrCast, Slot); 1593 } else { 1594 Instruction *Param = 1595 new AllocaInst(Builder.getInt8PtrTy(), AddressSpace, 1596 Launch + "_param_" + std::to_string(Index), 1597 EntryBlock->getTerminator()); 1598 Builder.CreateStore(DevArray, Param); 1599 Value *ParamTyped = 1600 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1601 Builder.CreateStore(ParamTyped, Slot); 1602 } 1603 Index++; 1604 } 1605 1606 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1607 1608 for (long i = 0; i < NumHostIters; i++) { 1609 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1610 Value *Val = IDToValue[Id]; 1611 isl_id_free(Id); 1612 1613 ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1614 1615 Instruction *Param = 1616 new AllocaInst(Val->getType(), AddressSpace, 1617 Launch + "_param_" + std::to_string(Index), 1618 EntryBlock->getTerminator()); 1619 Builder.CreateStore(Val, Param); 1620 insertStoreParameter(Parameters, Param, Index); 1621 Index++; 1622 } 1623 1624 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1625 1626 for (long i = 0; i < NumVars; i++) { 1627 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1628 Value *Val = IDToValue[Id]; 1629 if (ValueMap.count(Val)) 1630 Val = ValueMap[Val]; 1631 isl_id_free(Id); 1632 1633 ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1634 1635 Instruction *Param = 1636 new AllocaInst(Val->getType(), AddressSpace, 1637 Launch + "_param_" + std::to_string(Index), 1638 EntryBlock->getTerminator()); 1639 Builder.CreateStore(Val, Param); 1640 insertStoreParameter(Parameters, Param, Index); 1641 Index++; 1642 } 1643 1644 for (auto Val : SubtreeValues) { 1645 ArgSizes[Index] = computeSizeInBytes(Val->getType()); 1646 1647 Instruction *Param = 1648 new AllocaInst(Val->getType(), AddressSpace, 1649 Launch + "_param_" + std::to_string(Index), 1650 EntryBlock->getTerminator()); 1651 Builder.CreateStore(Val, Param); 1652 insertStoreParameter(Parameters, Param, Index); 1653 Index++; 1654 } 1655 1656 for (int i = 0; i < NumArgs; i++) { 1657 Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); 1658 Instruction *Param = 1659 new AllocaInst(Builder.getInt32Ty(), AddressSpace, 1660 Launch + "_param_size_" + std::to_string(i), 1661 EntryBlock->getTerminator()); 1662 Builder.CreateStore(Val, Param); 1663 insertStoreParameter(Parameters, Param, Index); 1664 Index++; 1665 } 1666 1667 auto Location = EntryBlock->getTerminator(); 1668 return new BitCastInst(Parameters, Builder.getInt8PtrTy(), 1669 Launch + "_params_i8ptr", Location); 1670 } 1671 1672 void GPUNodeBuilder::setupKernelSubtreeFunctions( 1673 SetVector<Function *> SubtreeFunctions) { 1674 for (auto Fn : SubtreeFunctions) { 1675 const std::string ClonedFnName = Fn->getName(); 1676 Function *Clone = GPUModule->getFunction(ClonedFnName); 1677 if (!Clone) 1678 Clone = 1679 Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage, 1680 ClonedFnName, GPUModule.get()); 1681 assert(Clone && "Expected cloned function to be initialized."); 1682 assert(ValueMap.find(Fn) == ValueMap.end() && 1683 "Fn already present in ValueMap"); 1684 ValueMap[Fn] = Clone; 1685 } 1686 } 1687 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 1688 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 1689 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 1690 isl_id_free(Id); 1691 isl_ast_node_free(KernelStmt); 1692 1693 if (Kernel->n_grid > 1) 1694 DeepestParallel = 1695 std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set)); 1696 else 1697 DeepestSequential = 1698 std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set)); 1699 1700 Value *BlockDimX, *BlockDimY, *BlockDimZ; 1701 std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); 1702 1703 SetVector<Value *> SubtreeValues; 1704 SetVector<Function *> SubtreeFunctions; 1705 std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel); 1706 1707 assert(Kernel->tree && "Device AST of kernel node is empty"); 1708 1709 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 1710 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 1711 ValueMapT HostValueMap = ValueMap; 1712 BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap; 1713 ScalarMap.clear(); 1714 1715 SetVector<const Loop *> Loops; 1716 1717 // Create for all loops we depend on values that contain the current loop 1718 // iteration. These values are necessary to generate code for SCEVs that 1719 // depend on such loops. As a result we need to pass them to the subfunction. 1720 for (const Loop *L : Loops) { 1721 const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 1722 SE.getUnknown(Builder.getInt64(1)), 1723 L, SCEV::FlagAnyWrap); 1724 Value *V = generateSCEV(OuterLIV); 1725 OutsideLoopIterations[L] = SE.getUnknown(V); 1726 SubtreeValues.insert(V); 1727 } 1728 1729 createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions); 1730 setupKernelSubtreeFunctions(SubtreeFunctions); 1731 1732 create(isl_ast_node_copy(Kernel->tree)); 1733 1734 finalizeKernelArguments(Kernel); 1735 Function *F = Builder.GetInsertBlock()->getParent(); 1736 if (Arch == GPUArch::NVPTX64) 1737 addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); 1738 clearDominators(F); 1739 clearScalarEvolution(F); 1740 clearLoops(F); 1741 1742 IDToValue = HostIDs; 1743 1744 ValueMap = std::move(HostValueMap); 1745 ScalarMap = std::move(HostScalarMap); 1746 EscapeMap.clear(); 1747 IDToSAI.clear(); 1748 Annotator.resetAlternativeAliasBases(); 1749 for (auto &BasePtr : LocalArrays) 1750 S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array); 1751 LocalArrays.clear(); 1752 1753 std::string ASMString = finalizeKernelFunction(); 1754 Builder.SetInsertPoint(&HostInsertPoint); 1755 Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); 1756 1757 std::string Name = getKernelFuncName(Kernel->id); 1758 Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); 1759 Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); 1760 Value *GPUKernel = createCallGetKernel(KernelString, NameString); 1761 1762 Value *GridDimX, *GridDimY; 1763 std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); 1764 1765 createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 1766 BlockDimZ, Parameters); 1767 createCallFreeKernel(GPUKernel); 1768 1769 for (auto Id : KernelIds) 1770 isl_id_free(Id); 1771 1772 KernelIds.clear(); 1773 } 1774 1775 /// Compute the DataLayout string for the NVPTX backend. 1776 /// 1777 /// @param is64Bit Are we looking for a 64 bit architecture? 1778 static std::string computeNVPTXDataLayout(bool is64Bit) { 1779 std::string Ret = ""; 1780 1781 if (!is64Bit) { 1782 Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1783 "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1784 "64-v128:128:128-n16:32:64"; 1785 } else { 1786 Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1787 "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:" 1788 "64-v128:128:128-n16:32:64"; 1789 } 1790 1791 return Ret; 1792 } 1793 1794 /// Compute the DataLayout string for a SPIR kernel. 1795 /// 1796 /// @param is64Bit Are we looking for a 64 bit architecture? 1797 static std::string computeSPIRDataLayout(bool is64Bit) { 1798 std::string Ret = ""; 1799 1800 if (!is64Bit) { 1801 Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1802 "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" 1803 "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" 1804 "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; 1805 } else { 1806 Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:" 1807 "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:" 1808 "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:" 1809 "256:256-v256:256:256-v512:512:512-v1024:1024:1024"; 1810 } 1811 1812 return Ret; 1813 } 1814 1815 Function * 1816 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 1817 SetVector<Value *> &SubtreeValues) { 1818 std::vector<Type *> Args; 1819 std::string Identifier = getKernelFuncName(Kernel->id); 1820 1821 std::vector<Metadata *> MemoryType; 1822 1823 for (long i = 0; i < Prog->n_array; i++) { 1824 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1825 continue; 1826 1827 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 1828 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1829 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); 1830 Args.push_back(SAI->getElementType()); 1831 MemoryType.push_back( 1832 ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); 1833 } else { 1834 static const int UseGlobalMemory = 1; 1835 Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory)); 1836 MemoryType.push_back( 1837 ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1))); 1838 } 1839 } 1840 1841 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1842 1843 for (long i = 0; i < NumHostIters; i++) { 1844 Args.push_back(Builder.getInt64Ty()); 1845 MemoryType.push_back( 1846 ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); 1847 } 1848 1849 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1850 1851 for (long i = 0; i < NumVars; i++) { 1852 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1853 Value *Val = IDToValue[Id]; 1854 isl_id_free(Id); 1855 Args.push_back(Val->getType()); 1856 MemoryType.push_back( 1857 ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); 1858 } 1859 1860 for (auto *V : SubtreeValues) { 1861 Args.push_back(V->getType()); 1862 MemoryType.push_back( 1863 ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0))); 1864 } 1865 1866 auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 1867 auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 1868 GPUModule.get()); 1869 1870 std::vector<Metadata *> EmptyStrings; 1871 1872 for (unsigned int i = 0; i < MemoryType.size(); i++) { 1873 EmptyStrings.push_back(MDString::get(FN->getContext(), "")); 1874 } 1875 1876 if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) { 1877 FN->setMetadata("kernel_arg_addr_space", 1878 MDNode::get(FN->getContext(), MemoryType)); 1879 FN->setMetadata("kernel_arg_name", 1880 MDNode::get(FN->getContext(), EmptyStrings)); 1881 FN->setMetadata("kernel_arg_access_qual", 1882 MDNode::get(FN->getContext(), EmptyStrings)); 1883 FN->setMetadata("kernel_arg_type", 1884 MDNode::get(FN->getContext(), EmptyStrings)); 1885 FN->setMetadata("kernel_arg_type_qual", 1886 MDNode::get(FN->getContext(), EmptyStrings)); 1887 FN->setMetadata("kernel_arg_base_type", 1888 MDNode::get(FN->getContext(), EmptyStrings)); 1889 } 1890 1891 switch (Arch) { 1892 case GPUArch::NVPTX64: 1893 FN->setCallingConv(CallingConv::PTX_Kernel); 1894 break; 1895 case GPUArch::SPIR32: 1896 case GPUArch::SPIR64: 1897 FN->setCallingConv(CallingConv::SPIR_KERNEL); 1898 break; 1899 } 1900 1901 auto Arg = FN->arg_begin(); 1902 for (long i = 0; i < Kernel->n_array; i++) { 1903 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1904 continue; 1905 1906 Arg->setName(Kernel->array[i].array->name); 1907 1908 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1909 const ScopArrayInfo *SAI = 1910 ScopArrayInfo::getFromId(isl::manage(isl_id_copy(Id))); 1911 Type *EleTy = SAI->getElementType(); 1912 Value *Val = &*Arg; 1913 SmallVector<const SCEV *, 4> Sizes; 1914 isl_ast_build *Build = 1915 isl_ast_build_from_context(isl_set_copy(Prog->context)); 1916 Sizes.push_back(nullptr); 1917 for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 1918 isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 1919 Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j)); 1920 auto V = ExprBuilder.create(DimSize); 1921 Sizes.push_back(SE.getSCEV(V)); 1922 } 1923 const ScopArrayInfo *SAIRep = 1924 S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array); 1925 LocalArrays.push_back(Val); 1926 1927 isl_ast_build_free(Build); 1928 KernelIds.push_back(Id); 1929 IDToSAI[Id] = SAIRep; 1930 Arg++; 1931 } 1932 1933 for (long i = 0; i < NumHostIters; i++) { 1934 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1935 Arg->setName(isl_id_get_name(Id)); 1936 IDToValue[Id] = &*Arg; 1937 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1938 Arg++; 1939 } 1940 1941 for (long i = 0; i < NumVars; i++) { 1942 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1943 Arg->setName(isl_id_get_name(Id)); 1944 Value *Val = IDToValue[Id]; 1945 ValueMap[Val] = &*Arg; 1946 IDToValue[Id] = &*Arg; 1947 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1948 Arg++; 1949 } 1950 1951 for (auto *V : SubtreeValues) { 1952 Arg->setName(V->getName()); 1953 ValueMap[V] = &*Arg; 1954 Arg++; 1955 } 1956 1957 return FN; 1958 } 1959 1960 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 1961 Intrinsic::ID IntrinsicsBID[2]; 1962 Intrinsic::ID IntrinsicsTID[3]; 1963 1964 switch (Arch) { 1965 case GPUArch::SPIR64: 1966 case GPUArch::SPIR32: 1967 llvm_unreachable("Cannot generate NVVM intrinsics for SPIR"); 1968 case GPUArch::NVPTX64: 1969 IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x; 1970 IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y; 1971 1972 IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x; 1973 IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y; 1974 IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z; 1975 break; 1976 } 1977 1978 auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 1979 std::string Name = isl_id_get_name(Id); 1980 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1981 Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 1982 Value *Val = Builder.CreateCall(IntrinsicFn, {}); 1983 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 1984 IDToValue[Id] = Val; 1985 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1986 }; 1987 1988 for (int i = 0; i < Kernel->n_grid; ++i) { 1989 isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 1990 addId(Id, IntrinsicsBID[i]); 1991 } 1992 1993 for (int i = 0; i < Kernel->n_block; ++i) { 1994 isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 1995 addId(Id, IntrinsicsTID[i]); 1996 } 1997 } 1998 1999 void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) { 2000 const char *GroupName[3] = {"__gen_ocl_get_group_id0", 2001 "__gen_ocl_get_group_id1", 2002 "__gen_ocl_get_group_id2"}; 2003 2004 const char *LocalName[3] = {"__gen_ocl_get_local_id0", 2005 "__gen_ocl_get_local_id1", 2006 "__gen_ocl_get_local_id2"}; 2007 2008 auto createFunc = [this](const char *Name, __isl_take isl_id *Id) mutable { 2009 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 2010 Function *FN = M->getFunction(Name); 2011 2012 // If FN is not available, declare it. 2013 if (!FN) { 2014 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 2015 std::vector<Type *> Args; 2016 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Args, false); 2017 FN = Function::Create(Ty, Linkage, Name, M); 2018 FN->setCallingConv(CallingConv::SPIR_FUNC); 2019 } 2020 2021 Value *Val = Builder.CreateCall(FN, {}); 2022 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 2023 IDToValue[Id] = Val; 2024 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 2025 }; 2026 2027 for (int i = 0; i < Kernel->n_grid; ++i) 2028 createFunc(GroupName[i], isl_id_list_get_id(Kernel->block_ids, i)); 2029 2030 for (int i = 0; i < Kernel->n_block; ++i) 2031 createFunc(LocalName[i], isl_id_list_get_id(Kernel->thread_ids, i)); 2032 } 2033 2034 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { 2035 auto Arg = FN->arg_begin(); 2036 for (long i = 0; i < Kernel->n_array; i++) { 2037 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 2038 continue; 2039 2040 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 2041 const ScopArrayInfo *SAI = 2042 ScopArrayInfo::getFromId(isl::manage(isl_id_copy(Id))); 2043 isl_id_free(Id); 2044 2045 if (SAI->getNumberOfDimensions() > 0) { 2046 Arg++; 2047 continue; 2048 } 2049 2050 Value *Val = &*Arg; 2051 2052 if (!gpu_array_is_read_only_scalar(&Prog->array[i])) { 2053 Type *TypePtr = SAI->getElementType()->getPointerTo(); 2054 Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr); 2055 Val = Builder.CreateLoad(TypedArgPtr); 2056 } 2057 2058 Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 2059 Builder.CreateStore(Val, Alloca); 2060 2061 Arg++; 2062 } 2063 } 2064 2065 void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) { 2066 auto *FN = Builder.GetInsertBlock()->getParent(); 2067 auto Arg = FN->arg_begin(); 2068 2069 bool StoredScalar = false; 2070 for (long i = 0; i < Kernel->n_array; i++) { 2071 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 2072 continue; 2073 2074 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 2075 const ScopArrayInfo *SAI = 2076 ScopArrayInfo::getFromId(isl::manage(isl_id_copy(Id))); 2077 isl_id_free(Id); 2078 2079 if (SAI->getNumberOfDimensions() > 0) { 2080 Arg++; 2081 continue; 2082 } 2083 2084 if (gpu_array_is_read_only_scalar(&Prog->array[i])) { 2085 Arg++; 2086 continue; 2087 } 2088 2089 Value *Alloca = BlockGen.getOrCreateAlloca(SAI); 2090 Value *ArgPtr = &*Arg; 2091 Type *TypePtr = SAI->getElementType()->getPointerTo(); 2092 Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); 2093 Value *Val = Builder.CreateLoad(Alloca); 2094 Builder.CreateStore(Val, TypedArgPtr); 2095 StoredScalar = true; 2096 2097 Arg++; 2098 } 2099 2100 if (StoredScalar) { 2101 /// In case more than one thread contains scalar stores, the generated 2102 /// code might be incorrect, if we only store at the end of the kernel. 2103 /// To support this case we need to store these scalars back at each 2104 /// memory store or at least before each kernel barrier. 2105 if (Kernel->n_block != 0 || Kernel->n_grid != 0) { 2106 BuildSuccessful = 0; 2107 DEBUG( 2108 dbgs() << getUniqueScopName(&S) 2109 << " has a store to a scalar value that" 2110 " would be undefined to run in parallel. Bailing out.\n";); 2111 } 2112 } 2113 } 2114 2115 void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { 2116 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 2117 2118 for (int i = 0; i < Kernel->n_var; ++i) { 2119 struct ppcg_kernel_var &Var = Kernel->var[i]; 2120 isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set); 2121 Type *EleTy = ScopArrayInfo::getFromId(isl::manage(Id))->getElementType(); 2122 2123 Type *ArrayTy = EleTy; 2124 SmallVector<const SCEV *, 4> Sizes; 2125 2126 Sizes.push_back(nullptr); 2127 for (unsigned int j = 1; j < Var.array->n_index; ++j) { 2128 isl_val *Val = isl_vec_get_element_val(Var.size, j); 2129 long Bound = isl_val_get_num_si(Val); 2130 isl_val_free(Val); 2131 Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound)); 2132 } 2133 2134 for (int j = Var.array->n_index - 1; j >= 0; --j) { 2135 isl_val *Val = isl_vec_get_element_val(Var.size, j); 2136 long Bound = isl_val_get_num_si(Val); 2137 isl_val_free(Val); 2138 ArrayTy = ArrayType::get(ArrayTy, Bound); 2139 } 2140 2141 const ScopArrayInfo *SAI; 2142 Value *Allocation; 2143 if (Var.type == ppcg_access_shared) { 2144 auto GlobalVar = new GlobalVariable( 2145 *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, 2146 nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3); 2147 GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8); 2148 GlobalVar->setInitializer(Constant::getNullValue(ArrayTy)); 2149 2150 Allocation = GlobalVar; 2151 } else if (Var.type == ppcg_access_private) { 2152 Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array"); 2153 } else { 2154 llvm_unreachable("unknown variable type"); 2155 } 2156 SAI = 2157 S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array); 2158 Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr); 2159 IDToValue[Id] = Allocation; 2160 LocalArrays.push_back(Allocation); 2161 KernelIds.push_back(Id); 2162 IDToSAI[Id] = SAI; 2163 } 2164 } 2165 2166 void GPUNodeBuilder::createKernelFunction( 2167 ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues, 2168 SetVector<Function *> &SubtreeFunctions) { 2169 std::string Identifier = getKernelFuncName(Kernel->id); 2170 GPUModule.reset(new Module(Identifier, Builder.getContext())); 2171 2172 switch (Arch) { 2173 case GPUArch::NVPTX64: 2174 if (Runtime == GPURuntime::CUDA) 2175 GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 2176 else if (Runtime == GPURuntime::OpenCL) 2177 GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl")); 2178 GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); 2179 break; 2180 case GPUArch::SPIR32: 2181 GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown")); 2182 GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */)); 2183 break; 2184 case GPUArch::SPIR64: 2185 GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown")); 2186 GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */)); 2187 break; 2188 } 2189 2190 Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); 2191 2192 BasicBlock *PrevBlock = Builder.GetInsertBlock(); 2193 auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); 2194 2195 DT.addNewBlock(EntryBlock, PrevBlock); 2196 2197 Builder.SetInsertPoint(EntryBlock); 2198 Builder.CreateRetVoid(); 2199 Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); 2200 2201 ScopDetection::markFunctionAsInvalid(FN); 2202 2203 prepareKernelArguments(Kernel, FN); 2204 createKernelVariables(Kernel, FN); 2205 2206 switch (Arch) { 2207 case GPUArch::NVPTX64: 2208 insertKernelIntrinsics(Kernel); 2209 break; 2210 case GPUArch::SPIR32: 2211 case GPUArch::SPIR64: 2212 insertKernelCallsSPIR(Kernel); 2213 break; 2214 } 2215 } 2216 2217 std::string GPUNodeBuilder::createKernelASM() { 2218 llvm::Triple GPUTriple; 2219 2220 switch (Arch) { 2221 case GPUArch::NVPTX64: 2222 switch (Runtime) { 2223 case GPURuntime::CUDA: 2224 GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda")); 2225 break; 2226 case GPURuntime::OpenCL: 2227 GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl")); 2228 break; 2229 } 2230 break; 2231 case GPUArch::SPIR64: 2232 case GPUArch::SPIR32: 2233 std::string SPIRAssembly; 2234 raw_string_ostream IROstream(SPIRAssembly); 2235 IROstream << *GPUModule; 2236 IROstream.flush(); 2237 return SPIRAssembly; 2238 } 2239 2240 std::string ErrMsg; 2241 auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); 2242 2243 if (!GPUTarget) { 2244 errs() << ErrMsg << "\n"; 2245 return ""; 2246 } 2247 2248 TargetOptions Options; 2249 Options.UnsafeFPMath = FastMath; 2250 2251 std::string subtarget; 2252 2253 switch (Arch) { 2254 case GPUArch::NVPTX64: 2255 subtarget = CudaVersion; 2256 break; 2257 case GPUArch::SPIR32: 2258 case GPUArch::SPIR64: 2259 llvm_unreachable("No subtarget for SPIR architecture"); 2260 } 2261 2262 std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine( 2263 GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>())); 2264 2265 SmallString<0> ASMString; 2266 raw_svector_ostream ASMStream(ASMString); 2267 llvm::legacy::PassManager PM; 2268 2269 PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); 2270 2271 if (TargetM->addPassesToEmitFile( 2272 PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { 2273 errs() << "The target does not support generation of this file type!\n"; 2274 return ""; 2275 } 2276 2277 PM.run(*GPUModule); 2278 2279 return ASMStream.str(); 2280 } 2281 2282 bool GPUNodeBuilder::requiresCUDALibDevice() { 2283 for (Function &F : GPUModule->functions()) { 2284 if (!F.isDeclaration()) 2285 continue; 2286 2287 std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F); 2288 if (CUDALibDeviceFunc.length() != 0) { 2289 F.setName(CUDALibDeviceFunc); 2290 return true; 2291 } 2292 } 2293 2294 return false; 2295 } 2296 2297 void GPUNodeBuilder::addCUDALibDevice() { 2298 if (Arch != GPUArch::NVPTX64) 2299 return; 2300 2301 if (requiresCUDALibDevice()) { 2302 SMDiagnostic Error; 2303 2304 errs() << CUDALibDevice << "\n"; 2305 auto LibDeviceModule = 2306 parseIRFile(CUDALibDevice, Error, GPUModule->getContext()); 2307 2308 if (!LibDeviceModule) { 2309 BuildSuccessful = false; 2310 report_fatal_error("Could not find or load libdevice. Skipping GPU " 2311 "kernel generation. Please set -polly-acc-libdevice " 2312 "accordingly.\n"); 2313 return; 2314 } 2315 2316 Linker L(*GPUModule); 2317 2318 // Set an nvptx64 target triple to avoid linker warnings. The original 2319 // triple of the libdevice files are nvptx-unknown-unknown. 2320 LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 2321 L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded); 2322 } 2323 } 2324 2325 std::string GPUNodeBuilder::finalizeKernelFunction() { 2326 2327 if (verifyModule(*GPUModule)) { 2328 DEBUG(dbgs() << "verifyModule failed on module:\n"; 2329 GPUModule->print(dbgs(), nullptr); dbgs() << "\n";); 2330 DEBUG(dbgs() << "verifyModule Error:\n"; 2331 verifyModule(*GPUModule, &dbgs());); 2332 2333 if (FailOnVerifyModuleFailure) 2334 llvm_unreachable("VerifyModule failed."); 2335 2336 BuildSuccessful = false; 2337 return ""; 2338 } 2339 2340 addCUDALibDevice(); 2341 2342 if (DumpKernelIR) 2343 outs() << *GPUModule << "\n"; 2344 2345 if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) { 2346 // Optimize module. 2347 llvm::legacy::PassManager OptPasses; 2348 PassManagerBuilder PassBuilder; 2349 PassBuilder.OptLevel = 3; 2350 PassBuilder.SizeLevel = 0; 2351 PassBuilder.populateModulePassManager(OptPasses); 2352 OptPasses.run(*GPUModule); 2353 } 2354 2355 std::string Assembly = createKernelASM(); 2356 2357 if (DumpKernelASM) 2358 outs() << Assembly << "\n"; 2359 2360 GPUModule.release(); 2361 KernelIDs.clear(); 2362 2363 return Assembly; 2364 } 2365 /// Construct an `isl_pw_aff_list` from a vector of `isl_pw_aff` 2366 /// @param PwAffs The list of piecewise affine functions to create an 2367 /// `isl_pw_aff_list` from. We expect an rvalue ref because 2368 /// all the isl_pw_aff are used up by this function. 2369 /// 2370 /// @returns The `isl_pw_aff_list`. 2371 __isl_give isl_pw_aff_list * 2372 createPwAffList(isl_ctx *Context, 2373 const std::vector<__isl_take isl_pw_aff *> &&PwAffs) { 2374 isl_pw_aff_list *List = isl_pw_aff_list_alloc(Context, PwAffs.size()); 2375 2376 for (unsigned i = 0; i < PwAffs.size(); i++) { 2377 List = isl_pw_aff_list_insert(List, i, PwAffs[i]); 2378 } 2379 return List; 2380 } 2381 2382 /// Align all the `PwAffs` such that they have the same parameter dimensions. 2383 /// 2384 /// We loop over all `pw_aff` and align all of their spaces together to 2385 /// create a common space for all the `pw_aff`. This common space is the 2386 /// `AlignSpace`. We then align all the `pw_aff` to this space. We start 2387 /// with the given `SeedSpace`. 2388 /// @param PwAffs The list of piecewise affine functions we want to align. 2389 /// This is an rvalue reference because the entire vector is 2390 /// used up by the end of the operation. 2391 /// @param SeedSpace The space to start the alignment process with. 2392 /// @returns A std::pair, whose first element is the aligned space, 2393 /// whose second element is the vector of aligned piecewise 2394 /// affines. 2395 static std::pair<__isl_give isl_space *, std::vector<__isl_give isl_pw_aff *>> 2396 alignPwAffs(const std::vector<__isl_take isl_pw_aff *> &&PwAffs, 2397 __isl_take isl_space *SeedSpace) { 2398 assert(SeedSpace && "Invalid seed space given."); 2399 2400 isl_space *AlignSpace = SeedSpace; 2401 for (isl_pw_aff *PwAff : PwAffs) { 2402 isl_space *PwAffSpace = isl_pw_aff_get_domain_space(PwAff); 2403 AlignSpace = isl_space_align_params(AlignSpace, PwAffSpace); 2404 } 2405 std::vector<isl_pw_aff *> AdjustedPwAffs; 2406 2407 for (unsigned i = 0; i < PwAffs.size(); i++) { 2408 isl_pw_aff *Adjusted = PwAffs[i]; 2409 assert(Adjusted && "Invalid pw_aff given."); 2410 Adjusted = isl_pw_aff_align_params(Adjusted, isl_space_copy(AlignSpace)); 2411 AdjustedPwAffs.push_back(Adjusted); 2412 } 2413 return std::make_pair(AlignSpace, AdjustedPwAffs); 2414 } 2415 2416 namespace { 2417 class PPCGCodeGeneration : public ScopPass { 2418 public: 2419 static char ID; 2420 2421 GPURuntime Runtime = GPURuntime::CUDA; 2422 2423 GPUArch Architecture = GPUArch::NVPTX64; 2424 2425 /// The scop that is currently processed. 2426 Scop *S; 2427 2428 LoopInfo *LI; 2429 DominatorTree *DT; 2430 ScalarEvolution *SE; 2431 const DataLayout *DL; 2432 RegionInfo *RI; 2433 2434 PPCGCodeGeneration() : ScopPass(ID) {} 2435 2436 /// Construct compilation options for PPCG. 2437 /// 2438 /// @returns The compilation options. 2439 ppcg_options *createPPCGOptions() { 2440 auto DebugOptions = 2441 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 2442 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 2443 2444 DebugOptions->dump_schedule_constraints = false; 2445 DebugOptions->dump_schedule = false; 2446 DebugOptions->dump_final_schedule = false; 2447 DebugOptions->dump_sizes = false; 2448 DebugOptions->verbose = false; 2449 2450 Options->debug = DebugOptions; 2451 2452 Options->group_chains = false; 2453 Options->reschedule = true; 2454 Options->scale_tile_loops = false; 2455 Options->wrap = false; 2456 2457 Options->non_negative_parameters = false; 2458 Options->ctx = nullptr; 2459 Options->sizes = nullptr; 2460 2461 Options->tile = true; 2462 Options->tile_size = 32; 2463 2464 Options->isolate_full_tiles = false; 2465 2466 Options->use_private_memory = PrivateMemory; 2467 Options->use_shared_memory = SharedMemory; 2468 Options->max_shared_memory = 48 * 1024; 2469 2470 Options->target = PPCG_TARGET_CUDA; 2471 Options->openmp = false; 2472 Options->linearize_device_arrays = true; 2473 Options->allow_gnu_extensions = false; 2474 2475 Options->unroll_copy_shared = false; 2476 Options->unroll_gpu_tile = false; 2477 Options->live_range_reordering = true; 2478 2479 Options->live_range_reordering = true; 2480 Options->hybrid = false; 2481 Options->opencl_compiler_options = nullptr; 2482 Options->opencl_use_gpu = false; 2483 Options->opencl_n_include_file = 0; 2484 Options->opencl_include_files = nullptr; 2485 Options->opencl_print_kernel_types = false; 2486 Options->opencl_embed_kernel_code = false; 2487 2488 Options->save_schedule_file = nullptr; 2489 Options->load_schedule_file = nullptr; 2490 2491 return Options; 2492 } 2493 2494 /// Get a tagged access relation containing all accesses of type @p AccessTy. 2495 /// 2496 /// Instead of a normal access of the form: 2497 /// 2498 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 2499 /// 2500 /// a tagged access has the form 2501 /// 2502 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 2503 /// 2504 /// where 'id' is an additional space that references the memory access that 2505 /// triggered the access. 2506 /// 2507 /// @param AccessTy The type of the memory accesses to collect. 2508 /// 2509 /// @return The relation describing all tagged memory accesses. 2510 isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 2511 isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 2512 2513 for (auto &Stmt : *S) 2514 for (auto &Acc : Stmt) 2515 if (Acc->getType() == AccessTy) { 2516 isl_map *Relation = Acc->getAccessRelation().release(); 2517 Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 2518 2519 isl_space *Space = isl_map_get_space(Relation); 2520 Space = isl_space_range(Space); 2521 Space = isl_space_from_range(Space); 2522 Space = 2523 isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); 2524 isl_map *Universe = isl_map_universe(Space); 2525 Relation = isl_map_domain_product(Relation, Universe); 2526 Accesses = isl_union_map_add_map(Accesses, Relation); 2527 } 2528 2529 return Accesses; 2530 } 2531 2532 /// Get the set of all read accesses, tagged with the access id. 2533 /// 2534 /// @see getTaggedAccesses 2535 isl_union_map *getTaggedReads() { 2536 return getTaggedAccesses(MemoryAccess::READ); 2537 } 2538 2539 /// Get the set of all may (and must) accesses, tagged with the access id. 2540 /// 2541 /// @see getTaggedAccesses 2542 isl_union_map *getTaggedMayWrites() { 2543 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 2544 getTaggedAccesses(MemoryAccess::MUST_WRITE)); 2545 } 2546 2547 /// Get the set of all must accesses, tagged with the access id. 2548 /// 2549 /// @see getTaggedAccesses 2550 isl_union_map *getTaggedMustWrites() { 2551 return getTaggedAccesses(MemoryAccess::MUST_WRITE); 2552 } 2553 2554 /// Collect parameter and array names as isl_ids. 2555 /// 2556 /// To reason about the different parameters and arrays used, ppcg requires 2557 /// a list of all isl_ids in use. As PPCG traditionally performs 2558 /// source-to-source compilation each of these isl_ids is mapped to the 2559 /// expression that represents it. As we do not have a corresponding 2560 /// expression in Polly, we just map each id to a 'zero' expression to match 2561 /// the data format that ppcg expects. 2562 /// 2563 /// @returns Retun a map from collected ids to 'zero' ast expressions. 2564 __isl_give isl_id_to_ast_expr *getNames() { 2565 auto *Names = isl_id_to_ast_expr_alloc( 2566 S->getIslCtx(), 2567 S->getNumParams() + std::distance(S->array_begin(), S->array_end())); 2568 auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx())); 2569 2570 for (const SCEV *P : S->parameters()) { 2571 isl_id *Id = S->getIdForParam(P); 2572 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 2573 } 2574 2575 for (auto &Array : S->arrays()) { 2576 auto Id = Array->getBasePtrId().release(); 2577 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 2578 } 2579 2580 isl_ast_expr_free(Zero); 2581 2582 return Names; 2583 } 2584 2585 /// Create a new PPCG scop from the current scop. 2586 /// 2587 /// The PPCG scop is initialized with data from the current polly::Scop. From 2588 /// this initial data, the data-dependences in the PPCG scop are initialized. 2589 /// We do not use Polly's dependence analysis for now, to ensure we match 2590 /// the PPCG default behaviour more closely. 2591 /// 2592 /// @returns A new ppcg scop. 2593 ppcg_scop *createPPCGScop() { 2594 MustKillsInfo KillsInfo = computeMustKillsInfo(*S); 2595 2596 auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); 2597 2598 PPCGScop->options = createPPCGOptions(); 2599 // enable live range reordering 2600 PPCGScop->options->live_range_reordering = 1; 2601 2602 PPCGScop->start = 0; 2603 PPCGScop->end = 0; 2604 2605 PPCGScop->context = S->getContext(); 2606 PPCGScop->domain = S->getDomains(); 2607 // TODO: investigate this further. PPCG calls collect_call_domains. 2608 PPCGScop->call = isl_union_set_from_set(S->getContext()); 2609 PPCGScop->tagged_reads = getTaggedReads(); 2610 PPCGScop->reads = S->getReads(); 2611 PPCGScop->live_in = nullptr; 2612 PPCGScop->tagged_may_writes = getTaggedMayWrites(); 2613 PPCGScop->may_writes = S->getWrites(); 2614 PPCGScop->tagged_must_writes = getTaggedMustWrites(); 2615 PPCGScop->must_writes = S->getMustWrites(); 2616 PPCGScop->live_out = nullptr; 2617 PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take(); 2618 PPCGScop->must_kills = KillsInfo.MustKills.take(); 2619 2620 PPCGScop->tagger = nullptr; 2621 PPCGScop->independence = 2622 isl_union_map_empty(isl_set_get_space(PPCGScop->context)); 2623 PPCGScop->dep_flow = nullptr; 2624 PPCGScop->tagged_dep_flow = nullptr; 2625 PPCGScop->dep_false = nullptr; 2626 PPCGScop->dep_forced = nullptr; 2627 PPCGScop->dep_order = nullptr; 2628 PPCGScop->tagged_dep_order = nullptr; 2629 2630 PPCGScop->schedule = S->getScheduleTree(); 2631 // If we have something non-trivial to kill, add it to the schedule 2632 if (KillsInfo.KillsSchedule.get()) 2633 PPCGScop->schedule = isl_schedule_sequence( 2634 PPCGScop->schedule, KillsInfo.KillsSchedule.take()); 2635 2636 PPCGScop->names = getNames(); 2637 PPCGScop->pet = nullptr; 2638 2639 compute_tagger(PPCGScop); 2640 compute_dependences(PPCGScop); 2641 eliminate_dead_code(PPCGScop); 2642 2643 return PPCGScop; 2644 } 2645 2646 /// Collect the array accesses in a statement. 2647 /// 2648 /// @param Stmt The statement for which to collect the accesses. 2649 /// 2650 /// @returns A list of array accesses. 2651 gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { 2652 gpu_stmt_access *Accesses = nullptr; 2653 2654 for (MemoryAccess *Acc : Stmt) { 2655 auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access); 2656 Access->read = Acc->isRead(); 2657 Access->write = Acc->isWrite(); 2658 Access->access = Acc->getAccessRelation().release(); 2659 isl_space *Space = isl_map_get_space(Access->access); 2660 Space = isl_space_range(Space); 2661 Space = isl_space_from_range(Space); 2662 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release()); 2663 isl_map *Universe = isl_map_universe(Space); 2664 Access->tagged_access = 2665 isl_map_domain_product(Acc->getAccessRelation().release(), Universe); 2666 Access->exact_write = !Acc->isMayWrite(); 2667 Access->ref_id = Acc->getId().release(); 2668 Access->next = Accesses; 2669 Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); 2670 Accesses = Access; 2671 } 2672 2673 return Accesses; 2674 } 2675 2676 /// Collect the list of GPU statements. 2677 /// 2678 /// Each statement has an id, a pointer to the underlying data structure, 2679 /// as well as a list with all memory accesses. 2680 /// 2681 /// TODO: Initialize the list of memory accesses. 2682 /// 2683 /// @returns A linked-list of statements. 2684 gpu_stmt *getStatements() { 2685 gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, 2686 std::distance(S->begin(), S->end())); 2687 2688 int i = 0; 2689 for (auto &Stmt : *S) { 2690 gpu_stmt *GPUStmt = &Stmts[i]; 2691 2692 GPUStmt->id = Stmt.getDomainId(); 2693 2694 // We use the pet stmt pointer to keep track of the Polly statements. 2695 GPUStmt->stmt = (pet_stmt *)&Stmt; 2696 GPUStmt->accesses = getStmtAccesses(Stmt); 2697 i++; 2698 } 2699 2700 return Stmts; 2701 } 2702 2703 /// Derive the extent of an array. 2704 /// 2705 /// The extent of an array is the set of elements that are within the 2706 /// accessed array. For the inner dimensions, the extent constraints are 2707 /// 0 and the size of the corresponding array dimension. For the first 2708 /// (outermost) dimension, the extent constraints are the minimal and maximal 2709 /// subscript value for the first dimension. 2710 /// 2711 /// @param Array The array to derive the extent for. 2712 /// 2713 /// @returns An isl_set describing the extent of the array. 2714 __isl_give isl_set *getExtent(ScopArrayInfo *Array) { 2715 unsigned NumDims = Array->getNumberOfDimensions(); 2716 isl_union_map *Accesses = S->getAccesses(); 2717 Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains()); 2718 Accesses = isl_union_map_detect_equalities(Accesses); 2719 isl_union_set *AccessUSet = isl_union_map_range(Accesses); 2720 AccessUSet = isl_union_set_coalesce(AccessUSet); 2721 AccessUSet = isl_union_set_detect_equalities(AccessUSet); 2722 AccessUSet = isl_union_set_coalesce(AccessUSet); 2723 2724 if (isl_union_set_is_empty(AccessUSet)) { 2725 isl_union_set_free(AccessUSet); 2726 return isl_set_empty(Array->getSpace().release()); 2727 } 2728 2729 if (Array->getNumberOfDimensions() == 0) { 2730 isl_union_set_free(AccessUSet); 2731 return isl_set_universe(Array->getSpace().release()); 2732 } 2733 2734 isl_set *AccessSet = 2735 isl_union_set_extract_set(AccessUSet, Array->getSpace().release()); 2736 2737 isl_union_set_free(AccessUSet); 2738 isl_local_space *LS = 2739 isl_local_space_from_space(Array->getSpace().release()); 2740 2741 isl_pw_aff *Val = 2742 isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0)); 2743 2744 isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0); 2745 isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0); 2746 OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in, 2747 isl_pw_aff_dim(Val, isl_dim_in)); 2748 OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in, 2749 isl_pw_aff_dim(Val, isl_dim_in)); 2750 OuterMin = isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, 2751 Array->getBasePtrId().release()); 2752 OuterMax = isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, 2753 Array->getBasePtrId().release()); 2754 2755 isl_set *Extent = isl_set_universe(Array->getSpace().release()); 2756 2757 Extent = isl_set_intersect( 2758 Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val))); 2759 Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val)); 2760 2761 for (unsigned i = 1; i < NumDims; ++i) 2762 Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0); 2763 2764 for (unsigned i = 0; i < NumDims; ++i) { 2765 isl_pw_aff *PwAff = 2766 const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i).release()); 2767 2768 // isl_pw_aff can be NULL for zero dimension. Only in the case of a 2769 // Fortran array will we have a legitimate dimension. 2770 if (!PwAff) { 2771 assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension"); 2772 continue; 2773 } 2774 2775 isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain( 2776 isl_local_space_from_space(Array->getSpace().release()), isl_dim_set, 2777 i)); 2778 PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in, 2779 isl_pw_aff_dim(Val, isl_dim_in)); 2780 PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in, 2781 isl_pw_aff_get_tuple_id(Val, isl_dim_in)); 2782 auto *Set = isl_pw_aff_gt_set(PwAff, Val); 2783 Extent = isl_set_intersect(Set, Extent); 2784 } 2785 2786 return Extent; 2787 } 2788 2789 /// Derive the bounds of an array. 2790 /// 2791 /// For the first dimension we derive the bound of the array from the extent 2792 /// of this dimension. For inner dimensions we obtain their size directly from 2793 /// ScopArrayInfo. 2794 /// 2795 /// @param PPCGArray The array to compute bounds for. 2796 /// @param Array The polly array from which to take the information. 2797 void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) { 2798 std::vector<isl_pw_aff *> Bounds; 2799 2800 if (PPCGArray.n_index > 0) { 2801 if (isl_set_is_empty(PPCGArray.extent)) { 2802 isl_set *Dom = isl_set_copy(PPCGArray.extent); 2803 isl_local_space *LS = isl_local_space_from_space( 2804 isl_space_params(isl_set_get_space(Dom))); 2805 isl_set_free(Dom); 2806 isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS)); 2807 Bounds.push_back(Zero); 2808 } else { 2809 isl_set *Dom = isl_set_copy(PPCGArray.extent); 2810 Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1); 2811 isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0); 2812 isl_set_free(Dom); 2813 Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound)); 2814 isl_local_space *LS = 2815 isl_local_space_from_space(isl_set_get_space(Dom)); 2816 isl_aff *One = isl_aff_zero_on_domain(LS); 2817 One = isl_aff_add_constant_si(One, 1); 2818 Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One)); 2819 Bound = isl_pw_aff_gist(Bound, S->getContext()); 2820 Bounds.push_back(Bound); 2821 } 2822 } 2823 2824 for (unsigned i = 1; i < PPCGArray.n_index; ++i) { 2825 isl_pw_aff *Bound = Array->getDimensionSizePw(i).release(); 2826 auto LS = isl_pw_aff_get_domain_space(Bound); 2827 auto Aff = isl_multi_aff_zero(LS); 2828 Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff); 2829 Bounds.push_back(Bound); 2830 } 2831 2832 /// To construct a `isl_multi_pw_aff`, we need all the indivisual `pw_aff` 2833 /// to have the same parameter dimensions. So, we need to align them to an 2834 /// appropriate space. 2835 /// Scop::Context is _not_ an appropriate space, because when we have 2836 /// `-polly-ignore-parameter-bounds` enabled, the Scop::Context does not 2837 /// contain all parameter dimensions. 2838 /// So, use the helper `alignPwAffs` to align all the `isl_pw_aff` together. 2839 isl_space *SeedAlignSpace = S->getParamSpace(); 2840 SeedAlignSpace = isl_space_add_dims(SeedAlignSpace, isl_dim_set, 1); 2841 2842 isl_space *AlignSpace = nullptr; 2843 std::vector<isl_pw_aff *> AlignedBounds; 2844 std::tie(AlignSpace, AlignedBounds) = 2845 alignPwAffs(std::move(Bounds), SeedAlignSpace); 2846 2847 assert(AlignSpace && "alignPwAffs did not initialise AlignSpace"); 2848 2849 isl_pw_aff_list *BoundsList = 2850 createPwAffList(S->getIslCtx(), std::move(AlignedBounds)); 2851 2852 isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent); 2853 BoundsSpace = isl_space_align_params(BoundsSpace, AlignSpace); 2854 2855 assert(BoundsSpace && "Unable to access space of array."); 2856 assert(BoundsList && "Unable to access list of bounds."); 2857 2858 PPCGArray.bound = 2859 isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList); 2860 assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly."); 2861 } 2862 2863 /// Create the arrays for @p PPCGProg. 2864 /// 2865 /// @param PPCGProg The program to compute the arrays for. 2866 void createArrays(gpu_prog *PPCGProg, 2867 const SmallVector<ScopArrayInfo *, 4> &ValidSAIs) { 2868 int i = 0; 2869 for (auto &Array : ValidSAIs) { 2870 std::string TypeName; 2871 raw_string_ostream OS(TypeName); 2872 2873 OS << *Array->getElementType(); 2874 TypeName = OS.str(); 2875 2876 gpu_array_info &PPCGArray = PPCGProg->array[i]; 2877 2878 PPCGArray.space = Array->getSpace().release(); 2879 PPCGArray.type = strdup(TypeName.c_str()); 2880 PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8; 2881 PPCGArray.name = strdup(Array->getName().c_str()); 2882 PPCGArray.extent = nullptr; 2883 PPCGArray.n_index = Array->getNumberOfDimensions(); 2884 PPCGArray.extent = getExtent(Array); 2885 PPCGArray.n_ref = 0; 2886 PPCGArray.refs = nullptr; 2887 PPCGArray.accessed = true; 2888 PPCGArray.read_only_scalar = 2889 Array->isReadOnly() && Array->getNumberOfDimensions() == 0; 2890 PPCGArray.has_compound_element = false; 2891 PPCGArray.local = false; 2892 PPCGArray.declare_local = false; 2893 PPCGArray.global = false; 2894 PPCGArray.linearize = false; 2895 PPCGArray.dep_order = nullptr; 2896 PPCGArray.user = Array; 2897 2898 PPCGArray.bound = nullptr; 2899 setArrayBounds(PPCGArray, Array); 2900 i++; 2901 2902 collect_references(PPCGProg, &PPCGArray); 2903 } 2904 } 2905 2906 /// Create an identity map between the arrays in the scop. 2907 /// 2908 /// @returns An identity map between the arrays in the scop. 2909 isl_union_map *getArrayIdentity() { 2910 isl_union_map *Maps = isl_union_map_empty(S->getParamSpace()); 2911 2912 for (auto &Array : S->arrays()) { 2913 isl_space *Space = Array->getSpace().release(); 2914 Space = isl_space_map_from_set(Space); 2915 isl_map *Identity = isl_map_identity(Space); 2916 Maps = isl_union_map_add_map(Maps, Identity); 2917 } 2918 2919 return Maps; 2920 } 2921 2922 /// Create a default-initialized PPCG GPU program. 2923 /// 2924 /// @returns A new gpu program description. 2925 gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { 2926 2927 if (!PPCGScop) 2928 return nullptr; 2929 2930 auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog); 2931 2932 PPCGProg->ctx = S->getIslCtx(); 2933 PPCGProg->scop = PPCGScop; 2934 PPCGProg->context = isl_set_copy(PPCGScop->context); 2935 PPCGProg->read = isl_union_map_copy(PPCGScop->reads); 2936 PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); 2937 PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); 2938 PPCGProg->tagged_must_kill = 2939 isl_union_map_copy(PPCGScop->tagged_must_kills); 2940 PPCGProg->to_inner = getArrayIdentity(); 2941 PPCGProg->to_outer = getArrayIdentity(); 2942 // TODO: verify that this assignment is correct. 2943 PPCGProg->any_to_outer = nullptr; 2944 2945 // this needs to be set when live range reordering is enabled. 2946 // NOTE: I believe that is conservatively correct. I'm not sure 2947 // what the semantics of this is. 2948 // Quoting PPCG/gpu.h: "Order dependences on non-scalars." 2949 PPCGProg->array_order = 2950 isl_union_map_empty(isl_set_get_space(PPCGScop->context)); 2951 PPCGProg->n_stmts = std::distance(S->begin(), S->end()); 2952 PPCGProg->stmts = getStatements(); 2953 2954 // Only consider arrays that have a non-empty extent. 2955 // Otherwise, this will cause us to consider the following kinds of 2956 // empty arrays: 2957 // 1. Invariant loads that are represented by SAI objects. 2958 // 2. Arrays with statically known zero size. 2959 auto ValidSAIsRange = 2960 make_filter_range(S->arrays(), [this](ScopArrayInfo *SAI) -> bool { 2961 return !isl::manage(getExtent(SAI)).is_empty(); 2962 }); 2963 SmallVector<ScopArrayInfo *, 4> ValidSAIs(ValidSAIsRange.begin(), 2964 ValidSAIsRange.end()); 2965 2966 PPCGProg->n_array = 2967 ValidSAIs.size(); // std::distance(S->array_begin(), S->array_end()); 2968 PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info, 2969 PPCGProg->n_array); 2970 2971 createArrays(PPCGProg, ValidSAIs); 2972 2973 PPCGProg->may_persist = compute_may_persist(PPCGProg); 2974 return PPCGProg; 2975 } 2976 2977 struct PrintGPUUserData { 2978 struct cuda_info *CudaInfo; 2979 struct gpu_prog *PPCGProg; 2980 std::vector<ppcg_kernel *> Kernels; 2981 }; 2982 2983 /// Print a user statement node in the host code. 2984 /// 2985 /// We use ppcg's printing facilities to print the actual statement and 2986 /// additionally build up a list of all kernels that are encountered in the 2987 /// host ast. 2988 /// 2989 /// @param P The printer to print to 2990 /// @param Options The printing options to use 2991 /// @param Node The node to print 2992 /// @param User A user pointer to carry additional data. This pointer is 2993 /// expected to be of type PrintGPUUserData. 2994 /// 2995 /// @returns A printer to which the output has been printed. 2996 static __isl_give isl_printer * 2997 printHostUser(__isl_take isl_printer *P, 2998 __isl_take isl_ast_print_options *Options, 2999 __isl_take isl_ast_node *Node, void *User) { 3000 auto Data = (struct PrintGPUUserData *)User; 3001 auto Id = isl_ast_node_get_annotation(Node); 3002 3003 if (Id) { 3004 bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 3005 3006 // If this is a user statement, format it ourselves as ppcg would 3007 // otherwise try to call pet functionality that is not available in 3008 // Polly. 3009 if (IsUser) { 3010 P = isl_printer_start_line(P); 3011 P = isl_printer_print_ast_node(P, Node); 3012 P = isl_printer_end_line(P); 3013 isl_id_free(Id); 3014 isl_ast_print_options_free(Options); 3015 return P; 3016 } 3017 3018 auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 3019 isl_id_free(Id); 3020 Data->Kernels.push_back(Kernel); 3021 } 3022 3023 return print_host_user(P, Options, Node, User); 3024 } 3025 3026 /// Print C code corresponding to the control flow in @p Kernel. 3027 /// 3028 /// @param Kernel The kernel to print 3029 void printKernel(ppcg_kernel *Kernel) { 3030 auto *P = isl_printer_to_str(S->getIslCtx()); 3031 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 3032 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 3033 P = isl_ast_node_print(Kernel->tree, P, Options); 3034 char *String = isl_printer_get_str(P); 3035 printf("%s\n", String); 3036 free(String); 3037 isl_printer_free(P); 3038 } 3039 3040 /// Print C code corresponding to the GPU code described by @p Tree. 3041 /// 3042 /// @param Tree An AST describing GPU code 3043 /// @param PPCGProg The PPCG program from which @Tree has been constructed. 3044 void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 3045 auto *P = isl_printer_to_str(S->getIslCtx()); 3046 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 3047 3048 PrintGPUUserData Data; 3049 Data.PPCGProg = PPCGProg; 3050 3051 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 3052 Options = 3053 isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 3054 P = isl_ast_node_print(Tree, P, Options); 3055 char *String = isl_printer_get_str(P); 3056 printf("# host\n"); 3057 printf("%s\n", String); 3058 free(String); 3059 isl_printer_free(P); 3060 3061 for (auto Kernel : Data.Kernels) { 3062 printf("# kernel%d\n", Kernel->id); 3063 printKernel(Kernel); 3064 } 3065 } 3066 3067 // Generate a GPU program using PPCG. 3068 // 3069 // GPU mapping consists of multiple steps: 3070 // 3071 // 1) Compute new schedule for the program. 3072 // 2) Map schedule to GPU (TODO) 3073 // 3) Generate code for new schedule (TODO) 3074 // 3075 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 3076 // is mostly CPU specific. Instead, we use PPCG's GPU code generation 3077 // strategy directly from this pass. 3078 gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 3079 3080 auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 3081 3082 PPCGGen->ctx = S->getIslCtx(); 3083 PPCGGen->options = PPCGScop->options; 3084 PPCGGen->print = nullptr; 3085 PPCGGen->print_user = nullptr; 3086 PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 3087 PPCGGen->prog = PPCGProg; 3088 PPCGGen->tree = nullptr; 3089 PPCGGen->types.n = 0; 3090 PPCGGen->types.name = nullptr; 3091 PPCGGen->sizes = nullptr; 3092 PPCGGen->used_sizes = nullptr; 3093 PPCGGen->kernel_id = 0; 3094 3095 // Set scheduling strategy to same strategy PPCG is using. 3096 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 3097 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 3098 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 3099 3100 isl_schedule *Schedule = get_schedule(PPCGGen); 3101 3102 int has_permutable = has_any_permutable_node(Schedule); 3103 3104 Schedule = 3105 isl_schedule_align_params(Schedule, S->getFullParamSpace().release()); 3106 3107 if (!has_permutable || has_permutable < 0) { 3108 Schedule = isl_schedule_free(Schedule); 3109 DEBUG(dbgs() << getUniqueScopName(S) 3110 << " does not have permutable bands. Bailing out\n";); 3111 } else { 3112 Schedule = map_to_device(PPCGGen, Schedule); 3113 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 3114 } 3115 3116 if (DumpSchedule) { 3117 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 3118 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 3119 P = isl_printer_print_str(P, "Schedule\n"); 3120 P = isl_printer_print_str(P, "========\n"); 3121 if (Schedule) 3122 P = isl_printer_print_schedule(P, Schedule); 3123 else 3124 P = isl_printer_print_str(P, "No schedule found\n"); 3125 3126 printf("%s\n", isl_printer_get_str(P)); 3127 isl_printer_free(P); 3128 } 3129 3130 if (DumpCode) { 3131 printf("Code\n"); 3132 printf("====\n"); 3133 if (PPCGGen->tree) 3134 printGPUTree(PPCGGen->tree, PPCGProg); 3135 else 3136 printf("No code generated\n"); 3137 } 3138 3139 isl_schedule_free(Schedule); 3140 3141 return PPCGGen; 3142 } 3143 3144 /// Free gpu_gen structure. 3145 /// 3146 /// @param PPCGGen The ppcg_gen object to free. 3147 void freePPCGGen(gpu_gen *PPCGGen) { 3148 isl_ast_node_free(PPCGGen->tree); 3149 isl_union_map_free(PPCGGen->sizes); 3150 isl_union_map_free(PPCGGen->used_sizes); 3151 free(PPCGGen); 3152 } 3153 3154 /// Free the options in the ppcg scop structure. 3155 /// 3156 /// ppcg is not freeing these options for us. To avoid leaks we do this 3157 /// ourselves. 3158 /// 3159 /// @param PPCGScop The scop referencing the options to free. 3160 void freeOptions(ppcg_scop *PPCGScop) { 3161 free(PPCGScop->options->debug); 3162 PPCGScop->options->debug = nullptr; 3163 free(PPCGScop->options); 3164 PPCGScop->options = nullptr; 3165 } 3166 3167 /// Approximate the number of points in the set. 3168 /// 3169 /// This function returns an ast expression that overapproximates the number 3170 /// of points in an isl set through the rectangular hull surrounding this set. 3171 /// 3172 /// @param Set The set to count. 3173 /// @param Build The isl ast build object to use for creating the ast 3174 /// expression. 3175 /// 3176 /// @returns An approximation of the number of points in the set. 3177 __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set, 3178 __isl_keep isl_ast_build *Build) { 3179 3180 isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1); 3181 auto *Expr = isl_ast_expr_from_val(isl_val_copy(One)); 3182 3183 isl_space *Space = isl_set_get_space(Set); 3184 Space = isl_space_params(Space); 3185 auto *Univ = isl_set_universe(Space); 3186 isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One); 3187 3188 for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) { 3189 isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i); 3190 isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i); 3191 isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min); 3192 DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff)); 3193 auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize); 3194 Expr = isl_ast_expr_mul(Expr, DimSizeExpr); 3195 } 3196 3197 isl_set_free(Set); 3198 isl_pw_aff_free(OneAff); 3199 3200 return Expr; 3201 } 3202 3203 /// Approximate a number of dynamic instructions executed by a given 3204 /// statement. 3205 /// 3206 /// @param Stmt The statement for which to compute the number of dynamic 3207 /// instructions. 3208 /// @param Build The isl ast build object to use for creating the ast 3209 /// expression. 3210 /// @returns An approximation of the number of dynamic instructions executed 3211 /// by @p Stmt. 3212 __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt, 3213 __isl_keep isl_ast_build *Build) { 3214 auto Iterations = approxPointsInSet(Stmt.getDomain(), Build); 3215 3216 long InstCount = 0; 3217 3218 if (Stmt.isBlockStmt()) { 3219 auto *BB = Stmt.getBasicBlock(); 3220 InstCount = std::distance(BB->begin(), BB->end()); 3221 } else { 3222 auto *R = Stmt.getRegion(); 3223 3224 for (auto *BB : R->blocks()) { 3225 InstCount += std::distance(BB->begin(), BB->end()); 3226 } 3227 } 3228 3229 isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount); 3230 auto *InstExpr = isl_ast_expr_from_val(InstVal); 3231 return isl_ast_expr_mul(InstExpr, Iterations); 3232 } 3233 3234 /// Approximate dynamic instructions executed in scop. 3235 /// 3236 /// @param S The scop for which to approximate dynamic instructions. 3237 /// @param Build The isl ast build object to use for creating the ast 3238 /// expression. 3239 /// @returns An approximation of the number of dynamic instructions executed 3240 /// in @p S. 3241 __isl_give isl_ast_expr * 3242 getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) { 3243 isl_ast_expr *Instructions; 3244 3245 isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0); 3246 Instructions = isl_ast_expr_from_val(Zero); 3247 3248 for (ScopStmt &Stmt : S) { 3249 isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build); 3250 Instructions = isl_ast_expr_add(Instructions, StmtInstructions); 3251 } 3252 return Instructions; 3253 } 3254 3255 /// Create a check that ensures sufficient compute in scop. 3256 /// 3257 /// @param S The scop for which to ensure sufficient compute. 3258 /// @param Build The isl ast build object to use for creating the ast 3259 /// expression. 3260 /// @returns An expression that evaluates to TRUE in case of sufficient 3261 /// compute and to FALSE, otherwise. 3262 __isl_give isl_ast_expr * 3263 createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) { 3264 auto Iterations = getNumberOfIterations(S, Build); 3265 auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute); 3266 auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal); 3267 return isl_ast_expr_ge(Iterations, MinComputeExpr); 3268 } 3269 3270 /// Check if the basic block contains a function we cannot codegen for GPU 3271 /// kernels. 3272 /// 3273 /// If this basic block does something with a `Function` other than calling 3274 /// a function that we support in a kernel, return true. 3275 bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB, 3276 bool AllowCUDALibDevice) { 3277 for (const Instruction &Inst : *BB) { 3278 const CallInst *Call = dyn_cast<CallInst>(&Inst); 3279 if (Call && isValidFunctionInKernel(Call->getCalledFunction(), 3280 AllowCUDALibDevice)) { 3281 continue; 3282 } 3283 3284 for (Value *SrcVal : Inst.operands()) { 3285 PointerType *p = dyn_cast<PointerType>(SrcVal->getType()); 3286 if (!p) 3287 continue; 3288 if (isa<FunctionType>(p->getElementType())) 3289 return true; 3290 } 3291 } 3292 return false; 3293 } 3294 3295 /// Return whether the Scop S uses functions in a way that we do not support. 3296 bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) { 3297 for (auto &Stmt : S) { 3298 if (Stmt.isBlockStmt()) { 3299 if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(), 3300 AllowCUDALibDevice)) 3301 return true; 3302 } else { 3303 assert(Stmt.isRegionStmt() && 3304 "Stmt was neither block nor region statement"); 3305 for (const BasicBlock *BB : Stmt.getRegion()->blocks()) 3306 if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice)) 3307 return true; 3308 } 3309 } 3310 return false; 3311 } 3312 3313 /// Generate code for a given GPU AST described by @p Root. 3314 /// 3315 /// @param Root An isl_ast_node pointing to the root of the GPU AST. 3316 /// @param Prog The GPU Program to generate code for. 3317 void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { 3318 ScopAnnotator Annotator; 3319 Annotator.buildAliasScopes(*S); 3320 3321 Region *R = &S->getRegion(); 3322 3323 simplifyRegion(R, DT, LI, RI); 3324 3325 BasicBlock *EnteringBB = R->getEnteringBlock(); 3326 3327 PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator); 3328 3329 // Only build the run-time condition and parameters _after_ having 3330 // introduced the conditional branch. This is important as the conditional 3331 // branch will guard the original scop from new induction variables that 3332 // the SCEVExpander may introduce while code generating the parameters and 3333 // which may introduce scalar dependences that prevent us from correctly 3334 // code generating this scop. 3335 BBPair StartExitBlocks; 3336 BranchInst *CondBr = nullptr; 3337 std::tie(StartExitBlocks, CondBr) = 3338 executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI); 3339 BasicBlock *StartBlock = std::get<0>(StartExitBlocks); 3340 3341 assert(CondBr && "CondBr not initialized by executeScopConditionally"); 3342 3343 GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S, 3344 StartBlock, Prog, Runtime, Architecture); 3345 3346 // TODO: Handle LICM 3347 auto SplitBlock = StartBlock->getSinglePredecessor(); 3348 Builder.SetInsertPoint(SplitBlock->getTerminator()); 3349 NodeBuilder.addParameters(S->getContext()); 3350 3351 isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx()); 3352 isl_ast_expr *Condition = IslAst::buildRunCondition(*S, Build); 3353 isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build); 3354 Condition = isl_ast_expr_and(Condition, SufficientCompute); 3355 isl_ast_build_free(Build); 3356 3357 // preload invariant loads. Note: This should happen before the RTC 3358 // because the RTC may depend on values that are invariant load hoisted. 3359 if (!NodeBuilder.preloadInvariantLoads()) 3360 report_fatal_error("preloading invariant loads failed in function: " + 3361 S->getFunction().getName() + 3362 " | Scop Region: " + S->getNameStr()); 3363 3364 Value *RTC = NodeBuilder.createRTC(Condition); 3365 Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC); 3366 3367 Builder.SetInsertPoint(&*StartBlock->begin()); 3368 3369 NodeBuilder.create(Root); 3370 3371 /// In case a sequential kernel has more surrounding loops as any parallel 3372 /// kernel, the SCoP is probably mostly sequential. Hence, there is no 3373 /// point in running it on a GPU. 3374 if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel) 3375 CondBr->setOperand(0, Builder.getFalse()); 3376 3377 if (!NodeBuilder.BuildSuccessful) 3378 CondBr->setOperand(0, Builder.getFalse()); 3379 } 3380 3381 bool runOnScop(Scop &CurrentScop) override { 3382 S = &CurrentScop; 3383 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 3384 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 3385 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 3386 DL = &S->getRegion().getEntry()->getModule()->getDataLayout(); 3387 RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); 3388 3389 // We currently do not support functions other than intrinsics inside 3390 // kernels, as code generation will need to offload function calls to the 3391 // kernel. This may lead to a kernel trying to call a function on the host. 3392 // This also allows us to prevent codegen from trying to take the 3393 // address of an intrinsic function to send to the kernel. 3394 if (containsInvalidKernelFunction(CurrentScop, 3395 Architecture == GPUArch::NVPTX64)) { 3396 DEBUG( 3397 dbgs() << getUniqueScopName(S) 3398 << " contains function which cannot be materialised in a GPU " 3399 "kernel. Bailing out.\n";); 3400 return false; 3401 } 3402 3403 auto PPCGScop = createPPCGScop(); 3404 auto PPCGProg = createPPCGProg(PPCGScop); 3405 auto PPCGGen = generateGPU(PPCGScop, PPCGProg); 3406 3407 if (PPCGGen->tree) { 3408 generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); 3409 CurrentScop.markAsToBeSkipped(); 3410 } else { 3411 DEBUG(dbgs() << getUniqueScopName(S) 3412 << " has empty PPCGGen->tree. Bailing out.\n"); 3413 } 3414 3415 freeOptions(PPCGScop); 3416 freePPCGGen(PPCGGen); 3417 gpu_prog_free(PPCGProg); 3418 ppcg_scop_free(PPCGScop); 3419 3420 return true; 3421 } 3422 3423 void printScop(raw_ostream &, Scop &) const override {} 3424 3425 void getAnalysisUsage(AnalysisUsage &AU) const override { 3426 AU.addRequired<DominatorTreeWrapperPass>(); 3427 AU.addRequired<RegionInfoPass>(); 3428 AU.addRequired<ScalarEvolutionWrapperPass>(); 3429 AU.addRequired<ScopDetectionWrapperPass>(); 3430 AU.addRequired<ScopInfoRegionPass>(); 3431 AU.addRequired<LoopInfoWrapperPass>(); 3432 3433 AU.addPreserved<AAResultsWrapperPass>(); 3434 AU.addPreserved<BasicAAWrapperPass>(); 3435 AU.addPreserved<LoopInfoWrapperPass>(); 3436 AU.addPreserved<DominatorTreeWrapperPass>(); 3437 AU.addPreserved<GlobalsAAWrapperPass>(); 3438 AU.addPreserved<ScopDetectionWrapperPass>(); 3439 AU.addPreserved<ScalarEvolutionWrapperPass>(); 3440 AU.addPreserved<SCEVAAWrapperPass>(); 3441 3442 // FIXME: We do not yet add regions for the newly generated code to the 3443 // region tree. 3444 AU.addPreserved<RegionInfoPass>(); 3445 AU.addPreserved<ScopInfoRegionPass>(); 3446 } 3447 }; 3448 } // namespace 3449 3450 char PPCGCodeGeneration::ID = 1; 3451 3452 Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) { 3453 PPCGCodeGeneration *generator = new PPCGCodeGeneration(); 3454 generator->Runtime = Runtime; 3455 generator->Architecture = Arch; 3456 return generator; 3457 } 3458 3459 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", 3460 "Polly - Apply PPCG translation to SCOP", false, false) 3461 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 3462 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 3463 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 3464 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); 3465 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); 3466 INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); 3467 INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", 3468 "Polly - Apply PPCG translation to SCOP", false, false) 3469