//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "isl/union_map.h"

extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));
static cl::opt<bool> SharedMemory("polly-acc-use-shared",
                                  cl::desc("Use shared memory"), cl::Hidden,
                                  cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));
static cl::opt<bool> PrivateMemory("polly-acc-use-private",
                                   cl::desc("Use private memory"), cl::Hidden,
                                   cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
                                   cl::desc("Generate Host kernel code assuming"
                                            " that all memory has been"
                                            " declared as managed memory"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool>
    FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure",
cl::desc("Fail and generate a backtrace if" 100 " verifyModule fails on the GPU " 101 " kernel module."), 102 cl::Hidden, cl::init(false), cl::ZeroOrMore, 103 cl::cat(PollyCategory)); 104 105 static cl::opt<std::string> 106 CudaVersion("polly-acc-cuda-version", 107 cl::desc("The CUDA version to compile for"), cl::Hidden, 108 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); 109 110 static cl::opt<int> 111 MinCompute("polly-acc-mincompute", 112 cl::desc("Minimal number of compute statements to run on GPU."), 113 cl::Hidden, cl::init(10 * 512 * 512)); 114 115 /// Used to store information PPCG wants for kills. This information is 116 /// used by live range reordering. 117 /// 118 /// @see computeLiveRangeReordering 119 /// @see GPUNodeBuilder::createPPCGScop 120 /// @see GPUNodeBuilder::createPPCGProg 121 struct MustKillsInfo { 122 /// Collection of all kill statements that will be sequenced at the end of 123 /// PPCGScop->schedule. 124 /// 125 /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set` 126 /// which merges schedules in *arbitrary* order. 127 /// (we don't care about the order of the kills anyway). 128 isl::schedule KillsSchedule; 129 /// Map from kill statement instances to scalars that need to be 130 /// killed. 131 /// 132 /// We currently only derive kill information for phi nodes, as phi nodes 133 /// allow us to easily derive kill information. PHI nodes are not alive 134 /// outside the scop and can consequently all be "killed". [params] -> { 135 /// [Stmt_phantom[] -> ref_phantom[]] -> phi_ref[] } 136 isl::union_map TaggedMustKills; 137 138 MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){}; 139 }; 140 141 /// Compute must-kills needed to enable live range reordering with PPCG. 142 /// 143 /// @params S The Scop to compute live range reordering information 144 /// @returns live range reordering information that can be used to setup 145 /// PPCG. 146 static MustKillsInfo computeMustKillsInfo(const Scop &S) { 147 const isl::space ParamSpace(isl::manage(S.getParamSpace())); 148 MustKillsInfo Info; 149 150 // 1. Collect phi nodes in scop. 151 SmallVector<isl::id, 4> KillMemIds; 152 for (ScopArrayInfo *SAI : S.arrays()) { 153 if (!SAI->isPHIKind()) 154 continue; 155 156 KillMemIds.push_back(isl::manage(SAI->getBasePtrId())); 157 } 158 159 Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace)); 160 161 // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the 162 // schedule: 163 // - filter: "[control] -> { }" 164 // So, we choose to not create this to keep the output a little nicer, 165 // at the cost of some code complexity. 166 Info.KillsSchedule = nullptr; 167 168 for (isl::id &phiId : KillMemIds) { 169 isl::id KillStmtId = isl::id::alloc( 170 S.getIslCtx(), std::string("SKill_phantom_").append(phiId.get_name()), 171 nullptr); 172 173 // NOTE: construction of tagged_must_kill: 174 // 2. We need to construct a map: 175 // [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> phi_ref } 176 // To construct this, we use `isl_map_domain_product` on 2 maps`: 177 // 2a. StmtToPhi: 178 // [param] -> { Stmt_phantom[] -> phi_ref[] } 179 // 2b. PhantomRefToPhi: 180 // [param] -> { ref_phantom[] -> phi_ref[] } 181 // 182 // Combining these with `isl_map_domain_product` gives us 183 // TaggedMustKill: 184 // [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] } 185 186 // 2a. 
    isl::map StmtToPhi = isl::map::universe(isl::space(ParamSpace));
    StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::in, isl::id(KillStmtId));
    StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::out, isl::id(phiId));

    isl::id PhantomRefId = isl::id::alloc(
        S.getIslCtx(), std::string("ref_phantom") + phiId.get_name(), nullptr);

    // 2b. [param] -> { phantom_ref[] -> memref[] }
    isl::map PhantomRefToPhi = isl::map::universe(isl::space(ParamSpace));
    PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::in, PhantomRefId);
    PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::out, phiId);

    // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] }
    isl::map TaggedMustKill = StmtToPhi.domain_product(PhantomRefToPhi);
    Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);

    // 3. Create the kill schedule of the form:
    //        "[param] -> { Stmt_phantom[] }"
    //    Then add this to Info.KillsSchedule.
    isl::space KillStmtSpace = ParamSpace;
    KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId);
    isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace);

    isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain);
    if (Info.KillsSchedule)
      Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule);
    else
      Info.KillsSchedule = KillSchedule;
  }

  return Info;
}

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback to generate the ast expressions for each
/// of the scheduled ScopStmts.
static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
    void *StmtT, isl_ast_build *Build,
    isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                       isl_id *Id, void *User),
    void *UserIndex,
    isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
    void *UserExpr) {

  ScopStmt *Stmt = (ScopStmt *)StmtT;

  isl_ctx *Ctx;

  if (!Stmt || !Build)
    return NULL;

  Ctx = isl_ast_build_get_ctx(Build);
  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);

  for (MemoryAccess *Acc : *Stmt) {
    isl_map *AddrFunc = Acc->getAddressFunction();
    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
    isl_id *RefId = Acc->getId();
    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
    MPA = isl_multi_pw_aff_coalesce(MPA);
    MPA = FunctionIndex(MPA, RefId, UserIndex);
    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
    Access = FunctionExpr(Access, RefId, UserExpr);
    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
  }

  return RefToExpr;
}

/// Given an LLVM Type, compute its size in bytes.
static int computeSizeInBytes(const Type *T) {
  int bytes = T->getPrimitiveSizeInBits() / 8;
  if (bytes == 0)
    bytes = T->getScalarSizeInBits() / 8;
  return bytes;
}

/// Generate code for a GPU specific isl AST.
///
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
/// generates code for general-purpose AST nodes, with special functionality
/// for generating GPU specific user nodes.
///
/// @see GPUNodeBuilder::createUser
class GPUNodeBuilder : public IslNodeBuilder {
public:
  GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
                 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                 DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
                 gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
      : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
        Prog(Prog), Runtime(Runtime), Arch(Arch) {
    getExprBuilder().setIDToSAI(&IDToSAI);
  }

  /// Create after-run-time-check initialization code.
  void initializeAfterRTH();

  /// Finalize the generated scop.
  virtual void finalize();

  /// Track if the full build process was successful.
  ///
  /// This value is set to false, if throughout the build process an error
  /// occurred which prevents us from generating valid GPU code.
  bool BuildSuccessful = true;

  /// The maximal number of loops surrounding a sequential kernel.
  unsigned DeepestSequential = 0;

  /// The maximal number of loops surrounding a parallel kernel.
  unsigned DeepestParallel = 0;

private:
  /// A vector of array base pointers for which a new ScopArrayInfo was created.
  ///
  /// This vector is used to delete the ScopArrayInfo when it is not needed any
  /// more.
  std::vector<Value *> LocalArrays;

  /// A map from ScopArrays to their corresponding device allocations.
  std::map<ScopArrayInfo *, Value *> DeviceAllocations;

  /// The current GPU context.
  Value *GPUContext;

  /// The set of isl_ids allocated in the kernel
  std::vector<isl_id *> KernelIds;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// The GPU Runtime implementation to use (OpenCL or CUDA).
  GPURuntime Runtime;

  /// The GPU Architecture to target.
  GPUArch Arch;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  ///   - ScopStmt:      A computational statement (TODO)
  ///   - Kernel:        A GPU kernel call (TODO)
  ///   - Data-Transfer: A GPU <-> CPU data-transfer
  ///   - In-kernel synchronization
  ///   - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };

  /// Create code for a data transfer statement
  ///
  /// @param TransferStmt The data transfer statement.
  /// @param Direction    The direction in which to transfer data.
  void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                          enum DataDirection Direction);

  /// Find llvm::Values referenced in GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values
  ///
  /// @returns A pair, whose first element contains the set of values
  ///          referenced by the kernel, and whose second element contains the
  ///          set of functions referenced by the kernel. All functions in the
  ///          second set satisfy isValidFunctionInKernel.
  std::pair<SetVector<Value *>, SetVector<Function *>>
  getReferencesInKernel(ppcg_kernel *Kernel);

  /// Compute the sizes of the execution grid for a given kernel.
  ///
  /// @param Kernel The kernel to compute grid sizes for.
  ///
  /// @returns A tuple with grid sizes for the X and Y dimensions.
  std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);

  /// Creates an array that can be sent to the kernel on the device using a
  /// host pointer. This is required for managed memory, when we directly send
  /// host pointers to the device.
  /// \note
  /// This is to be used only with managed memory
  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
                                       ScopArrayInfo *ArrayInfo);

  /// Compute the sizes of the thread blocks for a given kernel.
  ///
  /// @param Kernel The kernel to compute thread block sizes for.
  ///
  /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
  std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);

  /// Store a specific kernel launch parameter in the array of kernel launch
  /// parameters.
  ///
  /// @param Parameters The list of parameters in which to store.
  /// @param Param      The kernel launch parameter to store.
  /// @param Index      The index in the parameter list, at which to store the
  ///                   parameter.
  void insertStoreParameter(Instruction *Parameters, Instruction *Param,
                            int Index);

  /// Create kernel launch parameters.
  ///
  /// @param Kernel        The kernel to create parameters for.
  /// @param F             The kernel function that has been created.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns A stack allocated array with pointers to the parameter
  ///          values that are passed to the kernel.
  Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                SetVector<Value *> SubtreeValues);

  /// Create declarations for kernel variables.
  ///
  /// This includes shared memory declarations.
  ///
  /// @param Kernel The kernel definition to create variables for.
  /// @param FN     The function into which to generate the variables.
  void createKernelVariables(ppcg_kernel *Kernel, Function *FN);

  /// Add CUDA annotations to module.
  ///
  /// Add a set of CUDA annotations that declares the maximal block dimensions
  /// that will be used to execute the CUDA kernel. This allows the NVIDIA
  /// PTX compiler to bound the number of allocated registers to ensure the
  /// resulting kernel is known to run with up to as many block dimensions
  /// as specified here.
  ///
  /// @param M         The module to add the annotations to.
  /// @param BlockDimX The size of block dimension X.
  /// @param BlockDimY The size of block dimension Y.
  /// @param BlockDimZ The size of block dimension Z.
  void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
                          Value *BlockDimZ);

  /// Create GPU kernel.
  ///
  /// Code generate the kernel described by @p KernelStmt.
  ///
  /// @param KernelStmt The ast node to generate kernel code for.
  void createKernel(__isl_take isl_ast_node *KernelStmt);

  /// Generate code that computes the size of an array.
  ///
  /// @param Array The array for which to compute a size.
  Value *getArraySize(gpu_array_info *Array);

  /// Generate code to compute the minimal offset at which an array is accessed.
  ///
  /// The offset of an array is the minimal array location accessed in a scop.
  ///
  /// Example:
  ///
  ///   for (long i = 0; i < 100; i++)
  ///     A[i + 42] += ...
  ///
  ///   getArrayOffset(A) results in 42.
  ///
  /// @param Array The array for which to compute the offset.
  /// @returns An llvm::Value that contains the offset of the array.
  Value *getArrayOffset(gpu_array_info *Array);

  /// Prepare the kernel arguments for kernel code generation
  ///
  /// @param Kernel The kernel to generate code for.
  /// @param FN     The function created for the kernel.
  void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);

  /// Create kernel function.
  ///
  /// Create a kernel function located in a newly created module that can serve
  /// as target for device code generation. Set the Builder to point to the
  /// start block of this newly created function.
  ///
  /// @param Kernel           The kernel to generate code for.
  /// @param SubtreeValues    The set of llvm::Values referenced by this kernel.
  /// @param SubtreeFunctions The set of llvm::Functions referenced by this
  ///                         kernel.
  void createKernelFunction(ppcg_kernel *Kernel,
                            SetVector<Value *> &SubtreeValues,
                            SetVector<Function *> &SubtreeFunctions);

  /// Create the declaration of a kernel function.
  ///
  /// The kernel function takes as arguments:
  ///
  ///   - One i8 pointer for each external array reference used in the kernel.
  ///   - Host iterators
  ///   - Parameters
  ///   - Other LLVM Value references (TODO)
  ///
  /// @param Kernel        The kernel to generate the function declaration for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns The newly declared function.
  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
                                     SetVector<Value *> &SubtreeValues);

  /// Insert intrinsic functions to obtain thread and block ids.
  ///
  /// @param Kernel The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Setup the creation of functions referenced by the GPU kernel.
  ///
  /// 1. Create new function declarations in GPUModule which are the same as
  ///    SubtreeFunctions.
  ///
  /// 2. Populate IslNodeBuilder::ValueMap with mappings from
  ///    old functions (that come from the original module) to new functions
  ///    (that are created within GPUModule). That way, we generate references
  ///    to the correct function (in GPUModule) in BlockGenerator.
  ///
  /// @see IslNodeBuilder::ValueMap
  /// @see BlockGenerator::GlobalMap
  /// @see BlockGenerator::getNewValue
  /// @see GPUNodeBuilder::getReferencesInKernel.
  ///
  /// @param SubtreeFunctions The set of llvm::Functions referenced by
  ///                         this kernel.
  void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);

  /// Create a global-to-shared or shared-to-global copy statement.
  ///
  /// @param CopyStmt The copy statement to generate code for
  void createKernelCopy(ppcg_kernel_stmt *CopyStmt);

  /// Create code for a ScopStmt called in @p Expr.
  ///
  /// @param Expr       The expression containing the call.
  /// @param KernelStmt The kernel statement referenced in the call.
  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);

  /// Create an in-kernel synchronization call.
  void createKernelSync();

  /// Create a PTX assembly string for the current GPU kernel.
  ///
  /// @returns A string containing the corresponding PTX assembly code.
  std::string createKernelASM();

  /// Remove references from the dominator tree to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearDominators(Function *F);

  /// Remove references from scalar evolution to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearScalarEvolution(Function *F);

  /// Remove references from loop info to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearLoops(Function *F);

  /// Finalize the generation of the kernel function.
  ///
  /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
  /// dump its IR to stderr.
  ///
  /// @returns The Assembly string of the kernel.
  std::string finalizeKernelFunction();

  /// Finalize the generation of the kernel arguments.
  ///
  /// This function ensures that not-read-only scalars used in a kernel are
  /// stored back to the global memory location they are backed with before
  /// the kernel terminates.
  ///
  /// @param Kernel The kernel to finalize kernel arguments for.
  void finalizeKernelArguments(ppcg_kernel *Kernel);

  /// Create code that allocates memory to store arrays on device.
  void allocateDeviceArrays();

  /// Free all allocated device arrays.
  void freeDeviceArrays();

  /// Create a call to initialize the GPU context.
  ///
  /// @returns A pointer to the newly initialized context.
  Value *createCallInitContext();

  /// Create a call to get the device pointer for a kernel allocation.
  ///
  /// @param Allocation The Polly GPU allocation
  ///
  /// @returns The device parameter corresponding to this allocation.
  Value *createCallGetDevicePtr(Value *Allocation);

  /// Create a call to free the GPU context.
  ///
  /// @param Context A pointer to an initialized GPU context.
  void createCallFreeContext(Value *Context);

  /// Create a call to allocate memory on the device.
  ///
  /// @param Size The size of memory to allocate
  ///
  /// @returns A pointer that identifies this allocation.
  Value *createCallAllocateMemoryForDevice(Value *Size);

  /// Create a call to free a device array.
  ///
  /// @param Array The device array to free.
  void createCallFreeDeviceMemory(Value *Array);

  /// Create a call to copy data from host to device.
  ///
  /// @param HostPtr   A pointer to the host data that should be copied.
  /// @param DevicePtr A device pointer specifying the location to copy to.
  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
                                      Value *Size);

  /// Create a call to copy data from device to host.
  ///
  /// @param DevicePtr A pointer to the device data that should be copied.
  /// @param HostPtr   A host pointer specifying the location to copy to.
  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                      Value *Size);

  /// Create a call to synchronize Host & Device.
  /// \note
  /// This is to be used only with managed memory.
  void createCallSynchronizeDevice();

  /// Create a call to get a kernel from an assembly string.
  ///
  /// @param Buffer The string describing the kernel.
  /// @param Entry  The name of the kernel function to call.
  ///
  /// @returns A pointer to a kernel object
  Value *createCallGetKernel(Value *Buffer, Value *Entry);

  /// Create a call to free a GPU kernel.
  ///
  /// @param GPUKernel The kernel to free.
  void createCallFreeKernel(Value *GPUKernel);

  /// Create a call to launch a GPU kernel.
  ///
  /// @param GPUKernel  The kernel to launch.
  /// @param GridDimX   The size of the first grid dimension.
  /// @param GridDimY   The size of the second grid dimension.
  /// @param BlockDimX  The size of the first block dimension.
  /// @param BlockDimY  The size of the second block dimension.
  /// @param BlockDimZ  The size of the third block dimension.
  /// @param Parameters A pointer to an array that itself contains pointers to
  ///                   the parameter values passed for each kernel argument.
  void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                              Value *GridDimY, Value *BlockDimX,
                              Value *BlockDimY, Value *BlockDimZ,
                              Value *Parameters);
};

void GPUNodeBuilder::initializeAfterRTH() {
  BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
                                 &*Builder.GetInsertPoint(), &DT, &LI);
  NewBB->setName("polly.acc.initialize");
  Builder.SetInsertPoint(&NewBB->front());

  GPUContext = createCallInitContext();

  if (!ManagedMemory)
    allocateDeviceArrays();
}

void GPUNodeBuilder::finalize() {
  if (!ManagedMemory)
    freeDeviceArrays();

  createCallFreeContext(GPUContext);
  IslNodeBuilder::finalize();
}

void GPUNodeBuilder::allocateDeviceArrays() {
  assert(!ManagedMemory && "Managed memory will directly send host pointers "
                           "to the kernel. There is no need for device arrays");
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  for (int i = 0; i < Prog->n_array; ++i) {
    gpu_array_info *Array = &Prog->array[i];
    auto *ScopArray = (ScopArrayInfo *)Array->user;
    std::string DevArrayName("p_dev_array_");
    DevArrayName.append(Array->name);

    Value *ArraySize = getArraySize(Array);
    Value *Offset = getArrayOffset(Array);
    if (Offset)
      ArraySize = Builder.CreateSub(
          ArraySize,
          Builder.CreateMul(Offset,
                            Builder.getInt64(ScopArray->getElemSizeInBytes())));
    Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
    DevArray->setName(DevArrayName);
    DeviceAllocations[ScopArray] = DevArray;
  }

  isl_ast_build_free(Build);
}

void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
                                        Value *BlockDimY, Value *BlockDimZ) {
  auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");

  for (auto &F : *M) {
    if (F.getCallingConv() != CallingConv::PTX_Kernel)
      continue;

    Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};

    Metadata *Elements[] = {
        ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
        ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
        ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
        ValueAsMetadata::get(V[2]),
    };
    MDNode *Node = MDNode::get(M->getContext(), Elements);
    AnnotationNode->addOperand(Node);
  }
}

void GPUNodeBuilder::freeDeviceArrays() {
  assert(!ManagedMemory && "Managed memory does not use device arrays");
  for (auto &Array : DeviceAllocations)
    createCallFreeDeviceMemory(Array.second);
}

Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *Name = "polly_getKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Buffer, Entry});
}

Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  const char *Name = "polly_getDevicePtr";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Allocation});
}

void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                                            Value *GridDimY, Value *BlockDimX,
                                            Value *BlockDimY, Value *BlockDimZ,
                                            Value *Parameters) {
  const char *Name = "polly_launchKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters});
}

void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel});
}

void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
                           "for device");
  const char *Name = "polly_freeDeviceMemory";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Array});
}

Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
                           "for device");
  const char *Name = "polly_allocateMemoryForDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Size});
}

void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                    Value *DeviceData,
                                                    Value *Size) {
  assert(!ManagedMemory && "Managed memory does not transfer memory between "
                           "device and host");
  const char *Name = "polly_copyFromHostToDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {HostData, DeviceData, Size});
}

void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                    Value *HostData,
                                                    Value *Size) {
  assert(!ManagedMemory && "Managed memory does not transfer memory between "
                           "device and host");
  const char *Name = "polly_copyFromDeviceToHost";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {DeviceData, HostData, Size});
}

void GPUNodeBuilder::createCallSynchronizeDevice() {
  assert(ManagedMemory && "explicit synchronization is only necessary for "
                          "managed memory");
  const char *Name = "polly_synchronizeDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F);
}

Value *GPUNodeBuilder::createCallInitContext() {
  const char *Name;

  switch (Runtime) {
  case GPURuntime::CUDA:
    Name = "polly_initContextCUDA";
    break;
  case GPURuntime::OpenCL:
    Name = "polly_initContextCL";
    break;
  }

  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {});
}

void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  const char *Name = "polly_freeContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Context});
}

/// Check if one string is a prefix of another.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
static bool isPrefix(std::string String, std::string Prefix) {
  return String.find(Prefix) == 0;
}

Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
  Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

  if (!gpu_array_is_scalar(Array)) {
    auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
    isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);

    for (unsigned int i = 1; i < Array->n_index; i++) {
      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
      isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
      Res = isl_ast_expr_mul(Res, Expr);
    }

    Value *NumElements = ExprBuilder.create(Res);
    if (NumElements->getType() != ArraySize->getType())
      NumElements = Builder.CreateSExt(NumElements, ArraySize->getType());
    ArraySize = Builder.CreateMul(ArraySize, NumElements);
  }
  isl_ast_build_free(Build);
  return ArraySize;
}

Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
  if (gpu_array_is_scalar(Array))
    return nullptr;

  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));

  isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));

  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
    ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);

  if (isl_set_is_subset(Min, ZeroSet)) {
    isl_set_free(Min);
    isl_set_free(ZeroSet);
    isl_ast_build_free(Build);
    return nullptr;
  }
  isl_set_free(ZeroSet);

  isl_ast_expr *Result =
      isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));

  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
    if (i > 0) {
      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
      isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
      Result = isl_ast_expr_mul(Result, BExpr);
    }
    isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
    isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
    Result = isl_ast_expr_add(Result, MExpr);
  }

  Value *ResultValue = ExprBuilder.create(Result);
  isl_set_free(Min);
  isl_ast_build_free(Build);

  return ResultValue;
}

Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
                                                     ScopArrayInfo *ArrayInfo) {

  assert(ManagedMemory && "Only used when you wish to get a host "
                          "pointer for sending data to the kernel, "
                          "with managed memory");
  std::map<ScopArrayInfo *, Value *>::iterator it;
  if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
    return it->second;
  } else {
    Value *HostPtr;

    if (gpu_array_is_scalar(Array))
      HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
    else
      HostPtr = ArrayInfo->getBasePtr();

    Value *Offset = getArrayOffset(Array);
    if (Offset) {
      HostPtr = Builder.CreatePointerCast(
          HostPtr, ArrayInfo->getElementType()->getPointerTo());
      HostPtr = Builder.CreateGEP(HostPtr, Offset);
    }

    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
    DeviceAllocations[ArrayInfo] = HostPtr;
    return HostPtr;
  }
}

void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                        enum DataDirection Direction) {
  assert(!ManagedMemory && "Managed memory needs no data transfers");
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
  isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(Arg);
  auto Array = (gpu_array_info *)isl_id_get_user(Id);
  auto ScopArray = (ScopArrayInfo *)(Array->user);

  Value *Size = getArraySize(Array);
  Value *Offset = getArrayOffset(Array);
  Value *DevPtr = DeviceAllocations[ScopArray];

  Value *HostPtr;

  if (gpu_array_is_scalar(Array))
    HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  else
    HostPtr = ScopArray->getBasePtr();

  if (Offset) {
    HostPtr = Builder.CreatePointerCast(
        HostPtr, ScopArray->getElementType()->getPointerTo());
    HostPtr = Builder.CreateGEP(HostPtr, Offset);
  }

  HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());

  if (Offset) {
    Size = Builder.CreateSub(
        Size, Builder.CreateMul(
                  Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
  }

  if (Direction == HOST_TO_DEVICE)
    createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
  else
    createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);

  isl_id_free(Id);
  isl_ast_expr_free(Arg);
  isl_ast_expr_free(Expr);
  isl_ast_node_free(TransferStmt);
}

void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  // Read the statement name before releasing our reference to the id.
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);

  if (!strcmp(Str, "kernel")) {
    createKernel(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "to_device")) {
    if (!ManagedMemory)
      createDataTransfer(UserStmt, HOST_TO_DEVICE);
    else
      isl_ast_node_free(UserStmt);

    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "from_device")) {
    if (!ManagedMemory) {
      createDataTransfer(UserStmt, DEVICE_TO_HOST);
    } else {
      createCallSynchronizeDevice();
      isl_ast_node_free(UserStmt);
    }
    isl_ast_expr_free(Expr);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    createKernelCopy(KernelStmt);
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
  return;
}

void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
  isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
  LocalIndex = isl_ast_expr_address_of(LocalIndex);
  Value *LocalAddr = ExprBuilder.create(LocalIndex);
  isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
  Index = isl_ast_expr_address_of(Index);
  Value *GlobalAddr = ExprBuilder.create(Index);

  if (KernelStmt->u.c.read) {
    LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read");
    Builder.CreateStore(Load, LocalAddr);
  } else {
    LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write");
    Builder.CreateStore(Load, GlobalAddr);
  }
}

void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LTS;
  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LTS);

  if (Stmt->isBlockStmt())
    BlockGen.copyStmt(*Stmt, LTS, Indexes);
  else
    RegionGen.copyStmt(*Stmt, LTS, Indexes);
}

void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  Function *Sync;

  switch (Arch) {
  case GPUArch::NVPTX64:
    Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
    break;
  }

  Builder.CreateCall(Sync, {});
}

/// Collect llvm::Values referenced from @p Node
///
/// This function only applies to isl_ast_nodes that are user_nodes referring
/// to a ScopStmt. All other node types are ignored.
///
/// @param Node The node to collect references for.
/// @param User A user pointer used as storage for the data that is collected.
///
/// @returns isl_bool_true if data could be collected successfully.
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);

  return isl_bool_true;
}

/// Check if F is a function that we can code-generate in a GPU kernel.
static bool isValidFunctionInKernel(llvm::Function *F) {
  assert(F && "F is an invalid pointer");
  // We string compare against the name of the function to allow
  // all variants of the intrinsic "llvm.sqrt.*".
  return F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
}

/// Do not take `Function` as a subtree value.
///
/// We try to take the reference of all subtree values and pass them along
/// to the kernel from the host. Taking an address of any function and
/// trying to pass it along is nonsensical. Only allow `Value`s that are not
/// `Function`s.
static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }

/// Return `Function`s from `RawSubtreeValues`.
static SetVector<Function *>
getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
  SetVector<Function *> SubtreeFunctions;
  for (Value *It : RawSubtreeValues) {
    Function *F = dyn_cast<Function>(It);
    if (F) {
      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
                                           "this point if an invalid function "
                                           "were present in a kernel.");
      SubtreeFunctions.insert(F);
    }
  }
  return SubtreeFunctions;
}

std::pair<SetVector<Value *>, SetVector<Function *>>
GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI->getBasePtr());

  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
  // SubtreeValues. This is important, because we should not lose any
  // SubtreeValues in the process of constructing the
  // "ValidSubtree{Values, Functions}" sets. Nor should the sets
  // ValidSubtree{Values, Functions} have any common element.
  auto ValidSubtreeValuesIt =
      make_filter_range(SubtreeValues, isValidSubtreeValue);
  SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
                                        ValidSubtreeValuesIt.end());
  SetVector<Function *> ValidSubtreeFunctions(
      getFunctionsFromRawSubtreeValues(SubtreeValues));

  return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
}

void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;
  isl_ast_build *Context = isl_ast_build_from_context(S.getContext());

  for (long i = 0; i < Kernel->n_grid; i++) {
    isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i);
    isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size);
    Value *Res = ExprBuilder.create(GridSize);
    Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
    Sizes.push_back(Res);
  }
  isl_ast_build_free(Context);

  for (long i = Kernel->n_grid; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1]);
}

std::tuple<Value *, Value *, Value *>
GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;

  for (long i = 0; i < Kernel->n_block; i++) {
    Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
    Sizes.push_back(Res);
  }

  for (long i = Kernel->n_block; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
}

void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters,
                                          Instruction *Param, int Index) {
  Value *Slot = Builder.CreateGEP(
      Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
  Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
  Builder.CreateStore(ParamTyped, Slot);
}

Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                       SetVector<Value *> SubtreeValues) {
  const int NumArgs = F->arg_size();
  std::vector<int> ArgSizes(NumArgs);

  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);

  BasicBlock *EntryBlock =
      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
  std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  Instruction *Parameters = new AllocaInst(
      ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());

  int Index = 0;
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);

    ArgSizes[Index] = SAI->getElemSizeInBytes();

    Value *DevArray = nullptr;
    if (ManagedMemory) {
      DevArray = getOrCreateManagedDeviceArray(
          &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
    } else {
      DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
      DevArray = createCallGetDevicePtr(DevArray);
    }
    assert(DevArray != nullptr && "Array to be offloaded to device not "
                                  "initialized");
    Value *Offset = getArrayOffset(&Prog->array[i]);

    if (Offset) {
      DevArray = Builder.CreatePointerCast(
          DevArray, SAI->getElementType()->getPointerTo());
      DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
      DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
    }
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Value *ValPtr = nullptr;
      if (ManagedMemory)
        ValPtr = DevArray;
      else
        ValPtr = BlockGen.getOrCreateAlloca(SAI);

      assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
                                  " to be stored into Parameters");
      Value *ValPtrCast =
          Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
      Builder.CreateStore(ValPtrCast, Slot);
    } else {
      Instruction *Param =
          new AllocaInst(Builder.getInt8PtrTy(), AddressSpace,
                         Launch + "_param_" + std::to_string(Index),
                         EntryBlock->getTerminator());
      Builder.CreateStore(DevArray, Param);
      Value *ParamTyped =
          Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
      Builder.CreateStore(ParamTyped, Slot);
    }
    Index++;
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);

    ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);

    ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  for (auto Val : SubtreeValues) {
    ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  for (int i = 0; i < NumArgs; i++) {
    Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]);
    Instruction *Param =
        new AllocaInst(Builder.getInt32Ty(), AddressSpace,
                       Launch + "_param_size_" + std::to_string(i),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  }

  auto Location = EntryBlock->getTerminator();
  return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
                         Launch + "_params_i8ptr", Location);
}

void GPUNodeBuilder::setupKernelSubtreeFunctions(
    SetVector<Function *> SubtreeFunctions) {
  for (auto Fn : SubtreeFunctions) {
    const std::string ClonedFnName = Fn->getName();
    Function *Clone = GPUModule->getFunction(ClonedFnName);
    if (!Clone)
      Clone =
          Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
                           ClonedFnName, GPUModule.get());
    assert(Clone && "Expected cloned function to be initialized.");
    assert(ValueMap.find(Fn) == ValueMap.end() &&
           "Fn already present in ValueMap");
    ValueMap[Fn] = Clone;
  }
}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  if (Kernel->n_grid > 1)
    DeepestParallel =
        std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set));
  else
    DeepestSequential =
        std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set));

  Value *BlockDimX, *BlockDimY, *BlockDimZ;
  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);

  SetVector<Value *> SubtreeValues;
  SetVector<Function *> SubtreeFunctions;
  std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;
  BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap;
  ScalarMap.clear();

  SetVector<const Loop *> Loops;

  // For all loops we depend on, create values that contain the current loop
  // iteration. These values are necessary to generate code for SCEVs that
  // depend on such loops. As a result we need to pass them to the subfunction.
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  if (Kernel->n_grid > 1)
    DeepestParallel =
        std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set));
  else
    DeepestSequential =
        std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set));

  Value *BlockDimX, *BlockDimY, *BlockDimZ;
  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);

  SetVector<Value *> SubtreeValues;
  SetVector<Function *> SubtreeFunctions;
  std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;
  BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap;
  ScalarMap.clear();

  SetVector<const Loop *> Loops;

  // For all loops we depend on, create values that contain the current loop
  // iteration. These values are necessary to generate code for SCEVs that
  // depend on such loops. As a result we need to pass them to the
  // subfunction.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
  setupKernelSubtreeFunctions(SubtreeFunctions);

  create(isl_ast_node_copy(Kernel->tree));

  finalizeKernelArguments(Kernel);
  Function *F = Builder.GetInsertBlock()->getParent();
  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  IDToValue = HostIDs;

  ValueMap = std::move(HostValueMap);
  ScalarMap = std::move(HostScalarMap);
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array);
  LocalArrays.clear();

  std::string ASMString = finalizeKernelFunction();
  Builder.SetInsertPoint(&HostInsertPoint);
  Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);

  std::string Name = "kernel_" + std::to_string(Kernel->id);
  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  Value *GPUKernel = createCallGetKernel(KernelString, NameString);

  Value *GridDimX, *GridDimY;
  std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);

  createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters);
  createCallFreeKernel(GPUKernel);

  for (auto Id : KernelIds)
    isl_id_free(Id);

  KernelIds.clear();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "";

  if (!is64Bit) {
    Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
           "64-v128:128:128-n16:32:64";
  } else {
    Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
           "64-v128:128:128-n16:32:64";
  }

  return Ret;
}

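/// Create the declaration of the kernel function in the GPU module.
///
/// The kernel arguments are, in order: one argument per accessed array (a
/// pointer into global memory, or the value itself for a read-only scalar),
/// one i64 per host loop iterator, one argument per referenced parameter, and
/// one argument per value referenced from the kernel subtree. The arguments
/// are named and registered so that the kernel body can refer to them.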
Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
      const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
      Args.push_back(SAI->getElementType());
    } else {
      static const int UseGlobalMemory = 1;
      Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
    }
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Args.push_back(Val->getType());
  }

  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());

  switch (Arch) {
  case GPUArch::NVPTX64:
    FN->setCallingConv(CallingConv::PTX_Kernel);
    break;
  }

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    Sizes.push_back(nullptr);
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAIRep;
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    Value *Val = IDToValue[Id];
    ValueMap[Val] = &*Arg;
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}

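/// Materialize the block and thread ids inside the kernel.
///
/// For each grid and block dimension of @p Kernel, this emits a call to the
/// corresponding NVVM id intrinsic, zero-extends the result to i64, and makes
/// it available under the isl_id ppcg uses for that dimension.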
void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[2];
  Intrinsic::ID IntrinsicsTID[3];

  switch (Arch) {
  case GPUArch::NVPTX64:
    IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
    IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;

    IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
    IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
    IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
    break;
  }

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}

void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    Value *Val = &*Arg;

    if (!gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Type *TypePtr = SAI->getElementType()->getPointerTo();
      Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr);
      Val = Builder.CreateLoad(TypedArgPtr);
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Builder.CreateStore(Val, Alloca);

    Arg++;
  }
}

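/// Store the final values of scalar kernel arguments back to global memory.
///
/// Scalars are modelled through allocas while the kernel body is generated.
/// At the end of the kernel, the value of each writable scalar is loaded from
/// its alloca and stored back through the corresponding argument pointer. If
/// such a store is needed in a kernel that is mapped to multiple blocks or
/// threads, code generation is marked as failed, as the value would then have
/// to be stored back before each kernel barrier instead.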
void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  auto *FN = Builder.GetInsertBlock()->getParent();
  auto Arg = FN->arg_begin();

  bool StoredScalar = false;
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
      Arg++;
      continue;
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Value *ArgPtr = &*Arg;
    Type *TypePtr = SAI->getElementType()->getPointerTo();
    Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
    Value *Val = Builder.CreateLoad(Alloca);
    Builder.CreateStore(Val, TypedArgPtr);
    StoredScalar = true;

    Arg++;
  }

  if (StoredScalar)
    // In case more than one thread contains scalar stores, the generated
    // code might be incorrect, if we only store at the end of the kernel.
    // To support this case we need to store these scalars back at each
    // memory store or at least before each kernel barrier.
    if (Kernel->n_block != 0 || Kernel->n_grid != 0)
      BuildSuccessful = 0;
}

void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  for (int i = 0; i < Kernel->n_var; ++i) {
    struct ppcg_kernel_var &Var = Kernel->var[i];
    isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
    Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType();

    Type *ArrayTy = EleTy;
    SmallVector<const SCEV *, 4> Sizes;

    Sizes.push_back(nullptr);
    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
    }

    for (int j = Var.array->n_index - 1; j >= 0; --j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      ArrayTy = ArrayType::get(ArrayTy, Bound);
    }

    const ScopArrayInfo *SAI;
    Value *Allocation;
    if (Var.type == ppcg_access_shared) {
      auto GlobalVar = new GlobalVariable(
          *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
          nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
      GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
      GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));

      Allocation = GlobalVar;
    } else if (Var.type == ppcg_access_private) {
      Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
    } else {
      llvm_unreachable("unknown variable type");
    }
    SAI =
        S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
    Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
    IDToValue[Id] = Allocation;
    LocalArrays.push_back(Allocation);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAI;
  }
}

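/// Create a new GPU module and the kernel function inside it.
///
/// This sets up the target triple and data layout for the requested GPU
/// architecture, creates the kernel declaration and its entry block, and
/// prepares arguments, kernel-local variables, and the block/thread id
/// intrinsics before the kernel body is generated.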
void GPUNodeBuilder::createKernelFunction(
    ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
    SetVector<Function *> &SubtreeFunctions) {
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));

  switch (Arch) {
  case GPUArch::NVPTX64:
    if (Runtime == GPURuntime::CUDA)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
    else if (Runtime == GPURuntime::OpenCL)
      GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
    GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
    break;
  }

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  ScopDetection::markFunctionAsInvalid(FN);

  prepareKernelArguments(Kernel, FN);
  createKernelVariables(Kernel, FN);
  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple;

  switch (Arch) {
  case GPUArch::NVPTX64:
    switch (Runtime) {
    case GPURuntime::CUDA:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
      break;
    case GPURuntime::OpenCL:
      GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
      break;
    }
    break;
  }

  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;

  std::string subtarget;

  switch (Arch) {
  case GPUArch::NVPTX64:
    subtarget = CudaVersion;
    break;
  }

  std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
      GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str();
}

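/// Verify, optimize, and compile the GPU module to assembly.
///
/// If the module fails LLVM's verifier, code generation is marked as failed
/// (or, with -polly-acc-fail-on-verify-module-failure, compilation aborts).
/// Otherwise the module is optimized at -O3 and lowered to target assembly,
/// which is returned as a string.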
std::string GPUNodeBuilder::finalizeKernelFunction() {

  if (verifyModule(*GPUModule)) {
    DEBUG(dbgs() << "verifyModule failed on module:\n";
          GPUModule->print(dbgs(), nullptr); dbgs() << "\n";);

    if (FailOnVerifyModuleFailure)
      llvm_unreachable("VerifyModule failed.");

    BuildSuccessful = false;
    return "";
  }

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  // Optimize module.
  llvm::legacy::PassManager OptPasses;
  PassManagerBuilder PassBuilder;
  PassBuilder.OptLevel = 3;
  PassBuilder.SizeLevel = 0;
  PassBuilder.populateModulePassManager(OptPasses);
  OptPasses.run(*GPUModule);

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  GPUModule.release();
  KernelIDs.clear();

  return Assembly;
}

namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  GPURuntime Runtime = GPURuntime::CUDA;

  GPUArch Architecture = GPUArch::NVPTX64;

  /// The scop that is currently processed.
  Scop *S;

  LoopInfo *LI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  const DataLayout *DL;
  RegionInfo *RI;

  PPCGCodeGeneration() : ScopPass(ID) {}

  /// Construct compilation options for PPCG.
  ///
  /// @returns The compilation options.
  ppcg_options *createPPCGOptions() {
    auto DebugOptions =
        (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
    auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));

    DebugOptions->dump_schedule_constraints = false;
    DebugOptions->dump_schedule = false;
    DebugOptions->dump_final_schedule = false;
    DebugOptions->dump_sizes = false;
    DebugOptions->verbose = false;

    Options->debug = DebugOptions;

    Options->reschedule = true;
    Options->scale_tile_loops = false;
    Options->wrap = false;

    Options->non_negative_parameters = false;
    Options->ctx = nullptr;
    Options->sizes = nullptr;

    Options->tile_size = 32;

    Options->use_private_memory = PrivateMemory;
    Options->use_shared_memory = SharedMemory;
    Options->max_shared_memory = 48 * 1024;

    Options->target = PPCG_TARGET_CUDA;
    Options->openmp = false;
    Options->linearize_device_arrays = true;
    Options->live_range_reordering = false;

    Options->opencl_compiler_options = nullptr;
    Options->opencl_use_gpu = false;
    Options->opencl_n_include_file = 0;
    Options->opencl_include_files = nullptr;
    Options->opencl_print_kernel_types = false;
    Options->opencl_embed_kernel_code = false;

    Options->save_schedule_file = nullptr;
    Options->load_schedule_file = nullptr;

    return Options;
  }

  /// Get a tagged access relation containing all accesses of type @p AccessTy.
  ///
  /// Instead of a normal access of the form:
  ///
  ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// a tagged access has the form
  ///
  ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// where 'id' is an additional space that references the memory access that
  /// triggered the access.
  ///
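  /// For example, a read of A[i+1] in a statement Stmt[i] is represented as
  ///
  ///   [Stmt[i] -> Ref[]] -> A[i+1]
  ///
  /// where 'Ref' stands for the isl_id of that particular memory access (the
  /// names in this example are purely illustrative).
  ///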
  /// @param AccessTy The type of the memory accesses to collect.
  ///
  /// @return The relation describing all tagged memory accesses.
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }

  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) writes, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }

  /// Get the set of all must writes, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation, each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns Return a map from collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop. From
  /// this initial data, the data-dependences in the PPCG scop are initialized.
  /// We do not use Polly's dependence analysis for now, to ensure we match
  /// the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();
    // Enable live range reordering.
    PPCGScop->options->live_range_reordering = 1;

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagger = nullptr;
    PPCGScop->independence =
        isl_union_map_empty(isl_set_get_space(PPCGScop->context));
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();

    MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
    // If we have something non-trivial to kill, add it to the schedule.
    if (KillsInfo.KillsSchedule.get())
      PPCGScop->schedule = isl_schedule_sequence(
          PPCGScop->schedule, KillsInfo.KillsSchedule.take());
    PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();

    PPCGScop->names = getNames();
    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = !Acc->isMayWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// @returns A linked-list of statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

  /// Derive the extent of an array.
  ///
  /// The extent of an array is the set of elements that lie within the
  /// accessed part of the array. The inner dimensions are bounded below by 0
  /// and above by the size of the corresponding array dimension. The first
  /// (outermost) dimension is bounded by the minimal and maximal subscript
  /// values observed in that dimension.
  ///
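  /// For example, for a two-dimensional array A accessed at subscripts
  /// 5 <= i <= n in the outer dimension and with an inner dimension of size
  /// 128, the derived extent is
  ///
  ///   { A[o0, o1] : 5 <= o0 <= n and 0 <= o1 < 128 }
  ///
  /// (an illustrative instance; the actual bounds are computed from the
  /// accesses in the scop and the sizes recorded in ScopArrayInfo).
  ///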
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    unsigned NumDims = Array->getNumberOfDimensions();
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    Accesses = isl_union_map_detect_equalities(Accesses);
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    AccessUSet = isl_union_set_coalesce(AccessUSet);
    AccessUSet = isl_union_set_detect_equalities(AccessUSet);
    AccessUSet = isl_union_set_coalesce(AccessUSet);

    if (isl_union_set_is_empty(AccessUSet)) {
      isl_union_set_free(AccessUSet);
      return isl_set_empty(Array->getSpace());
    }

    if (Array->getNumberOfDimensions() == 0) {
      isl_union_set_free(AccessUSet);
      return isl_set_universe(Array->getSpace());
    }

    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());

    isl_union_set_free(AccessUSet);
    isl_local_space *LS = isl_local_space_from_space(Array->getSpace());

    isl_pw_aff *Val =
        isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0));

    isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0);
    isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0);
    OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMin =
        isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId());
    OuterMax =
        isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId());

    isl_set *Extent = isl_set_universe(Array->getSpace());

    Extent = isl_set_intersect(
        Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val)));
    Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val));

    for (unsigned i = 1; i < NumDims; ++i)
      Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0);

    for (unsigned i = 0; i < NumDims; ++i) {
      isl_pw_aff *PwAff =
          const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i));

      // isl_pw_aff can be NULL for zero dimension. Only in the case of a
      // Fortran array will we have a legitimate dimension.
      if (!PwAff) {
        assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension");
        continue;
      }

      isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(
          isl_local_space_from_space(Array->getSpace()), isl_dim_set, i));
      PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in,
                                  isl_pw_aff_dim(Val, isl_dim_in));
      PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in,
                                      isl_pw_aff_get_tuple_id(Val, isl_dim_in));
      auto *Set = isl_pw_aff_gt_set(PwAff, Val);
      Extent = isl_set_intersect(Set, Extent);
    }

    return Extent;
  }

  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly
  /// from ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array The polly array from which to take the information.
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      if (isl_set_is_empty(PPCGArray.extent)) {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        isl_local_space *LS = isl_local_space_from_space(
            isl_space_params(isl_set_get_space(Dom)));
        isl_set_free(Dom);
        isl_aff *Zero = isl_aff_zero_on_domain(LS);
        PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
      } else {
        isl_set *Dom = isl_set_copy(PPCGArray.extent);
        Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
        isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
        isl_set_free(Dom);
        Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
        isl_local_space *LS =
            isl_local_space_from_space(isl_set_get_space(Dom));
        isl_aff *One = isl_aff_zero_on_domain(LS);
        One = isl_aff_add_constant_si(One, 1);
        Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
        Bound = isl_pw_aff_gist(Bound, S->getContext());
        PPCGArray.bound[0] = Bound;
      }
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Array : S->arrays()) {
      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar =
          Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      PPCGArray.user = Array;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Array : S->arrays()) {
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new gpu program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->any_to_outer = nullptr;

    // This needs to be set when live range reordering is enabled.
    // NOTE: We believe an empty map is conservatively correct here, but the
    // exact semantics are unclear. Quoting PPCG/gpu.h: "Order dependences on
    // non-scalars."
    PPCGProg->array_order =
        isl_union_map_empty(isl_set_get_space(PPCGScop->context));
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    PPCGProg->may_persist = compute_may_persist(PPCGProg);
    return PPCGProg;
  }

  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P The printer to print to
  /// @param Options The printing options to use
  /// @param Node The node to print
  /// @param User A user pointer to carry additional data. This pointer is
  ///             expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

  /// Print C code corresponding to the GPU code described by @p Tree.
  ///
  /// @param Tree An AST describing GPU code
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  //  1) Compute new schedule for the program.
  //  2) Map schedule to GPU (TODO)
  //  3) Generate code for new schedule (TODO)
  //
  // We do not use the Polly ScheduleOptimizer here, as the schedule optimizer
  // is mostly CPU specific. Instead, we use PPCG's GPU code generation
  // strategy directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set the scheduling strategy to the same strategy PPCG is using.
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);

    isl_schedule *Schedule = get_schedule(PPCGGen);

    int has_permutable = has_any_permutable_node(Schedule);

    if (!has_permutable || has_permutable < 0) {
      Schedule = isl_schedule_free(Schedule);
    } else {
      Schedule = map_to_device(PPCGGen, Schedule);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }

    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");

      printf("%s\n", isl_printer_get_str(P));
      isl_printer_free(P);
    }

    if (DumpCode) {
      printf("Code\n");
      printf("====\n");
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        printf("No code generated\n");
    }

    isl_schedule_free(Schedule);

    return PPCGGen;
  }

  /// Free gpu_gen structure.
  ///
  /// @param PPCGGen The ppcg_gen object to free.
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }

  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg is not freeing these options for us. To avoid leaks we do this
  /// ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Approximate the number of points in the set.
  ///
  /// This function returns an ast expression that overapproximates the number
  /// of points in an isl set through the rectangular hull surrounding this
  /// set.
  ///
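  /// For example, for the triangular set { [i,j] : 0 <= i < n and 0 <= j <= i }
  /// the rectangular hull has n points in each dimension, so the returned
  /// expression evaluates to n * n, which overapproximates the exact
  /// n * (n + 1) / 2 points (an illustrative instance).
  ///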
  /// @param Set The set to count.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  ///
  /// @returns An approximation of the number of points in the set.
  __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
                                             __isl_keep isl_ast_build *Build) {

    isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
    auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));

    isl_space *Space = isl_set_get_space(Set);
    Space = isl_space_params(Space);
    auto *Univ = isl_set_universe(Space);
    isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);

    for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
      isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
      isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
      isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
      DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
      auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
      Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
    }

    isl_set_free(Set);
    isl_pw_aff_free(OneAff);

    return Expr;
  }

  /// Approximate the number of dynamic instructions executed by a given
  /// statement.
  ///
  /// @param Stmt The statement for which to compute the number of dynamic
  ///             instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An approximation of the number of dynamic instructions executed
  ///          by @p Stmt.
  __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
                                             __isl_keep isl_ast_build *Build) {
    auto Iterations = approxPointsInSet(Stmt.getDomain(), Build);

    long InstCount = 0;

    if (Stmt.isBlockStmt()) {
      auto *BB = Stmt.getBasicBlock();
      InstCount = std::distance(BB->begin(), BB->end());
    } else {
      auto *R = Stmt.getRegion();

      for (auto *BB : R->blocks()) {
        InstCount += std::distance(BB->begin(), BB->end());
      }
    }

    isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
    auto *InstExpr = isl_ast_expr_from_val(InstVal);
    return isl_ast_expr_mul(InstExpr, Iterations);
  }

  /// Approximate the number of dynamic instructions executed in the scop.
  ///
  /// @param S The scop for which to approximate dynamic instructions.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An approximation of the number of dynamic instructions executed
  ///          in @p S.
  __isl_give isl_ast_expr *
  getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
    isl_ast_expr *Instructions;

    isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
    Instructions = isl_ast_expr_from_val(Zero);

    for (ScopStmt &Stmt : S) {
      isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
      Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
    }
    return Instructions;
  }

  /// Create a check that ensures sufficient compute in the scop.
  ///
  /// @param S The scop for which to ensure sufficient compute.
  /// @param Build The isl ast build object to use for creating the ast
  ///              expression.
  /// @returns An expression that evaluates to TRUE in case of sufficient
  ///          compute and to FALSE, otherwise.
  __isl_give isl_ast_expr *
  createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
    auto Iterations = getNumberOfIterations(S, Build);
    auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
    auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
    return isl_ast_expr_ge(Iterations, MinComputeExpr);
  }

  /// Check if the basic block contains a function we cannot codegen for GPU
  /// kernels.
  ///
  /// If this basic block does something with a `Function` other than calling
  /// a function that we support in a kernel, return true.
  bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB) {
    for (const Instruction &Inst : *BB) {
      const CallInst *Call = dyn_cast<CallInst>(&Inst);
      if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
        continue;
      }

      for (Value *SrcVal : Inst.operands()) {
        PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
        if (!p)
          continue;
        if (isa<FunctionType>(p->getElementType()))
          return true;
      }
    }
    return false;
  }

  /// Return whether the Scop S uses functions in a way that we do not support.
  bool containsInvalidKernelFunction(const Scop &S) {
    for (auto &Stmt : S) {
      if (Stmt.isBlockStmt()) {
        if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock()))
          return true;
      } else {
        assert(Stmt.isRegionStmt() &&
               "Stmt was neither block nor region statement");
        for (const BasicBlock *BB : Stmt.getRegion()->blocks())
          if (containsInvalidKernelFunctionInBlock(BB))
            return true;
      }
    }
    return false;
  }

  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU Program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important as the conditional
    // branch will guard the original scop from new induction variables that
    // the SCEVExpander may introduce while code generating the parameters and
    // which may introduce scalar dependences that prevent us from correctly
    // code generating this scop.
    BBPair StartExitBlocks =
        executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
    BasicBlock *StartBlock = std::get<0>(StartExitBlocks);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
                               StartBlock, Prog, Runtime, Architecture);

    // TODO: Handle LICM
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());

    isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
    isl_ast_expr *Condition = IslAst::buildRunCondition(*S, Build);
    isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
    Condition = isl_ast_expr_and(Condition, SufficientCompute);
    isl_ast_build_free(Build);

    Value *RTC = NodeBuilder.createRTC(Condition);
    Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

    Builder.SetInsertPoint(&*StartBlock->begin());

    NodeBuilder.initializeAfterRTH();
    NodeBuilder.create(Root);
    NodeBuilder.finalize();

    // In case a sequential kernel has more surrounding loops than any parallel
    // kernel, the SCoP is probably mostly sequential. Hence, there is no
    // point in running it on a GPU.
    if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
      SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());

    if (!NodeBuilder.BuildSuccessful)
      SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());
  }

  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support functions other than intrinsics inside
    // kernels, as code generation will need to offload function calls to the
    // kernel. This may lead to a kernel trying to call a function on the host.
    // This also allows us to prevent codegen from trying to take the
    // address of an intrinsic function to send to the kernel.
    if (containsInvalidKernelFunction(CurrentScop)) {
      DEBUG(
          dbgs()
          << "Scop contains a function which cannot be materialised in a GPU "
             "kernel. Bailing out.\n";);
      return false;
    }

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    if (PPCGGen->tree) {
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
      CurrentScop.markAsToBeSkipped();
    }

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetectionWrapperPass>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<ScopDetectionWrapperPass>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
} // namespace

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
  PPCGCodeGeneration *generator = new PPCGCodeGeneration();
  generator->Runtime = Runtime;
  generator->Architecture = Arch;
  return generator;
}

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)