1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg
11 // GPU mapping strategy.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "polly/CodeGen/PPCGCodeGeneration.h"
16 #include "polly/CodeGen/IslAst.h"
17 #include "polly/CodeGen/IslNodeBuilder.h"
18 #include "polly/CodeGen/Utils.h"
19 #include "polly/DependenceInfo.h"
20 #include "polly/LinkAllPasses.h"
21 #include "polly/Options.h"
22 #include "polly/ScopDetection.h"
23 #include "polly/ScopInfo.h"
24 #include "polly/Support/SCEVValidator.h"
25 #include "llvm/ADT/PostOrderIterator.h"
26 #include "llvm/Analysis/AliasAnalysis.h"
27 #include "llvm/Analysis/BasicAliasAnalysis.h"
28 #include "llvm/Analysis/GlobalsModRef.h"
29 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
30 #include "llvm/Analysis/TargetLibraryInfo.h"
31 #include "llvm/Analysis/TargetTransformInfo.h"
32 #include "llvm/IR/LegacyPassManager.h"
33 #include "llvm/IR/Verifier.h"
34 #include "llvm/Support/TargetRegistry.h"
35 #include "llvm/Support/TargetSelect.h"
36 #include "llvm/Target/TargetMachine.h"
37 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
38 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
39 
40 #include "isl/union_map.h"
41 
42 extern "C" {
43 #include "ppcg/cuda.h"
44 #include "ppcg/gpu.h"
45 #include "ppcg/gpu_print.h"
46 #include "ppcg/ppcg.h"
47 #include "ppcg/schedule.h"
48 }
49 
50 #include "llvm/Support/Debug.h"
51 
52 using namespace polly;
53 using namespace llvm;
54 
55 #define DEBUG_TYPE "polly-codegen-ppcg"
56 
57 static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
58                                   cl::desc("Dump the computed GPU Schedule"),
59                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
60                                   cl::cat(PollyCategory));
61 
62 static cl::opt<bool>
63     DumpCode("polly-acc-dump-code",
64              cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
65              cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
66 
67 static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
68                                   cl::desc("Dump the kernel LLVM-IR"),
69                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
70                                   cl::cat(PollyCategory));
71 
72 static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
73                                    cl::desc("Dump the kernel assembly code"),
74                                    cl::Hidden, cl::init(false), cl::ZeroOrMore,
75                                    cl::cat(PollyCategory));
76 
77 static cl::opt<bool> FastMath("polly-acc-fastmath",
78                               cl::desc("Allow unsafe math optimizations"),
79                               cl::Hidden, cl::init(false), cl::ZeroOrMore,
80                               cl::cat(PollyCategory));
81 static cl::opt<bool> SharedMemory("polly-acc-use-shared",
82                                   cl::desc("Use shared memory"), cl::Hidden,
83                                   cl::init(false), cl::ZeroOrMore,
84                                   cl::cat(PollyCategory));
85 static cl::opt<bool> PrivateMemory("polly-acc-use-private",
86                                    cl::desc("Use private memory"), cl::Hidden,
87                                    cl::init(false), cl::ZeroOrMore,
88                                    cl::cat(PollyCategory));
89 
90 static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
91                                    cl::desc("Generate Host kernel code assuming"
92                                             " that all memory has been"
93                                             " declared as managed memory"),
94                                    cl::Hidden, cl::init(false), cl::ZeroOrMore,
95                                    cl::cat(PollyCategory));
96 
97 static cl::opt<std::string>
98     CudaVersion("polly-acc-cuda-version",
99                 cl::desc("The CUDA version to compile for"), cl::Hidden,
100                 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
101 
102 static cl::opt<int>
103     MinCompute("polly-acc-mincompute",
104                cl::desc("Minimal number of compute statements to run on GPU."),
105                cl::Hidden, cl::init(10 * 512 * 512));
106 
107 /// Create the ast expressions for a ScopStmt.
108 ///
109 /// This function is a callback for to generate the ast expressions for each
110 /// of the scheduled ScopStmts.
111 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
112     void *StmtT, isl_ast_build *Build,
113     isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
114                                        isl_id *Id, void *User),
115     void *UserIndex,
116     isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
117     void *UserExpr) {
118 
119   ScopStmt *Stmt = (ScopStmt *)StmtT;
120 
121   isl_ctx *Ctx;
122 
123   if (!Stmt || !Build)
124     return NULL;
125 
126   Ctx = isl_ast_build_get_ctx(Build);
127   isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);
128 
129   for (MemoryAccess *Acc : *Stmt) {
130     isl_map *AddrFunc = Acc->getAddressFunction();
131     AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
132     isl_id *RefId = Acc->getId();
133     isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
134     isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
135     MPA = isl_multi_pw_aff_coalesce(MPA);
136     MPA = FunctionIndex(MPA, RefId, UserIndex);
137     isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
138     Access = FunctionExpr(Access, RefId, UserExpr);
139     RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
140   }
141 
142   return RefToExpr;
143 }
144 
145 /// Generate code for a GPU specific isl AST.
146 ///
147 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
148 /// generates code for general-prupose AST nodes, with special functionality
149 /// for generating GPU specific user nodes.
150 ///
151 /// @see GPUNodeBuilder::createUser
152 class GPUNodeBuilder : public IslNodeBuilder {
153 public:
154   GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
155                  const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
156                  DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
157                  gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
158       : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
159         Prog(Prog), Runtime(Runtime), Arch(Arch) {
160     getExprBuilder().setIDToSAI(&IDToSAI);
161   }
162 
163   /// Create after-run-time-check initialization code.
164   void initializeAfterRTH();
165 
166   /// Finalize the generated scop.
167   virtual void finalize();
168 
169   /// Track if the full build process was successful.
170   ///
171   /// This value is set to false, if throughout the build process an error
172   /// occurred which prevents us from generating valid GPU code.
173   bool BuildSuccessful = true;
174 
175   /// The maximal number of loops surrounding a sequential kernel.
176   unsigned DeepestSequential = 0;
177 
178   /// The maximal number of loops surrounding a parallel kernel.
179   unsigned DeepestParallel = 0;
180 
181 private:
182   /// A vector of array base pointers for which a new ScopArrayInfo was created.
183   ///
184   /// This vector is used to delete the ScopArrayInfo when it is not needed any
185   /// more.
186   std::vector<Value *> LocalArrays;
187 
188   /// A map from ScopArrays to their corresponding device allocations.
189   std::map<ScopArrayInfo *, Value *> DeviceAllocations;
190 
191   /// The current GPU context.
192   Value *GPUContext;
193 
194   /// The set of isl_ids allocated in the kernel
195   std::vector<isl_id *> KernelIds;
196 
197   /// A module containing GPU code.
198   ///
199   /// This pointer is only set in case we are currently generating GPU code.
200   std::unique_ptr<Module> GPUModule;
201 
202   /// The GPU program we generate code for.
203   gpu_prog *Prog;
204 
205   /// The GPU Runtime implementation to use (OpenCL or CUDA).
206   GPURuntime Runtime;
207 
208   /// The GPU Architecture to target.
209   GPUArch Arch;
210 
211   /// Class to free isl_ids.
212   class IslIdDeleter {
213   public:
214     void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
215   };
216 
217   /// A set containing all isl_ids allocated in a GPU kernel.
218   ///
219   /// By releasing this set all isl_ids will be freed.
220   std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;
221 
222   IslExprBuilder::IDToScopArrayInfoTy IDToSAI;
223 
224   /// Create code for user-defined AST nodes.
225   ///
226   /// These AST nodes can be of type:
227   ///
228   ///   - ScopStmt:      A computational statement (TODO)
229   ///   - Kernel:        A GPU kernel call (TODO)
230   ///   - Data-Transfer: A GPU <-> CPU data-transfer
231   ///   - In-kernel synchronization
232   ///   - In-kernel memory copy statement
233   ///
234   /// @param UserStmt The ast node to generate code for.
235   virtual void createUser(__isl_take isl_ast_node *UserStmt);
236 
237   enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };
238 
239   /// Create code for a data transfer statement
240   ///
241   /// @param TransferStmt The data transfer statement.
242   /// @param Direction The direction in which to transfer data.
243   void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
244                           enum DataDirection Direction);
245 
246   /// Find llvm::Values referenced in GPU kernel.
247   ///
248   /// @param Kernel The kernel to scan for llvm::Values
249   ///
250   /// @returns A set of values referenced by the kernel.
251   SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
252 
253   /// Compute the sizes of the execution grid for a given kernel.
254   ///
255   /// @param Kernel The kernel to compute grid sizes for.
256   ///
257   /// @returns A tuple with grid sizes for X and Y dimension
258   std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
259 
260   /// Creates a array that can be sent to the kernel on the device using a
261   /// host pointer. This is required for managed memory, when we directly send
262   /// host pointers to the device.
263   /// \note
264   /// This is to be used only with managed memory
265   Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
266                                        ScopArrayInfo *ArrayInfo);
267 
268   /// Compute the sizes of the thread blocks for a given kernel.
269   ///
270   /// @param Kernel The kernel to compute thread block sizes for.
271   ///
272   /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
273   std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
274 
275   /// Create kernel launch parameters.
276   ///
277   /// @param Kernel        The kernel to create parameters for.
278   /// @param F             The kernel function that has been created.
279   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
280   ///
281   /// @returns A stack allocated array with pointers to the parameter
282   ///          values that are passed to the kernel.
283   Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
284                                 SetVector<Value *> SubtreeValues);
285 
286   /// Create declarations for kernel variable.
287   ///
288   /// This includes shared memory declarations.
289   ///
290   /// @param Kernel        The kernel definition to create variables for.
291   /// @param FN            The function into which to generate the variables.
292   void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
293 
294   /// Add CUDA annotations to module.
295   ///
296   /// Add a set of CUDA annotations that declares the maximal block dimensions
297   /// that will be used to execute the CUDA kernel. This allows the NVIDIA
298   /// PTX compiler to bound the number of allocated registers to ensure the
299   /// resulting kernel is known to run with up to as many block dimensions
300   /// as specified here.
301   ///
302   /// @param M         The module to add the annotations to.
303   /// @param BlockDimX The size of block dimension X.
304   /// @param BlockDimY The size of block dimension Y.
305   /// @param BlockDimZ The size of block dimension Z.
306   void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
307                           Value *BlockDimZ);
308 
309   /// Create GPU kernel.
310   ///
311   /// Code generate the kernel described by @p KernelStmt.
312   ///
313   /// @param KernelStmt The ast node to generate kernel code for.
314   void createKernel(__isl_take isl_ast_node *KernelStmt);
315 
316   /// Generate code that computes the size of an array.
317   ///
318   /// @param Array The array for which to compute a size.
319   Value *getArraySize(gpu_array_info *Array);
320 
321   /// Generate code to compute the minimal offset at which an array is accessed.
322   ///
323   /// The offset of an array is the minimal array location accessed in a scop.
324   ///
325   /// Example:
326   ///
327   ///   for (long i = 0; i < 100; i++)
328   ///     A[i + 42] += ...
329   ///
330   ///   getArrayOffset(A) results in 42.
331   ///
332   /// @param Array The array for which to compute the offset.
333   /// @returns An llvm::Value that contains the offset of the array.
334   Value *getArrayOffset(gpu_array_info *Array);
335 
336   /// Prepare the kernel arguments for kernel code generation
337   ///
338   /// @param Kernel The kernel to generate code for.
339   /// @param FN     The function created for the kernel.
340   void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);
341 
342   /// Create kernel function.
343   ///
344   /// Create a kernel function located in a newly created module that can serve
345   /// as target for device code generation. Set the Builder to point to the
346   /// start block of this newly created function.
347   ///
348   /// @param Kernel The kernel to generate code for.
349   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
350   void createKernelFunction(ppcg_kernel *Kernel,
351                             SetVector<Value *> &SubtreeValues);
352 
353   /// Create the declaration of a kernel function.
354   ///
355   /// The kernel function takes as arguments:
356   ///
357   ///   - One i8 pointer for each external array reference used in the kernel.
358   ///   - Host iterators
359   ///   - Parameters
360   ///   - Other LLVM Value references (TODO)
361   ///
362   /// @param Kernel The kernel to generate the function declaration for.
363   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
364   ///
365   /// @returns The newly declared function.
366   Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
367                                      SetVector<Value *> &SubtreeValues);
368 
369   /// Insert intrinsic functions to obtain thread and block ids.
370   ///
371   /// @param The kernel to generate the intrinsic functions for.
372   void insertKernelIntrinsics(ppcg_kernel *Kernel);
373 
374   /// Create a global-to-shared or shared-to-global copy statement.
375   ///
376   /// @param CopyStmt The copy statement to generate code for
377   void createKernelCopy(ppcg_kernel_stmt *CopyStmt);
378 
379   /// Create code for a ScopStmt called in @p Expr.
380   ///
381   /// @param Expr The expression containing the call.
382   /// @param KernelStmt The kernel statement referenced in the call.
383   void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
384 
385   /// Create an in-kernel synchronization call.
386   void createKernelSync();
387 
388   /// Create a PTX assembly string for the current GPU kernel.
389   ///
390   /// @returns A string containing the corresponding PTX assembly code.
391   std::string createKernelASM();
392 
393   /// Remove references from the dominator tree to the kernel function @p F.
394   ///
395   /// @param F The function to remove references to.
396   void clearDominators(Function *F);
397 
398   /// Remove references from scalar evolution to the kernel function @p F.
399   ///
400   /// @param F The function to remove references to.
401   void clearScalarEvolution(Function *F);
402 
403   /// Remove references from loop info to the kernel function @p F.
404   ///
405   /// @param F The function to remove references to.
406   void clearLoops(Function *F);
407 
408   /// Finalize the generation of the kernel function.
409   ///
410   /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
411   /// dump its IR to stderr.
412   ///
413   /// @returns The Assembly string of the kernel.
414   std::string finalizeKernelFunction();
415 
416   /// Finalize the generation of the kernel arguments.
417   ///
418   /// This function ensures that not-read-only scalars used in a kernel are
419   /// stored back to the global memory location they ared backed up with before
420   /// the kernel terminates.
421   ///
422   /// @params Kernel The kernel to finalize kernel arguments for.
423   void finalizeKernelArguments(ppcg_kernel *Kernel);
424 
425   /// Create code that allocates memory to store arrays on device.
426   void allocateDeviceArrays();
427 
428   /// Free all allocated device arrays.
429   void freeDeviceArrays();
430 
431   /// Create a call to initialize the GPU context.
432   ///
433   /// @returns A pointer to the newly initialized context.
434   Value *createCallInitContext();
435 
436   /// Create a call to get the device pointer for a kernel allocation.
437   ///
438   /// @param Allocation The Polly GPU allocation
439   ///
440   /// @returns The device parameter corresponding to this allocation.
441   Value *createCallGetDevicePtr(Value *Allocation);
442 
443   /// Create a call to free the GPU context.
444   ///
445   /// @param Context A pointer to an initialized GPU context.
446   void createCallFreeContext(Value *Context);
447 
448   /// Create a call to allocate memory on the device.
449   ///
450   /// @param Size The size of memory to allocate
451   ///
452   /// @returns A pointer that identifies this allocation.
453   Value *createCallAllocateMemoryForDevice(Value *Size);
454 
455   /// Create a call to free a device array.
456   ///
457   /// @param Array The device array to free.
458   void createCallFreeDeviceMemory(Value *Array);
459 
460   /// Create a call to copy data from host to device.
461   ///
462   /// @param HostPtr A pointer to the host data that should be copied.
463   /// @param DevicePtr A device pointer specifying the location to copy to.
464   void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
465                                       Value *Size);
466 
467   /// Create a call to copy data from device to host.
468   ///
469   /// @param DevicePtr A pointer to the device data that should be copied.
470   /// @param HostPtr A host pointer specifying the location to copy to.
471   void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
472                                       Value *Size);
473 
474   /// Create a call to synchronize Host & Device.
475   /// \note
476   /// This is to be used only with managed memory.
477   void createCallSynchronizeDevice();
478 
479   /// Create a call to get a kernel from an assembly string.
480   ///
481   /// @param Buffer The string describing the kernel.
482   /// @param Entry  The name of the kernel function to call.
483   ///
484   /// @returns A pointer to a kernel object
485   Value *createCallGetKernel(Value *Buffer, Value *Entry);
486 
487   /// Create a call to free a GPU kernel.
488   ///
489   /// @param GPUKernel THe kernel to free.
490   void createCallFreeKernel(Value *GPUKernel);
491 
492   /// Create a call to launch a GPU kernel.
493   ///
494   /// @param GPUKernel  The kernel to launch.
495   /// @param GridDimX   The size of the first grid dimension.
496   /// @param GridDimY   The size of the second grid dimension.
497   /// @param GridBlockX The size of the first block dimension.
498   /// @param GridBlockY The size of the second block dimension.
499   /// @param GridBlockZ The size of the third block dimension.
500   /// @param Paramters  A pointer to an array that contains itself pointers to
501   ///                   the parameter values passed for each kernel argument.
502   void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
503                               Value *GridDimY, Value *BlockDimX,
504                               Value *BlockDimY, Value *BlockDimZ,
505                               Value *Parameters);
506 };
507 
508 void GPUNodeBuilder::initializeAfterRTH() {
509   BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
510                                  &*Builder.GetInsertPoint(), &DT, &LI);
511   NewBB->setName("polly.acc.initialize");
512   Builder.SetInsertPoint(&NewBB->front());
513 
514   GPUContext = createCallInitContext();
515 
516   if (!ManagedMemory)
517     allocateDeviceArrays();
518 }
519 
520 void GPUNodeBuilder::finalize() {
521   if (!ManagedMemory)
522     freeDeviceArrays();
523 
524   createCallFreeContext(GPUContext);
525   IslNodeBuilder::finalize();
526 }
527 
528 void GPUNodeBuilder::allocateDeviceArrays() {
529   assert(!ManagedMemory && "Managed memory will directly send host pointers "
530                            "to the kernel. There is no need for device arrays");
531   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
532 
533   for (int i = 0; i < Prog->n_array; ++i) {
534     gpu_array_info *Array = &Prog->array[i];
535     auto *ScopArray = (ScopArrayInfo *)Array->user;
536     std::string DevArrayName("p_dev_array_");
537     DevArrayName.append(Array->name);
538 
539     Value *ArraySize = getArraySize(Array);
540     Value *Offset = getArrayOffset(Array);
541     if (Offset)
542       ArraySize = Builder.CreateSub(
543           ArraySize,
544           Builder.CreateMul(Offset,
545                             Builder.getInt64(ScopArray->getElemSizeInBytes())));
546     Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
547     DevArray->setName(DevArrayName);
548     DeviceAllocations[ScopArray] = DevArray;
549   }
550 
551   isl_ast_build_free(Build);
552 }
553 
554 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
555                                         Value *BlockDimY, Value *BlockDimZ) {
556   auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
557 
558   for (auto &F : *M) {
559     if (F.getCallingConv() != CallingConv::PTX_Kernel)
560       continue;
561 
562     Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
563 
564     Metadata *Elements[] = {
565         ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
566         ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
567         ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
568         ValueAsMetadata::get(V[2]),
569     };
570     MDNode *Node = MDNode::get(M->getContext(), Elements);
571     AnnotationNode->addOperand(Node);
572   }
573 }
574 
575 void GPUNodeBuilder::freeDeviceArrays() {
576   assert(!ManagedMemory && "Managed memory does not use device arrays");
577   for (auto &Array : DeviceAllocations)
578     createCallFreeDeviceMemory(Array.second);
579 }
580 
581 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
582   const char *Name = "polly_getKernel";
583   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
584   Function *F = M->getFunction(Name);
585 
586   // If F is not available, declare it.
587   if (!F) {
588     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
589     std::vector<Type *> Args;
590     Args.push_back(Builder.getInt8PtrTy());
591     Args.push_back(Builder.getInt8PtrTy());
592     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
593     F = Function::Create(Ty, Linkage, Name, M);
594   }
595 
596   return Builder.CreateCall(F, {Buffer, Entry});
597 }
598 
599 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
600   const char *Name = "polly_getDevicePtr";
601   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
602   Function *F = M->getFunction(Name);
603 
604   // If F is not available, declare it.
605   if (!F) {
606     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
607     std::vector<Type *> Args;
608     Args.push_back(Builder.getInt8PtrTy());
609     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
610     F = Function::Create(Ty, Linkage, Name, M);
611   }
612 
613   return Builder.CreateCall(F, {Allocation});
614 }
615 
616 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
617                                             Value *GridDimY, Value *BlockDimX,
618                                             Value *BlockDimY, Value *BlockDimZ,
619                                             Value *Parameters) {
620   const char *Name = "polly_launchKernel";
621   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
622   Function *F = M->getFunction(Name);
623 
624   // If F is not available, declare it.
625   if (!F) {
626     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
627     std::vector<Type *> Args;
628     Args.push_back(Builder.getInt8PtrTy());
629     Args.push_back(Builder.getInt32Ty());
630     Args.push_back(Builder.getInt32Ty());
631     Args.push_back(Builder.getInt32Ty());
632     Args.push_back(Builder.getInt32Ty());
633     Args.push_back(Builder.getInt32Ty());
634     Args.push_back(Builder.getInt8PtrTy());
635     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
636     F = Function::Create(Ty, Linkage, Name, M);
637   }
638 
639   Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
640                          BlockDimZ, Parameters});
641 }
642 
643 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
644   const char *Name = "polly_freeKernel";
645   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
646   Function *F = M->getFunction(Name);
647 
648   // If F is not available, declare it.
649   if (!F) {
650     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
651     std::vector<Type *> Args;
652     Args.push_back(Builder.getInt8PtrTy());
653     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
654     F = Function::Create(Ty, Linkage, Name, M);
655   }
656 
657   Builder.CreateCall(F, {GPUKernel});
658 }
659 
660 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
661   assert(!ManagedMemory && "Managed memory does not allocate or free memory "
662                            "for device");
663   const char *Name = "polly_freeDeviceMemory";
664   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
665   Function *F = M->getFunction(Name);
666 
667   // If F is not available, declare it.
668   if (!F) {
669     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
670     std::vector<Type *> Args;
671     Args.push_back(Builder.getInt8PtrTy());
672     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
673     F = Function::Create(Ty, Linkage, Name, M);
674   }
675 
676   Builder.CreateCall(F, {Array});
677 }
678 
679 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
680   assert(!ManagedMemory && "Managed memory does not allocate or free memory "
681                            "for device");
682   const char *Name = "polly_allocateMemoryForDevice";
683   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
684   Function *F = M->getFunction(Name);
685 
686   // If F is not available, declare it.
687   if (!F) {
688     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
689     std::vector<Type *> Args;
690     Args.push_back(Builder.getInt64Ty());
691     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
692     F = Function::Create(Ty, Linkage, Name, M);
693   }
694 
695   return Builder.CreateCall(F, {Size});
696 }
697 
698 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
699                                                     Value *DeviceData,
700                                                     Value *Size) {
701   assert(!ManagedMemory && "Managed memory does not transfer memory between "
702                            "device and host");
703   const char *Name = "polly_copyFromHostToDevice";
704   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
705   Function *F = M->getFunction(Name);
706 
707   // If F is not available, declare it.
708   if (!F) {
709     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
710     std::vector<Type *> Args;
711     Args.push_back(Builder.getInt8PtrTy());
712     Args.push_back(Builder.getInt8PtrTy());
713     Args.push_back(Builder.getInt64Ty());
714     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
715     F = Function::Create(Ty, Linkage, Name, M);
716   }
717 
718   Builder.CreateCall(F, {HostData, DeviceData, Size});
719 }
720 
721 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
722                                                     Value *HostData,
723                                                     Value *Size) {
724   assert(!ManagedMemory && "Managed memory does not transfer memory between "
725                            "device and host");
726   const char *Name = "polly_copyFromDeviceToHost";
727   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
728   Function *F = M->getFunction(Name);
729 
730   // If F is not available, declare it.
731   if (!F) {
732     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
733     std::vector<Type *> Args;
734     Args.push_back(Builder.getInt8PtrTy());
735     Args.push_back(Builder.getInt8PtrTy());
736     Args.push_back(Builder.getInt64Ty());
737     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
738     F = Function::Create(Ty, Linkage, Name, M);
739   }
740 
741   Builder.CreateCall(F, {DeviceData, HostData, Size});
742 }
743 
744 void GPUNodeBuilder::createCallSynchronizeDevice() {
745   assert(ManagedMemory && "explicit synchronization is only necessary for "
746                           "managed memory");
747   const char *Name = "polly_synchronizeDevice";
748   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
749   Function *F = M->getFunction(Name);
750 
751   // If F is not available, declare it.
752   if (!F) {
753     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
754     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
755     F = Function::Create(Ty, Linkage, Name, M);
756   }
757 
758   Builder.CreateCall(F);
759 }
760 
761 Value *GPUNodeBuilder::createCallInitContext() {
762   const char *Name;
763 
764   switch (Runtime) {
765   case GPURuntime::CUDA:
766     Name = "polly_initContextCUDA";
767     break;
768   case GPURuntime::OpenCL:
769     Name = "polly_initContextCL";
770     break;
771   }
772 
773   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
774   Function *F = M->getFunction(Name);
775 
776   // If F is not available, declare it.
777   if (!F) {
778     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
779     std::vector<Type *> Args;
780     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
781     F = Function::Create(Ty, Linkage, Name, M);
782   }
783 
784   return Builder.CreateCall(F, {});
785 }
786 
787 void GPUNodeBuilder::createCallFreeContext(Value *Context) {
788   const char *Name = "polly_freeContext";
789   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
790   Function *F = M->getFunction(Name);
791 
792   // If F is not available, declare it.
793   if (!F) {
794     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
795     std::vector<Type *> Args;
796     Args.push_back(Builder.getInt8PtrTy());
797     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
798     F = Function::Create(Ty, Linkage, Name, M);
799   }
800 
801   Builder.CreateCall(F, {Context});
802 }
803 
804 /// Check if one string is a prefix of another.
805 ///
806 /// @param String The string in which to look for the prefix.
807 /// @param Prefix The prefix to look for.
808 static bool isPrefix(std::string String, std::string Prefix) {
809   return String.find(Prefix) == 0;
810 }
811 
812 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
813   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
814   Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);
815 
816   if (!gpu_array_is_scalar(Array)) {
817     auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
818     isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);
819 
820     for (unsigned int i = 1; i < Array->n_index; i++) {
821       isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
822       isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
823       Res = isl_ast_expr_mul(Res, Expr);
824     }
825 
826     Value *NumElements = ExprBuilder.create(Res);
827     if (NumElements->getType() != ArraySize->getType())
828       NumElements = Builder.CreateSExt(NumElements, ArraySize->getType());
829     ArraySize = Builder.CreateMul(ArraySize, NumElements);
830   }
831   isl_ast_build_free(Build);
832   return ArraySize;
833 }
834 
835 Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
836   if (gpu_array_is_scalar(Array))
837     return nullptr;
838 
839   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
840 
841   isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));
842 
843   isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));
844 
845   for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
846     ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);
847 
848   if (isl_set_is_subset(Min, ZeroSet)) {
849     isl_set_free(Min);
850     isl_set_free(ZeroSet);
851     isl_ast_build_free(Build);
852     return nullptr;
853   }
854   isl_set_free(ZeroSet);
855 
856   isl_ast_expr *Result =
857       isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));
858 
859   for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
860     if (i > 0) {
861       isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
862       isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
863       Result = isl_ast_expr_mul(Result, BExpr);
864     }
865     isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
866     isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
867     Result = isl_ast_expr_add(Result, MExpr);
868   }
869 
870   Value *ResultValue = ExprBuilder.create(Result);
871   isl_set_free(Min);
872   isl_ast_build_free(Build);
873 
874   return ResultValue;
875 }
876 
877 Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
878                                                      ScopArrayInfo *ArrayInfo) {
879 
880   assert(ManagedMemory && "Only used when you wish to get a host "
881                           "pointer for sending data to the kernel, "
882                           "with managed memory");
883   std::map<ScopArrayInfo *, Value *>::iterator it;
884   if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
885     return it->second;
886   } else {
887     Value *HostPtr;
888 
889     if (gpu_array_is_scalar(Array))
890       HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
891     else
892       HostPtr = ArrayInfo->getBasePtr();
893 
894     Value *Offset = getArrayOffset(Array);
895     if (Offset) {
896       HostPtr = Builder.CreatePointerCast(
897           HostPtr, ArrayInfo->getElementType()->getPointerTo());
898       HostPtr = Builder.CreateGEP(HostPtr, Offset);
899     }
900 
901     HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
902     DeviceAllocations[ArrayInfo] = HostPtr;
903     return HostPtr;
904   }
905 }
906 
907 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
908                                         enum DataDirection Direction) {
909   assert(!ManagedMemory && "Managed memory needs no data transfers");
910   isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
911   isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
912   isl_id *Id = isl_ast_expr_get_id(Arg);
913   auto Array = (gpu_array_info *)isl_id_get_user(Id);
914   auto ScopArray = (ScopArrayInfo *)(Array->user);
915 
916   Value *Size = getArraySize(Array);
917   Value *Offset = getArrayOffset(Array);
918   Value *DevPtr = DeviceAllocations[ScopArray];
919 
920   Value *HostPtr;
921 
922   if (gpu_array_is_scalar(Array))
923     HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
924   else
925     HostPtr = ScopArray->getBasePtr();
926 
927   if (Offset) {
928     HostPtr = Builder.CreatePointerCast(
929         HostPtr, ScopArray->getElementType()->getPointerTo());
930     HostPtr = Builder.CreateGEP(HostPtr, Offset);
931   }
932 
933   HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
934 
935   if (Offset) {
936     Size = Builder.CreateSub(
937         Size, Builder.CreateMul(
938                   Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
939   }
940 
941   if (Direction == HOST_TO_DEVICE)
942     createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
943   else
944     createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);
945 
946   isl_id_free(Id);
947   isl_ast_expr_free(Arg);
948   isl_ast_expr_free(Expr);
949   isl_ast_node_free(TransferStmt);
950 }
951 
952 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
953   isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
954   isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
955   isl_id *Id = isl_ast_expr_get_id(StmtExpr);
956   isl_id_free(Id);
957   isl_ast_expr_free(StmtExpr);
958 
959   const char *Str = isl_id_get_name(Id);
960   if (!strcmp(Str, "kernel")) {
961     createKernel(UserStmt);
962     isl_ast_expr_free(Expr);
963     return;
964   }
965 
966   if (isPrefix(Str, "to_device")) {
967     if (!ManagedMemory)
968       createDataTransfer(UserStmt, HOST_TO_DEVICE);
969     else
970       isl_ast_node_free(UserStmt);
971 
972     isl_ast_expr_free(Expr);
973     return;
974   }
975 
976   if (isPrefix(Str, "from_device")) {
977     if (!ManagedMemory) {
978       createDataTransfer(UserStmt, DEVICE_TO_HOST);
979     } else {
980       createCallSynchronizeDevice();
981       isl_ast_node_free(UserStmt);
982     }
983     isl_ast_expr_free(Expr);
984     return;
985   }
986 
987   isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
988   struct ppcg_kernel_stmt *KernelStmt =
989       (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
990   isl_id_free(Anno);
991 
992   switch (KernelStmt->type) {
993   case ppcg_kernel_domain:
994     createScopStmt(Expr, KernelStmt);
995     isl_ast_node_free(UserStmt);
996     return;
997   case ppcg_kernel_copy:
998     createKernelCopy(KernelStmt);
999     isl_ast_expr_free(Expr);
1000     isl_ast_node_free(UserStmt);
1001     return;
1002   case ppcg_kernel_sync:
1003     createKernelSync();
1004     isl_ast_expr_free(Expr);
1005     isl_ast_node_free(UserStmt);
1006     return;
1007   }
1008 
1009   isl_ast_expr_free(Expr);
1010   isl_ast_node_free(UserStmt);
1011   return;
1012 }
1013 void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
1014   isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
1015   LocalIndex = isl_ast_expr_address_of(LocalIndex);
1016   Value *LocalAddr = ExprBuilder.create(LocalIndex);
1017   isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
1018   Index = isl_ast_expr_address_of(Index);
1019   Value *GlobalAddr = ExprBuilder.create(Index);
1020 
1021   if (KernelStmt->u.c.read) {
1022     LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read");
1023     Builder.CreateStore(Load, LocalAddr);
1024   } else {
1025     LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write");
1026     Builder.CreateStore(Load, GlobalAddr);
1027   }
1028 }
1029 
1030 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
1031                                     ppcg_kernel_stmt *KernelStmt) {
1032   auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
1033   isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;
1034 
1035   LoopToScevMapT LTS;
1036   LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());
1037 
1038   createSubstitutions(Expr, Stmt, LTS);
1039 
1040   if (Stmt->isBlockStmt())
1041     BlockGen.copyStmt(*Stmt, LTS, Indexes);
1042   else
1043     RegionGen.copyStmt(*Stmt, LTS, Indexes);
1044 }
1045 
1046 void GPUNodeBuilder::createKernelSync() {
1047   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
1048 
1049   Function *Sync;
1050 
1051   switch (Arch) {
1052   case GPUArch::NVPTX64:
1053     Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
1054     break;
1055   }
1056 
1057   Builder.CreateCall(Sync, {});
1058 }
1059 
1060 /// Collect llvm::Values referenced from @p Node
1061 ///
1062 /// This function only applies to isl_ast_nodes that are user_nodes referring
1063 /// to a ScopStmt. All other node types are ignore.
1064 ///
1065 /// @param Node The node to collect references for.
1066 /// @param User A user pointer used as storage for the data that is collected.
1067 ///
1068 /// @returns isl_bool_true if data could be collected successfully.
1069 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
1070   if (isl_ast_node_get_type(Node) != isl_ast_node_user)
1071     return isl_bool_true;
1072 
1073   isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
1074   isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
1075   isl_id *Id = isl_ast_expr_get_id(StmtExpr);
1076   const char *Str = isl_id_get_name(Id);
1077   isl_id_free(Id);
1078   isl_ast_expr_free(StmtExpr);
1079   isl_ast_expr_free(Expr);
1080 
1081   if (!isPrefix(Str, "Stmt"))
1082     return isl_bool_true;
1083 
1084   Id = isl_ast_node_get_annotation(Node);
1085   auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
1086   auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
1087   isl_id_free(Id);
1088 
1089   addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);
1090 
1091   return isl_bool_true;
1092 }
1093 
1094 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
1095   SetVector<Value *> SubtreeValues;
1096   SetVector<const SCEV *> SCEVs;
1097   SetVector<const Loop *> Loops;
1098   SubtreeReferences References = {
1099       LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};
1100 
1101   for (const auto &I : IDToValue)
1102     SubtreeValues.insert(I.second);
1103 
1104   isl_ast_node_foreach_descendant_top_down(
1105       Kernel->tree, collectReferencesInGPUStmt, &References);
1106 
1107   for (const SCEV *Expr : SCEVs)
1108     findValues(Expr, SE, SubtreeValues);
1109 
1110   for (auto &SAI : S.arrays())
1111     SubtreeValues.remove(SAI->getBasePtr());
1112 
1113   isl_space *Space = S.getParamSpace();
1114   for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
1115     isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
1116     assert(IDToValue.count(Id));
1117     Value *Val = IDToValue[Id];
1118     SubtreeValues.remove(Val);
1119     isl_id_free(Id);
1120   }
1121   isl_space_free(Space);
1122 
1123   for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
1124     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
1125     assert(IDToValue.count(Id));
1126     Value *Val = IDToValue[Id];
1127     SubtreeValues.remove(Val);
1128     isl_id_free(Id);
1129   }
1130 
1131   return SubtreeValues;
1132 }
1133 
1134 void GPUNodeBuilder::clearDominators(Function *F) {
1135   DomTreeNode *N = DT.getNode(&F->getEntryBlock());
1136   std::vector<BasicBlock *> Nodes;
1137   for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
1138     Nodes.push_back(I->getBlock());
1139 
1140   for (BasicBlock *BB : Nodes)
1141     DT.eraseNode(BB);
1142 }
1143 
1144 void GPUNodeBuilder::clearScalarEvolution(Function *F) {
1145   for (BasicBlock &BB : *F) {
1146     Loop *L = LI.getLoopFor(&BB);
1147     if (L)
1148       SE.forgetLoop(L);
1149   }
1150 }
1151 
1152 void GPUNodeBuilder::clearLoops(Function *F) {
1153   for (BasicBlock &BB : *F) {
1154     Loop *L = LI.getLoopFor(&BB);
1155     if (L)
1156       SE.forgetLoop(L);
1157     LI.removeBlock(&BB);
1158   }
1159 }
1160 
1161 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
1162   std::vector<Value *> Sizes;
1163   isl_ast_build *Context = isl_ast_build_from_context(S.getContext());
1164 
1165   for (long i = 0; i < Kernel->n_grid; i++) {
1166     isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i);
1167     isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size);
1168     Value *Res = ExprBuilder.create(GridSize);
1169     Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
1170     Sizes.push_back(Res);
1171   }
1172   isl_ast_build_free(Context);
1173 
1174   for (long i = Kernel->n_grid; i < 3; i++)
1175     Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
1176 
1177   return std::make_tuple(Sizes[0], Sizes[1]);
1178 }
1179 
1180 std::tuple<Value *, Value *, Value *>
1181 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
1182   std::vector<Value *> Sizes;
1183 
1184   for (long i = 0; i < Kernel->n_block; i++) {
1185     Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
1186     Sizes.push_back(Res);
1187   }
1188 
1189   for (long i = Kernel->n_block; i < 3; i++)
1190     Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
1191 
1192   return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
1193 }
1194 
1195 Value *
1196 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
1197                                        SetVector<Value *> SubtreeValues) {
1198   Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(),
1199                                  std::distance(F->arg_begin(), F->arg_end()));
1200 
1201   BasicBlock *EntryBlock =
1202       &Builder.GetInsertBlock()->getParent()->getEntryBlock();
1203   auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
1204   std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
1205   Instruction *Parameters = new AllocaInst(
1206       ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());
1207 
1208   int Index = 0;
1209   for (long i = 0; i < Prog->n_array; i++) {
1210     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1211       continue;
1212 
1213     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1214     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
1215 
1216     Value *DevArray = nullptr;
1217     if (ManagedMemory) {
1218       DevArray = getOrCreateManagedDeviceArray(
1219           &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
1220     } else {
1221       DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
1222       DevArray = createCallGetDevicePtr(DevArray);
1223     }
1224     assert(DevArray != nullptr && "Array to be offloaded to device not "
1225                                   "initialized");
1226     Value *Offset = getArrayOffset(&Prog->array[i]);
1227 
1228     if (Offset) {
1229       DevArray = Builder.CreatePointerCast(
1230           DevArray, SAI->getElementType()->getPointerTo());
1231       DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
1232       DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
1233     }
1234     Value *Slot = Builder.CreateGEP(
1235         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1236 
1237     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
1238       Value *ValPtr = nullptr;
1239       if (ManagedMemory)
1240         ValPtr = DevArray;
1241       else
1242         ValPtr = BlockGen.getOrCreateAlloca(SAI);
1243 
1244       assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
1245                                   " to be stored into Parameters");
1246       Value *ValPtrCast =
1247           Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
1248       Builder.CreateStore(ValPtrCast, Slot);
1249     } else {
1250       Instruction *Param =
1251           new AllocaInst(Builder.getInt8PtrTy(), AddressSpace,
1252                          Launch + "_param_" + std::to_string(Index),
1253                          EntryBlock->getTerminator());
1254       Builder.CreateStore(DevArray, Param);
1255       Value *ParamTyped =
1256           Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1257       Builder.CreateStore(ParamTyped, Slot);
1258     }
1259     Index++;
1260   }
1261 
1262   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
1263 
1264   for (long i = 0; i < NumHostIters; i++) {
1265     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
1266     Value *Val = IDToValue[Id];
1267     isl_id_free(Id);
1268     Instruction *Param =
1269         new AllocaInst(Val->getType(), AddressSpace,
1270                        Launch + "_param_" + std::to_string(Index),
1271                        EntryBlock->getTerminator());
1272     Builder.CreateStore(Val, Param);
1273     Value *Slot = Builder.CreateGEP(
1274         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1275     Value *ParamTyped =
1276         Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1277     Builder.CreateStore(ParamTyped, Slot);
1278     Index++;
1279   }
1280 
1281   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
1282 
1283   for (long i = 0; i < NumVars; i++) {
1284     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
1285     Value *Val = IDToValue[Id];
1286     isl_id_free(Id);
1287     Instruction *Param =
1288         new AllocaInst(Val->getType(), AddressSpace,
1289                        Launch + "_param_" + std::to_string(Index),
1290                        EntryBlock->getTerminator());
1291     Builder.CreateStore(Val, Param);
1292     Value *Slot = Builder.CreateGEP(
1293         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1294     Value *ParamTyped =
1295         Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1296     Builder.CreateStore(ParamTyped, Slot);
1297     Index++;
1298   }
1299 
1300   for (auto Val : SubtreeValues) {
1301     Instruction *Param =
1302         new AllocaInst(Val->getType(), AddressSpace,
1303                        Launch + "_param_" + std::to_string(Index),
1304                        EntryBlock->getTerminator());
1305     Builder.CreateStore(Val, Param);
1306     Value *Slot = Builder.CreateGEP(
1307         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1308     Value *ParamTyped =
1309         Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1310     Builder.CreateStore(ParamTyped, Slot);
1311     Index++;
1312   }
1313 
1314   auto Location = EntryBlock->getTerminator();
1315   return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
1316                          Launch + "_params_i8ptr", Location);
1317 }
1318 
1319 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
1320   isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
1321   ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
1322   isl_id_free(Id);
1323   isl_ast_node_free(KernelStmt);
1324 
1325   if (Kernel->n_grid > 1)
1326     DeepestParallel =
1327         std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set));
1328   else
1329     DeepestSequential =
1330         std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set));
1331 
1332   Value *BlockDimX, *BlockDimY, *BlockDimZ;
1333   std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
1334 
1335   SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
1336 
1337   assert(Kernel->tree && "Device AST of kernel node is empty");
1338 
1339   Instruction &HostInsertPoint = *Builder.GetInsertPoint();
1340   IslExprBuilder::IDToValueTy HostIDs = IDToValue;
1341   ValueMapT HostValueMap = ValueMap;
1342   BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap;
1343   ScalarMap.clear();
1344 
1345   SetVector<const Loop *> Loops;
1346 
1347   // Create for all loops we depend on values that contain the current loop
1348   // iteration. These values are necessary to generate code for SCEVs that
1349   // depend on such loops. As a result we need to pass them to the subfunction.
1350   for (const Loop *L : Loops) {
1351     const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
1352                                             SE.getUnknown(Builder.getInt64(1)),
1353                                             L, SCEV::FlagAnyWrap);
1354     Value *V = generateSCEV(OuterLIV);
1355     OutsideLoopIterations[L] = SE.getUnknown(V);
1356     SubtreeValues.insert(V);
1357   }
1358 
1359   createKernelFunction(Kernel, SubtreeValues);
1360 
1361   create(isl_ast_node_copy(Kernel->tree));
1362 
1363   finalizeKernelArguments(Kernel);
1364   Function *F = Builder.GetInsertBlock()->getParent();
1365   addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
1366   clearDominators(F);
1367   clearScalarEvolution(F);
1368   clearLoops(F);
1369 
1370   IDToValue = HostIDs;
1371 
1372   ValueMap = std::move(HostValueMap);
1373   ScalarMap = std::move(HostScalarMap);
1374   EscapeMap.clear();
1375   IDToSAI.clear();
1376   Annotator.resetAlternativeAliasBases();
1377   for (auto &BasePtr : LocalArrays)
1378     S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array);
1379   LocalArrays.clear();
1380 
1381   std::string ASMString = finalizeKernelFunction();
1382   Builder.SetInsertPoint(&HostInsertPoint);
1383   Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);
1384 
1385   std::string Name = "kernel_" + std::to_string(Kernel->id);
1386   Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
1387   Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
1388   Value *GPUKernel = createCallGetKernel(KernelString, NameString);
1389 
1390   Value *GridDimX, *GridDimY;
1391   std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
1392 
1393   createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1394                          BlockDimZ, Parameters);
1395   createCallFreeKernel(GPUKernel);
1396 
1397   for (auto Id : KernelIds)
1398     isl_id_free(Id);
1399 
1400   KernelIds.clear();
1401 }
1402 
1403 /// Compute the DataLayout string for the NVPTX backend.
1404 ///
1405 /// @param is64Bit Are we looking for a 64 bit architecture?
1406 static std::string computeNVPTXDataLayout(bool is64Bit) {
1407   std::string Ret = "";
1408 
1409   if (!is64Bit) {
1410     Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
1411            "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
1412            "64-v128:128:128-n16:32:64";
1413   } else {
1414     Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
1415            "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
1416            "64-v128:128:128-n16:32:64";
1417   }
1418 
1419   return Ret;
1420 }
1421 
1422 Function *
1423 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
1424                                          SetVector<Value *> &SubtreeValues) {
1425   std::vector<Type *> Args;
1426   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
1427 
1428   for (long i = 0; i < Prog->n_array; i++) {
1429     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1430       continue;
1431 
1432     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
1433       isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1434       const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
1435       Args.push_back(SAI->getElementType());
1436     } else {
1437       static const int UseGlobalMemory = 1;
1438       Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
1439     }
1440   }
1441 
1442   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
1443 
1444   for (long i = 0; i < NumHostIters; i++)
1445     Args.push_back(Builder.getInt64Ty());
1446 
1447   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
1448 
1449   for (long i = 0; i < NumVars; i++) {
1450     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
1451     Value *Val = IDToValue[Id];
1452     isl_id_free(Id);
1453     Args.push_back(Val->getType());
1454   }
1455 
1456   for (auto *V : SubtreeValues)
1457     Args.push_back(V->getType());
1458 
1459   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
1460   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
1461                               GPUModule.get());
1462 
1463   switch (Arch) {
1464   case GPUArch::NVPTX64:
1465     FN->setCallingConv(CallingConv::PTX_Kernel);
1466     break;
1467   }
1468 
1469   auto Arg = FN->arg_begin();
1470   for (long i = 0; i < Kernel->n_array; i++) {
1471     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1472       continue;
1473 
1474     Arg->setName(Kernel->array[i].array->name);
1475 
1476     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1477     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
1478     Type *EleTy = SAI->getElementType();
1479     Value *Val = &*Arg;
1480     SmallVector<const SCEV *, 4> Sizes;
1481     isl_ast_build *Build =
1482         isl_ast_build_from_context(isl_set_copy(Prog->context));
1483     Sizes.push_back(nullptr);
1484     for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
1485       isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
1486           Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
1487       auto V = ExprBuilder.create(DimSize);
1488       Sizes.push_back(SE.getSCEV(V));
1489     }
1490     const ScopArrayInfo *SAIRep =
1491         S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array);
1492     LocalArrays.push_back(Val);
1493 
1494     isl_ast_build_free(Build);
1495     KernelIds.push_back(Id);
1496     IDToSAI[Id] = SAIRep;
1497     Arg++;
1498   }
1499 
1500   for (long i = 0; i < NumHostIters; i++) {
1501     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
1502     Arg->setName(isl_id_get_name(Id));
1503     IDToValue[Id] = &*Arg;
1504     KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
1505     Arg++;
1506   }
1507 
1508   for (long i = 0; i < NumVars; i++) {
1509     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
1510     Arg->setName(isl_id_get_name(Id));
1511     Value *Val = IDToValue[Id];
1512     ValueMap[Val] = &*Arg;
1513     IDToValue[Id] = &*Arg;
1514     KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
1515     Arg++;
1516   }
1517 
1518   for (auto *V : SubtreeValues) {
1519     Arg->setName(V->getName());
1520     ValueMap[V] = &*Arg;
1521     Arg++;
1522   }
1523 
1524   return FN;
1525 }
1526 
1527 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
1528   Intrinsic::ID IntrinsicsBID[2];
1529   Intrinsic::ID IntrinsicsTID[3];
1530 
1531   switch (Arch) {
1532   case GPUArch::NVPTX64:
1533     IntrinsicsBID[0] = Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
1534     IntrinsicsBID[1] = Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
1535 
1536     IntrinsicsTID[0] = Intrinsic::nvvm_read_ptx_sreg_tid_x;
1537     IntrinsicsTID[1] = Intrinsic::nvvm_read_ptx_sreg_tid_y;
1538     IntrinsicsTID[2] = Intrinsic::nvvm_read_ptx_sreg_tid_z;
1539     break;
1540   }
1541 
1542   auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
1543     std::string Name = isl_id_get_name(Id);
1544     Module *M = Builder.GetInsertBlock()->getParent()->getParent();
1545     Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
1546     Value *Val = Builder.CreateCall(IntrinsicFn, {});
1547     Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
1548     IDToValue[Id] = Val;
1549     KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
1550   };
1551 
1552   for (int i = 0; i < Kernel->n_grid; ++i) {
1553     isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
1554     addId(Id, IntrinsicsBID[i]);
1555   }
1556 
1557   for (int i = 0; i < Kernel->n_block; ++i) {
1558     isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
1559     addId(Id, IntrinsicsTID[i]);
1560   }
1561 }
1562 
1563 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
1564   auto Arg = FN->arg_begin();
1565   for (long i = 0; i < Kernel->n_array; i++) {
1566     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1567       continue;
1568 
1569     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1570     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
1571     isl_id_free(Id);
1572 
1573     if (SAI->getNumberOfDimensions() > 0) {
1574       Arg++;
1575       continue;
1576     }
1577 
1578     Value *Val = &*Arg;
1579 
1580     if (!gpu_array_is_read_only_scalar(&Prog->array[i])) {
1581       Type *TypePtr = SAI->getElementType()->getPointerTo();
1582       Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr);
1583       Val = Builder.CreateLoad(TypedArgPtr);
1584     }
1585 
1586     Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
1587     Builder.CreateStore(Val, Alloca);
1588 
1589     Arg++;
1590   }
1591 }
1592 
1593 void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
1594   auto *FN = Builder.GetInsertBlock()->getParent();
1595   auto Arg = FN->arg_begin();
1596 
1597   bool StoredScalar = false;
1598   for (long i = 0; i < Kernel->n_array; i++) {
1599     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1600       continue;
1601 
1602     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1603     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
1604     isl_id_free(Id);
1605 
1606     if (SAI->getNumberOfDimensions() > 0) {
1607       Arg++;
1608       continue;
1609     }
1610 
1611     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
1612       Arg++;
1613       continue;
1614     }
1615 
1616     Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
1617     Value *ArgPtr = &*Arg;
1618     Type *TypePtr = SAI->getElementType()->getPointerTo();
1619     Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
1620     Value *Val = Builder.CreateLoad(Alloca);
1621     Builder.CreateStore(Val, TypedArgPtr);
1622     StoredScalar = true;
1623 
1624     Arg++;
1625   }
1626 
1627   if (StoredScalar)
1628     /// In case more than one thread contains scalar stores, the generated
1629     /// code might be incorrect, if we only store at the end of the kernel.
1630     /// To support this case we need to store these scalars back at each
1631     /// memory store or at least before each kernel barrier.
1632     if (Kernel->n_block != 0 || Kernel->n_grid != 0)
1633       BuildSuccessful = 0;
1634 }
1635 
1636 void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
1637   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
1638 
1639   for (int i = 0; i < Kernel->n_var; ++i) {
1640     struct ppcg_kernel_var &Var = Kernel->var[i];
1641     isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
1642     Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType();
1643 
1644     Type *ArrayTy = EleTy;
1645     SmallVector<const SCEV *, 4> Sizes;
1646 
1647     Sizes.push_back(nullptr);
1648     for (unsigned int j = 1; j < Var.array->n_index; ++j) {
1649       isl_val *Val = isl_vec_get_element_val(Var.size, j);
1650       long Bound = isl_val_get_num_si(Val);
1651       isl_val_free(Val);
1652       Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
1653     }
1654 
1655     for (int j = Var.array->n_index - 1; j >= 0; --j) {
1656       isl_val *Val = isl_vec_get_element_val(Var.size, j);
1657       long Bound = isl_val_get_num_si(Val);
1658       isl_val_free(Val);
1659       ArrayTy = ArrayType::get(ArrayTy, Bound);
1660     }
1661 
1662     const ScopArrayInfo *SAI;
1663     Value *Allocation;
1664     if (Var.type == ppcg_access_shared) {
1665       auto GlobalVar = new GlobalVariable(
1666           *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
1667           nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
1668       GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
1669       GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));
1670 
1671       Allocation = GlobalVar;
1672     } else if (Var.type == ppcg_access_private) {
1673       Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
1674     } else {
1675       llvm_unreachable("unknown variable type");
1676     }
1677     SAI =
1678         S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
1679     Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
1680     IDToValue[Id] = Allocation;
1681     LocalArrays.push_back(Allocation);
1682     KernelIds.push_back(Id);
1683     IDToSAI[Id] = SAI;
1684   }
1685 }
1686 
1687 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
1688                                           SetVector<Value *> &SubtreeValues) {
1689   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
1690   GPUModule.reset(new Module(Identifier, Builder.getContext()));
1691 
1692   switch (Arch) {
1693   case GPUArch::NVPTX64:
1694     if (Runtime == GPURuntime::CUDA)
1695       GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
1696     else if (Runtime == GPURuntime::OpenCL)
1697       GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
1698     GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
1699     break;
1700   }
1701 
1702   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
1703 
1704   BasicBlock *PrevBlock = Builder.GetInsertBlock();
1705   auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);
1706 
1707   DT.addNewBlock(EntryBlock, PrevBlock);
1708 
1709   Builder.SetInsertPoint(EntryBlock);
1710   Builder.CreateRetVoid();
1711   Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());
1712 
1713   ScopDetection::markFunctionAsInvalid(FN);
1714 
1715   prepareKernelArguments(Kernel, FN);
1716   createKernelVariables(Kernel, FN);
1717   insertKernelIntrinsics(Kernel);
1718 }
1719 
1720 std::string GPUNodeBuilder::createKernelASM() {
1721   llvm::Triple GPUTriple;
1722 
1723   switch (Arch) {
1724   case GPUArch::NVPTX64:
1725     switch (Runtime) {
1726     case GPURuntime::CUDA:
1727       GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
1728       break;
1729     case GPURuntime::OpenCL:
1730       GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
1731       break;
1732     }
1733     break;
1734   }
1735 
1736   std::string ErrMsg;
1737   auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
1738 
1739   if (!GPUTarget) {
1740     errs() << ErrMsg << "\n";
1741     return "";
1742   }
1743 
1744   TargetOptions Options;
1745   Options.UnsafeFPMath = FastMath;
1746 
1747   std::string subtarget;
1748 
1749   switch (Arch) {
1750   case GPUArch::NVPTX64:
1751     subtarget = CudaVersion;
1752     break;
1753   }
1754 
1755   std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
1756       GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));
1757 
1758   SmallString<0> ASMString;
1759   raw_svector_ostream ASMStream(ASMString);
1760   llvm::legacy::PassManager PM;
1761 
1762   PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));
1763 
1764   if (TargetM->addPassesToEmitFile(
1765           PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
1766     errs() << "The target does not support generation of this file type!\n";
1767     return "";
1768   }
1769 
1770   PM.run(*GPUModule);
1771 
1772   return ASMStream.str();
1773 }
1774 
1775 std::string GPUNodeBuilder::finalizeKernelFunction() {
1776   if (verifyModule(*GPUModule)) {
1777     BuildSuccessful = false;
1778     return "";
1779   }
1780 
1781   if (DumpKernelIR)
1782     outs() << *GPUModule << "\n";
1783 
1784   // Optimize module.
1785   llvm::legacy::PassManager OptPasses;
1786   PassManagerBuilder PassBuilder;
1787   PassBuilder.OptLevel = 3;
1788   PassBuilder.SizeLevel = 0;
1789   PassBuilder.populateModulePassManager(OptPasses);
1790   OptPasses.run(*GPUModule);
1791 
1792   std::string Assembly = createKernelASM();
1793 
1794   if (DumpKernelASM)
1795     outs() << Assembly << "\n";
1796 
1797   GPUModule.release();
1798   KernelIDs.clear();
1799 
1800   return Assembly;
1801 }
1802 
1803 namespace {
1804 class PPCGCodeGeneration : public ScopPass {
1805 public:
1806   static char ID;
1807 
1808   GPURuntime Runtime = GPURuntime::CUDA;
1809 
1810   GPUArch Architecture = GPUArch::NVPTX64;
1811 
1812   /// The scop that is currently processed.
1813   Scop *S;
1814 
1815   LoopInfo *LI;
1816   DominatorTree *DT;
1817   ScalarEvolution *SE;
1818   const DataLayout *DL;
1819   RegionInfo *RI;
1820 
1821   PPCGCodeGeneration() : ScopPass(ID) {}
1822 
1823   /// Construct compilation options for PPCG.
1824   ///
1825   /// @returns The compilation options.
1826   ppcg_options *createPPCGOptions() {
1827     auto DebugOptions =
1828         (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
1829     auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));
1830 
1831     DebugOptions->dump_schedule_constraints = false;
1832     DebugOptions->dump_schedule = false;
1833     DebugOptions->dump_final_schedule = false;
1834     DebugOptions->dump_sizes = false;
1835     DebugOptions->verbose = false;
1836 
1837     Options->debug = DebugOptions;
1838 
1839     Options->reschedule = true;
1840     Options->scale_tile_loops = false;
1841     Options->wrap = false;
1842 
1843     Options->non_negative_parameters = false;
1844     Options->ctx = nullptr;
1845     Options->sizes = nullptr;
1846 
1847     Options->tile_size = 32;
1848 
1849     Options->use_private_memory = PrivateMemory;
1850     Options->use_shared_memory = SharedMemory;
1851     Options->max_shared_memory = 48 * 1024;
1852 
1853     Options->target = PPCG_TARGET_CUDA;
1854     Options->openmp = false;
1855     Options->linearize_device_arrays = true;
1856     Options->live_range_reordering = false;
1857 
1858     Options->opencl_compiler_options = nullptr;
1859     Options->opencl_use_gpu = false;
1860     Options->opencl_n_include_file = 0;
1861     Options->opencl_include_files = nullptr;
1862     Options->opencl_print_kernel_types = false;
1863     Options->opencl_embed_kernel_code = false;
1864 
1865     Options->save_schedule_file = nullptr;
1866     Options->load_schedule_file = nullptr;
1867 
1868     return Options;
1869   }
1870 
1871   /// Get a tagged access relation containing all accesses of type @p AccessTy.
1872   ///
1873   /// Instead of a normal access of the form:
1874   ///
1875   ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
1876   ///
1877   /// a tagged access has the form
1878   ///
1879   ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
1880   ///
1881   /// where 'id' is an additional space that references the memory access that
1882   /// triggered the access.
1883   ///
1884   /// @param AccessTy The type of the memory accesses to collect.
1885   ///
1886   /// @return The relation describing all tagged memory accesses.
1887   isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
1888     isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());
1889 
1890     for (auto &Stmt : *S)
1891       for (auto &Acc : Stmt)
1892         if (Acc->getType() == AccessTy) {
1893           isl_map *Relation = Acc->getAccessRelation();
1894           Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());
1895 
1896           isl_space *Space = isl_map_get_space(Relation);
1897           Space = isl_space_range(Space);
1898           Space = isl_space_from_range(Space);
1899           Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
1900           isl_map *Universe = isl_map_universe(Space);
1901           Relation = isl_map_domain_product(Relation, Universe);
1902           Accesses = isl_union_map_add_map(Accesses, Relation);
1903         }
1904 
1905     return Accesses;
1906   }
1907 
1908   /// Get the set of all read accesses, tagged with the access id.
1909   ///
1910   /// @see getTaggedAccesses
1911   isl_union_map *getTaggedReads() {
1912     return getTaggedAccesses(MemoryAccess::READ);
1913   }
1914 
1915   /// Get the set of all may (and must) accesses, tagged with the access id.
1916   ///
1917   /// @see getTaggedAccesses
1918   isl_union_map *getTaggedMayWrites() {
1919     return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
1920                                getTaggedAccesses(MemoryAccess::MUST_WRITE));
1921   }
1922 
1923   /// Get the set of all must accesses, tagged with the access id.
1924   ///
1925   /// @see getTaggedAccesses
1926   isl_union_map *getTaggedMustWrites() {
1927     return getTaggedAccesses(MemoryAccess::MUST_WRITE);
1928   }
1929 
1930   /// Collect parameter and array names as isl_ids.
1931   ///
1932   /// To reason about the different parameters and arrays used, ppcg requires
1933   /// a list of all isl_ids in use. As PPCG traditionally performs
1934   /// source-to-source compilation each of these isl_ids is mapped to the
1935   /// expression that represents it. As we do not have a corresponding
1936   /// expression in Polly, we just map each id to a 'zero' expression to match
1937   /// the data format that ppcg expects.
1938   ///
1939   /// @returns Retun a map from collected ids to 'zero' ast expressions.
1940   __isl_give isl_id_to_ast_expr *getNames() {
1941     auto *Names = isl_id_to_ast_expr_alloc(
1942         S->getIslCtx(),
1943         S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
1944     auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
1945     auto *Space = S->getParamSpace();
1946 
1947     for (int I = 0, E = S->getNumParams(); I < E; ++I) {
1948       isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
1949       Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
1950     }
1951 
1952     for (auto &Array : S->arrays()) {
1953       auto Id = Array->getBasePtrId();
1954       Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
1955     }
1956 
1957     isl_space_free(Space);
1958     isl_ast_expr_free(Zero);
1959 
1960     return Names;
1961   }
1962 
1963   /// Create a new PPCG scop from the current scop.
1964   ///
1965   /// The PPCG scop is initialized with data from the current polly::Scop. From
1966   /// this initial data, the data-dependences in the PPCG scop are initialized.
1967   /// We do not use Polly's dependence analysis for now, to ensure we match
1968   /// the PPCG default behaviour more closely.
1969   ///
1970   /// @returns A new ppcg scop.
1971   ppcg_scop *createPPCGScop() {
1972     auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
1973 
1974     PPCGScop->options = createPPCGOptions();
1975 
1976     PPCGScop->start = 0;
1977     PPCGScop->end = 0;
1978 
1979     PPCGScop->context = S->getContext();
1980     PPCGScop->domain = S->getDomains();
1981     PPCGScop->call = nullptr;
1982     PPCGScop->tagged_reads = getTaggedReads();
1983     PPCGScop->reads = S->getReads();
1984     PPCGScop->live_in = nullptr;
1985     PPCGScop->tagged_may_writes = getTaggedMayWrites();
1986     PPCGScop->may_writes = S->getWrites();
1987     PPCGScop->tagged_must_writes = getTaggedMustWrites();
1988     PPCGScop->must_writes = S->getMustWrites();
1989     PPCGScop->live_out = nullptr;
1990     PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
1991     PPCGScop->tagger = nullptr;
1992 
1993     PPCGScop->independence = nullptr;
1994     PPCGScop->dep_flow = nullptr;
1995     PPCGScop->tagged_dep_flow = nullptr;
1996     PPCGScop->dep_false = nullptr;
1997     PPCGScop->dep_forced = nullptr;
1998     PPCGScop->dep_order = nullptr;
1999     PPCGScop->tagged_dep_order = nullptr;
2000 
2001     PPCGScop->schedule = S->getScheduleTree();
2002     PPCGScop->names = getNames();
2003 
2004     PPCGScop->pet = nullptr;
2005 
2006     compute_tagger(PPCGScop);
2007     compute_dependences(PPCGScop);
2008 
2009     return PPCGScop;
2010   }
2011 
2012   /// Collect the array acesses in a statement.
2013   ///
2014   /// @param Stmt The statement for which to collect the accesses.
2015   ///
2016   /// @returns A list of array accesses.
2017   gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
2018     gpu_stmt_access *Accesses = nullptr;
2019 
2020     for (MemoryAccess *Acc : Stmt) {
2021       auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
2022       Access->read = Acc->isRead();
2023       Access->write = Acc->isWrite();
2024       Access->access = Acc->getAccessRelation();
2025       isl_space *Space = isl_map_get_space(Access->access);
2026       Space = isl_space_range(Space);
2027       Space = isl_space_from_range(Space);
2028       Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
2029       isl_map *Universe = isl_map_universe(Space);
2030       Access->tagged_access =
2031           isl_map_domain_product(Acc->getAccessRelation(), Universe);
2032       Access->exact_write = !Acc->isMayWrite();
2033       Access->ref_id = Acc->getId();
2034       Access->next = Accesses;
2035       Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
2036       Accesses = Access;
2037     }
2038 
2039     return Accesses;
2040   }
2041 
2042   /// Collect the list of GPU statements.
2043   ///
2044   /// Each statement has an id, a pointer to the underlying data structure,
2045   /// as well as a list with all memory accesses.
2046   ///
2047   /// TODO: Initialize the list of memory accesses.
2048   ///
2049   /// @returns A linked-list of statements.
2050   gpu_stmt *getStatements() {
2051     gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
2052                                        std::distance(S->begin(), S->end()));
2053 
2054     int i = 0;
2055     for (auto &Stmt : *S) {
2056       gpu_stmt *GPUStmt = &Stmts[i];
2057 
2058       GPUStmt->id = Stmt.getDomainId();
2059 
2060       // We use the pet stmt pointer to keep track of the Polly statements.
2061       GPUStmt->stmt = (pet_stmt *)&Stmt;
2062       GPUStmt->accesses = getStmtAccesses(Stmt);
2063       i++;
2064     }
2065 
2066     return Stmts;
2067   }
2068 
2069   /// Derive the extent of an array.
2070   ///
2071   /// The extent of an array is the set of elements that are within the
2072   /// accessed array. For the inner dimensions, the extent constraints are
2073   /// 0 and the size of the corresponding array dimension. For the first
2074   /// (outermost) dimension, the extent constraints are the minimal and maximal
2075   /// subscript value for the first dimension.
2076   ///
2077   /// @param Array The array to derive the extent for.
2078   ///
2079   /// @returns An isl_set describing the extent of the array.
2080   __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
2081     unsigned NumDims = Array->getNumberOfDimensions();
2082     isl_union_map *Accesses = S->getAccesses();
2083     Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
2084     Accesses = isl_union_map_detect_equalities(Accesses);
2085     isl_union_set *AccessUSet = isl_union_map_range(Accesses);
2086     AccessUSet = isl_union_set_coalesce(AccessUSet);
2087     AccessUSet = isl_union_set_detect_equalities(AccessUSet);
2088     AccessUSet = isl_union_set_coalesce(AccessUSet);
2089 
2090     if (isl_union_set_is_empty(AccessUSet)) {
2091       isl_union_set_free(AccessUSet);
2092       return isl_set_empty(Array->getSpace());
2093     }
2094 
2095     if (Array->getNumberOfDimensions() == 0) {
2096       isl_union_set_free(AccessUSet);
2097       return isl_set_universe(Array->getSpace());
2098     }
2099 
2100     isl_set *AccessSet =
2101         isl_union_set_extract_set(AccessUSet, Array->getSpace());
2102 
2103     isl_union_set_free(AccessUSet);
2104     isl_local_space *LS = isl_local_space_from_space(Array->getSpace());
2105 
2106     isl_pw_aff *Val =
2107         isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0));
2108 
2109     isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0);
2110     isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0);
2111     OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in,
2112                                    isl_pw_aff_dim(Val, isl_dim_in));
2113     OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in,
2114                                    isl_pw_aff_dim(Val, isl_dim_in));
2115     OuterMin =
2116         isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId());
2117     OuterMax =
2118         isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId());
2119 
2120     isl_set *Extent = isl_set_universe(Array->getSpace());
2121 
2122     Extent = isl_set_intersect(
2123         Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val)));
2124     Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val));
2125 
2126     for (unsigned i = 1; i < NumDims; ++i)
2127       Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0);
2128 
2129     for (unsigned i = 1; i < NumDims; ++i) {
2130       isl_pw_aff *PwAff =
2131           const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i));
2132       isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(
2133           isl_local_space_from_space(Array->getSpace()), isl_dim_set, i));
2134       PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in,
2135                                   isl_pw_aff_dim(Val, isl_dim_in));
2136       PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in,
2137                                       isl_pw_aff_get_tuple_id(Val, isl_dim_in));
2138       auto *Set = isl_pw_aff_gt_set(PwAff, Val);
2139       Extent = isl_set_intersect(Set, Extent);
2140     }
2141 
2142     return Extent;
2143   }
2144 
2145   /// Derive the bounds of an array.
2146   ///
2147   /// For the first dimension we derive the bound of the array from the extent
2148   /// of this dimension. For inner dimensions we obtain their size directly from
2149   /// ScopArrayInfo.
2150   ///
2151   /// @param PPCGArray The array to compute bounds for.
2152   /// @param Array The polly array from which to take the information.
2153   void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
2154     if (PPCGArray.n_index > 0) {
2155       if (isl_set_is_empty(PPCGArray.extent)) {
2156         isl_set *Dom = isl_set_copy(PPCGArray.extent);
2157         isl_local_space *LS = isl_local_space_from_space(
2158             isl_space_params(isl_set_get_space(Dom)));
2159         isl_set_free(Dom);
2160         isl_aff *Zero = isl_aff_zero_on_domain(LS);
2161         PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
2162       } else {
2163         isl_set *Dom = isl_set_copy(PPCGArray.extent);
2164         Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
2165         isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
2166         isl_set_free(Dom);
2167         Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
2168         isl_local_space *LS =
2169             isl_local_space_from_space(isl_set_get_space(Dom));
2170         isl_aff *One = isl_aff_zero_on_domain(LS);
2171         One = isl_aff_add_constant_si(One, 1);
2172         Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
2173         Bound = isl_pw_aff_gist(Bound, S->getContext());
2174         PPCGArray.bound[0] = Bound;
2175       }
2176     }
2177 
2178     for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
2179       isl_pw_aff *Bound = Array->getDimensionSizePw(i);
2180       auto LS = isl_pw_aff_get_domain_space(Bound);
2181       auto Aff = isl_multi_aff_zero(LS);
2182       Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
2183       PPCGArray.bound[i] = Bound;
2184     }
2185   }
2186 
2187   /// Create the arrays for @p PPCGProg.
2188   ///
2189   /// @param PPCGProg The program to compute the arrays for.
2190   void createArrays(gpu_prog *PPCGProg) {
2191     int i = 0;
2192     for (auto &Array : S->arrays()) {
2193       std::string TypeName;
2194       raw_string_ostream OS(TypeName);
2195 
2196       OS << *Array->getElementType();
2197       TypeName = OS.str();
2198 
2199       gpu_array_info &PPCGArray = PPCGProg->array[i];
2200 
2201       PPCGArray.space = Array->getSpace();
2202       PPCGArray.type = strdup(TypeName.c_str());
2203       PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
2204       PPCGArray.name = strdup(Array->getName().c_str());
2205       PPCGArray.extent = nullptr;
2206       PPCGArray.n_index = Array->getNumberOfDimensions();
2207       PPCGArray.bound =
2208           isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
2209       PPCGArray.extent = getExtent(Array);
2210       PPCGArray.n_ref = 0;
2211       PPCGArray.refs = nullptr;
2212       PPCGArray.accessed = true;
2213       PPCGArray.read_only_scalar =
2214           Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
2215       PPCGArray.has_compound_element = false;
2216       PPCGArray.local = false;
2217       PPCGArray.declare_local = false;
2218       PPCGArray.global = false;
2219       PPCGArray.linearize = false;
2220       PPCGArray.dep_order = nullptr;
2221       PPCGArray.user = Array;
2222 
2223       setArrayBounds(PPCGArray, Array);
2224       i++;
2225 
2226       collect_references(PPCGProg, &PPCGArray);
2227     }
2228   }
2229 
2230   /// Create an identity map between the arrays in the scop.
2231   ///
2232   /// @returns An identity map between the arrays in the scop.
2233   isl_union_map *getArrayIdentity() {
2234     isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());
2235 
2236     for (auto &Array : S->arrays()) {
2237       isl_space *Space = Array->getSpace();
2238       Space = isl_space_map_from_set(Space);
2239       isl_map *Identity = isl_map_identity(Space);
2240       Maps = isl_union_map_add_map(Maps, Identity);
2241     }
2242 
2243     return Maps;
2244   }
2245 
2246   /// Create a default-initialized PPCG GPU program.
2247   ///
2248   /// @returns A new gpu grogram description.
2249   gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {
2250 
2251     if (!PPCGScop)
2252       return nullptr;
2253 
2254     auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);
2255 
2256     PPCGProg->ctx = S->getIslCtx();
2257     PPCGProg->scop = PPCGScop;
2258     PPCGProg->context = isl_set_copy(PPCGScop->context);
2259     PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
2260     PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
2261     PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
2262     PPCGProg->tagged_must_kill =
2263         isl_union_map_copy(PPCGScop->tagged_must_kills);
2264     PPCGProg->to_inner = getArrayIdentity();
2265     PPCGProg->to_outer = getArrayIdentity();
2266     PPCGProg->any_to_outer = nullptr;
2267     PPCGProg->array_order = nullptr;
2268     PPCGProg->n_stmts = std::distance(S->begin(), S->end());
2269     PPCGProg->stmts = getStatements();
2270     PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
2271     PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
2272                                        PPCGProg->n_array);
2273 
2274     createArrays(PPCGProg);
2275 
2276     PPCGProg->may_persist = compute_may_persist(PPCGProg);
2277 
2278     return PPCGProg;
2279   }
2280 
2281   struct PrintGPUUserData {
2282     struct cuda_info *CudaInfo;
2283     struct gpu_prog *PPCGProg;
2284     std::vector<ppcg_kernel *> Kernels;
2285   };
2286 
2287   /// Print a user statement node in the host code.
2288   ///
2289   /// We use ppcg's printing facilities to print the actual statement and
2290   /// additionally build up a list of all kernels that are encountered in the
2291   /// host ast.
2292   ///
2293   /// @param P The printer to print to
2294   /// @param Options The printing options to use
2295   /// @param Node The node to print
2296   /// @param User A user pointer to carry additional data. This pointer is
2297   ///             expected to be of type PrintGPUUserData.
2298   ///
2299   /// @returns A printer to which the output has been printed.
2300   static __isl_give isl_printer *
2301   printHostUser(__isl_take isl_printer *P,
2302                 __isl_take isl_ast_print_options *Options,
2303                 __isl_take isl_ast_node *Node, void *User) {
2304     auto Data = (struct PrintGPUUserData *)User;
2305     auto Id = isl_ast_node_get_annotation(Node);
2306 
2307     if (Id) {
2308       bool IsUser = !strcmp(isl_id_get_name(Id), "user");
2309 
2310       // If this is a user statement, format it ourselves as ppcg would
2311       // otherwise try to call pet functionality that is not available in
2312       // Polly.
2313       if (IsUser) {
2314         P = isl_printer_start_line(P);
2315         P = isl_printer_print_ast_node(P, Node);
2316         P = isl_printer_end_line(P);
2317         isl_id_free(Id);
2318         isl_ast_print_options_free(Options);
2319         return P;
2320       }
2321 
2322       auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
2323       isl_id_free(Id);
2324       Data->Kernels.push_back(Kernel);
2325     }
2326 
2327     return print_host_user(P, Options, Node, User);
2328   }
2329 
2330   /// Print C code corresponding to the control flow in @p Kernel.
2331   ///
2332   /// @param Kernel The kernel to print
2333   void printKernel(ppcg_kernel *Kernel) {
2334     auto *P = isl_printer_to_str(S->getIslCtx());
2335     P = isl_printer_set_output_format(P, ISL_FORMAT_C);
2336     auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
2337     P = isl_ast_node_print(Kernel->tree, P, Options);
2338     char *String = isl_printer_get_str(P);
2339     printf("%s\n", String);
2340     free(String);
2341     isl_printer_free(P);
2342   }
2343 
2344   /// Print C code corresponding to the GPU code described by @p Tree.
2345   ///
2346   /// @param Tree An AST describing GPU code
2347   /// @param PPCGProg The PPCG program from which @Tree has been constructed.
2348   void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
2349     auto *P = isl_printer_to_str(S->getIslCtx());
2350     P = isl_printer_set_output_format(P, ISL_FORMAT_C);
2351 
2352     PrintGPUUserData Data;
2353     Data.PPCGProg = PPCGProg;
2354 
2355     auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
2356     Options =
2357         isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
2358     P = isl_ast_node_print(Tree, P, Options);
2359     char *String = isl_printer_get_str(P);
2360     printf("# host\n");
2361     printf("%s\n", String);
2362     free(String);
2363     isl_printer_free(P);
2364 
2365     for (auto Kernel : Data.Kernels) {
2366       printf("# kernel%d\n", Kernel->id);
2367       printKernel(Kernel);
2368     }
2369   }
2370 
2371   // Generate a GPU program using PPCG.
2372   //
2373   // GPU mapping consists of multiple steps:
2374   //
2375   //  1) Compute new schedule for the program.
2376   //  2) Map schedule to GPU (TODO)
2377   //  3) Generate code for new schedule (TODO)
2378   //
2379   // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer
2380   // is mostly CPU specific. Instead, we use PPCG's GPU code generation
2381   // strategy directly from this pass.
2382   gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {
2383 
2384     auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);
2385 
2386     PPCGGen->ctx = S->getIslCtx();
2387     PPCGGen->options = PPCGScop->options;
2388     PPCGGen->print = nullptr;
2389     PPCGGen->print_user = nullptr;
2390     PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
2391     PPCGGen->prog = PPCGProg;
2392     PPCGGen->tree = nullptr;
2393     PPCGGen->types.n = 0;
2394     PPCGGen->types.name = nullptr;
2395     PPCGGen->sizes = nullptr;
2396     PPCGGen->used_sizes = nullptr;
2397     PPCGGen->kernel_id = 0;
2398 
2399     // Set scheduling strategy to same strategy PPCG is using.
2400     isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
2401     isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
2402     isl_options_set_schedule_whole_component(PPCGGen->ctx, false);
2403 
2404     isl_schedule *Schedule = get_schedule(PPCGGen);
2405 
2406     int has_permutable = has_any_permutable_node(Schedule);
2407 
2408     if (!has_permutable || has_permutable < 0) {
2409       Schedule = isl_schedule_free(Schedule);
2410     } else {
2411       Schedule = map_to_device(PPCGGen, Schedule);
2412       PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
2413     }
2414 
2415     if (DumpSchedule) {
2416       isl_printer *P = isl_printer_to_str(S->getIslCtx());
2417       P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
2418       P = isl_printer_print_str(P, "Schedule\n");
2419       P = isl_printer_print_str(P, "========\n");
2420       if (Schedule)
2421         P = isl_printer_print_schedule(P, Schedule);
2422       else
2423         P = isl_printer_print_str(P, "No schedule found\n");
2424 
2425       printf("%s\n", isl_printer_get_str(P));
2426       isl_printer_free(P);
2427     }
2428 
2429     if (DumpCode) {
2430       printf("Code\n");
2431       printf("====\n");
2432       if (PPCGGen->tree)
2433         printGPUTree(PPCGGen->tree, PPCGProg);
2434       else
2435         printf("No code generated\n");
2436     }
2437 
2438     isl_schedule_free(Schedule);
2439 
2440     return PPCGGen;
2441   }
2442 
2443   /// Free gpu_gen structure.
2444   ///
2445   /// @param PPCGGen The ppcg_gen object to free.
2446   void freePPCGGen(gpu_gen *PPCGGen) {
2447     isl_ast_node_free(PPCGGen->tree);
2448     isl_union_map_free(PPCGGen->sizes);
2449     isl_union_map_free(PPCGGen->used_sizes);
2450     free(PPCGGen);
2451   }
2452 
2453   /// Free the options in the ppcg scop structure.
2454   ///
2455   /// ppcg is not freeing these options for us. To avoid leaks we do this
2456   /// ourselves.
2457   ///
2458   /// @param PPCGScop The scop referencing the options to free.
2459   void freeOptions(ppcg_scop *PPCGScop) {
2460     free(PPCGScop->options->debug);
2461     PPCGScop->options->debug = nullptr;
2462     free(PPCGScop->options);
2463     PPCGScop->options = nullptr;
2464   }
2465 
2466   /// Approximate the number of points in the set.
2467   ///
2468   /// This function returns an ast expression that overapproximates the number
2469   /// of points in an isl set through the rectangular hull surrounding this set.
2470   ///
2471   /// @param Set   The set to count.
2472   /// @param Build The isl ast build object to use for creating the ast
2473   ///              expression.
2474   ///
2475   /// @returns An approximation of the number of points in the set.
2476   __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
2477                                              __isl_keep isl_ast_build *Build) {
2478 
2479     isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
2480     auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));
2481 
2482     isl_space *Space = isl_set_get_space(Set);
2483     Space = isl_space_params(Space);
2484     auto *Univ = isl_set_universe(Space);
2485     isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);
2486 
2487     for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
2488       isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
2489       isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
2490       isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
2491       DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
2492       auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
2493       Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
2494     }
2495 
2496     isl_set_free(Set);
2497     isl_pw_aff_free(OneAff);
2498 
2499     return Expr;
2500   }
2501 
2502   /// Approximate a number of dynamic instructions executed by a given
2503   /// statement.
2504   ///
2505   /// @param Stmt  The statement for which to compute the number of dynamic
2506   ///              instructions.
2507   /// @param Build The isl ast build object to use for creating the ast
2508   ///              expression.
2509   /// @returns An approximation of the number of dynamic instructions executed
2510   ///          by @p Stmt.
2511   __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
2512                                              __isl_keep isl_ast_build *Build) {
2513     auto Iterations = approxPointsInSet(Stmt.getDomain(), Build);
2514 
2515     long InstCount = 0;
2516 
2517     if (Stmt.isBlockStmt()) {
2518       auto *BB = Stmt.getBasicBlock();
2519       InstCount = std::distance(BB->begin(), BB->end());
2520     } else {
2521       auto *R = Stmt.getRegion();
2522 
2523       for (auto *BB : R->blocks()) {
2524         InstCount += std::distance(BB->begin(), BB->end());
2525       }
2526     }
2527 
2528     isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
2529     auto *InstExpr = isl_ast_expr_from_val(InstVal);
2530     return isl_ast_expr_mul(InstExpr, Iterations);
2531   }
2532 
2533   /// Approximate dynamic instructions executed in scop.
2534   ///
2535   /// @param S     The scop for which to approximate dynamic instructions.
2536   /// @param Build The isl ast build object to use for creating the ast
2537   ///              expression.
2538   /// @returns An approximation of the number of dynamic instructions executed
2539   ///          in @p S.
2540   __isl_give isl_ast_expr *
2541   getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
2542     isl_ast_expr *Instructions;
2543 
2544     isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
2545     Instructions = isl_ast_expr_from_val(Zero);
2546 
2547     for (ScopStmt &Stmt : S) {
2548       isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
2549       Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
2550     }
2551     return Instructions;
2552   }
2553 
2554   /// Create a check that ensures sufficient compute in scop.
2555   ///
2556   /// @param S     The scop for which to ensure sufficient compute.
2557   /// @param Build The isl ast build object to use for creating the ast
2558   ///              expression.
2559   /// @returns An expression that evaluates to TRUE in case of sufficient
2560   ///          compute and to FALSE, otherwise.
2561   __isl_give isl_ast_expr *
2562   createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
2563     auto Iterations = getNumberOfIterations(S, Build);
2564     auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
2565     auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
2566     return isl_ast_expr_ge(Iterations, MinComputeExpr);
2567   }
2568 
2569   /// Generate code for a given GPU AST described by @p Root.
2570   ///
2571   /// @param Root An isl_ast_node pointing to the root of the GPU AST.
2572   /// @param Prog The GPU Program to generate code for.
2573   void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
2574     ScopAnnotator Annotator;
2575     Annotator.buildAliasScopes(*S);
2576 
2577     Region *R = &S->getRegion();
2578 
2579     simplifyRegion(R, DT, LI, RI);
2580 
2581     BasicBlock *EnteringBB = R->getEnteringBlock();
2582 
2583     PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);
2584 
2585     // Only build the run-time condition and parameters _after_ having
2586     // introduced the conditional branch. This is important as the conditional
2587     // branch will guard the original scop from new induction variables that
2588     // the SCEVExpander may introduce while code generating the parameters and
2589     // which may introduce scalar dependences that prevent us from correctly
2590     // code generating this scop.
2591     BasicBlock *StartBlock =
2592         executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
2593 
2594     GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
2595                                StartBlock, Prog, Runtime, Architecture);
2596 
2597     // TODO: Handle LICM
2598     auto SplitBlock = StartBlock->getSinglePredecessor();
2599     Builder.SetInsertPoint(SplitBlock->getTerminator());
2600     NodeBuilder.addParameters(S->getContext());
2601 
2602     isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
2603     isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build);
2604     isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
2605     Condition = isl_ast_expr_and(Condition, SufficientCompute);
2606     isl_ast_build_free(Build);
2607 
2608     Value *RTC = NodeBuilder.createRTC(Condition);
2609     Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
2610 
2611     Builder.SetInsertPoint(&*StartBlock->begin());
2612 
2613     NodeBuilder.initializeAfterRTH();
2614     NodeBuilder.create(Root);
2615     NodeBuilder.finalize();
2616 
2617     /// In case a sequential kernel has more surrounding loops as any parallel
2618     /// kernel, the SCoP is probably mostly sequential. Hence, there is no
2619     /// point in running it on a GPU.
2620     if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
2621       SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());
2622 
2623     if (!NodeBuilder.BuildSuccessful)
2624       SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());
2625   }
2626 
2627   bool runOnScop(Scop &CurrentScop) override {
2628     S = &CurrentScop;
2629     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2630     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2631     SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2632     DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
2633     RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
2634 
2635     // We currently do not support scops with invariant loads.
2636     if (S->hasInvariantAccesses())
2637       return false;
2638 
2639     auto PPCGScop = createPPCGScop();
2640     auto PPCGProg = createPPCGProg(PPCGScop);
2641     auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
2642 
2643     if (PPCGGen->tree)
2644       generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
2645 
2646     freeOptions(PPCGScop);
2647     freePPCGGen(PPCGGen);
2648     gpu_prog_free(PPCGProg);
2649     ppcg_scop_free(PPCGScop);
2650 
2651     return true;
2652   }
2653 
2654   void printScop(raw_ostream &, Scop &) const override {}
2655 
2656   void getAnalysisUsage(AnalysisUsage &AU) const override {
2657     AU.addRequired<DominatorTreeWrapperPass>();
2658     AU.addRequired<RegionInfoPass>();
2659     AU.addRequired<ScalarEvolutionWrapperPass>();
2660     AU.addRequired<ScopDetection>();
2661     AU.addRequired<ScopInfoRegionPass>();
2662     AU.addRequired<LoopInfoWrapperPass>();
2663 
2664     AU.addPreserved<AAResultsWrapperPass>();
2665     AU.addPreserved<BasicAAWrapperPass>();
2666     AU.addPreserved<LoopInfoWrapperPass>();
2667     AU.addPreserved<DominatorTreeWrapperPass>();
2668     AU.addPreserved<GlobalsAAWrapperPass>();
2669     AU.addPreserved<ScopDetection>();
2670     AU.addPreserved<ScalarEvolutionWrapperPass>();
2671     AU.addPreserved<SCEVAAWrapperPass>();
2672 
2673     // FIXME: We do not yet add regions for the newly generated code to the
2674     //        region tree.
2675     AU.addPreserved<RegionInfoPass>();
2676     AU.addPreserved<ScopInfoRegionPass>();
2677   }
2678 };
2679 } // namespace
2680 
2681 char PPCGCodeGeneration::ID = 1;
2682 
2683 Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
2684   PPCGCodeGeneration *generator = new PPCGCodeGeneration();
2685   generator->Runtime = Runtime;
2686   generator->Architecture = Arch;
2687   return generator;
2688 }
2689 
2690 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
2691                       "Polly - Apply PPCG translation to SCOP", false, false)
2692 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
2693 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
2694 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
2695 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
2696 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
2697 INITIALIZE_PASS_DEPENDENCY(ScopDetection);
2698 INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
2699                     "Polly - Apply PPCG translation to SCOP", false, false)
2700