1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg
11 // GPU mapping strategy.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "polly/CodeGen/IslAst.h"
16 #include "polly/CodeGen/IslNodeBuilder.h"
17 #include "polly/CodeGen/Utils.h"
18 #include "polly/DependenceInfo.h"
19 #include "polly/LinkAllPasses.h"
20 #include "polly/Options.h"
21 #include "polly/ScopDetection.h"
22 #include "polly/ScopInfo.h"
23 #include "polly/Support/SCEVValidator.h"
24 #include "llvm/ADT/PostOrderIterator.h"
25 #include "llvm/Analysis/AliasAnalysis.h"
26 #include "llvm/Analysis/BasicAliasAnalysis.h"
27 #include "llvm/Analysis/GlobalsModRef.h"
28 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
29 #include "llvm/Analysis/TargetLibraryInfo.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/IR/LegacyPassManager.h"
32 #include "llvm/IR/Verifier.h"
33 #include "llvm/Support/TargetRegistry.h"
34 #include "llvm/Support/TargetSelect.h"
35 #include "llvm/Target/TargetMachine.h"
36 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
38 
39 #include "isl/union_map.h"
40 
41 extern "C" {
42 #include "ppcg/cuda.h"
43 #include "ppcg/gpu.h"
44 #include "ppcg/gpu_print.h"
45 #include "ppcg/ppcg.h"
46 #include "ppcg/schedule.h"
47 }
48 
49 #include "llvm/Support/Debug.h"
50 
51 using namespace polly;
52 using namespace llvm;
53 
54 #define DEBUG_TYPE "polly-codegen-ppcg"
55 
56 static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
57                                   cl::desc("Dump the computed GPU Schedule"),
58                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
59                                   cl::cat(PollyCategory));
60 
61 static cl::opt<bool>
62     DumpCode("polly-acc-dump-code",
63              cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
64              cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
65 
66 static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
67                                   cl::desc("Dump the kernel LLVM-IR"),
68                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
69                                   cl::cat(PollyCategory));
70 
71 static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
72                                    cl::desc("Dump the kernel assembly code"),
73                                    cl::Hidden, cl::init(false), cl::ZeroOrMore,
74                                    cl::cat(PollyCategory));
75 
76 static cl::opt<bool> FastMath("polly-acc-fastmath",
77                               cl::desc("Allow unsafe math optimizations"),
78                               cl::Hidden, cl::init(false), cl::ZeroOrMore,
79                               cl::cat(PollyCategory));
80 static cl::opt<bool> SharedMemory("polly-acc-use-shared",
81                                   cl::desc("Use shared memory"), cl::Hidden,
82                                   cl::init(false), cl::ZeroOrMore,
83                                   cl::cat(PollyCategory));
84 static cl::opt<bool> PrivateMemory("polly-acc-use-private",
85                                    cl::desc("Use private memory"), cl::Hidden,
86                                    cl::init(false), cl::ZeroOrMore,
87                                    cl::cat(PollyCategory));
88 
89 static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
90                                    cl::desc("Generate Host kernel code assuming"
91                                             " that all memory has been"
92                                             " declared as managed memory"),
93                                    cl::Hidden, cl::init(false), cl::ZeroOrMore,
94                                    cl::cat(PollyCategory));
95 
96 static cl::opt<std::string>
97     CudaVersion("polly-acc-cuda-version",
98                 cl::desc("The CUDA version to compile for"), cl::Hidden,
99                 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
100 
101 static cl::opt<int>
102     MinCompute("polly-acc-mincompute",
103                cl::desc("Minimal number of compute statements to run on GPU."),
104                cl::Hidden, cl::init(10 * 512 * 512));
105 
106 /// Create the ast expressions for a ScopStmt.
107 ///
108 /// This function is a callback for to generate the ast expressions for each
109 /// of the scheduled ScopStmts.
110 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
111     void *StmtT, isl_ast_build *Build,
112     isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
113                                        isl_id *Id, void *User),
114     void *UserIndex,
115     isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
116     void *UserExpr) {
117 
118   ScopStmt *Stmt = (ScopStmt *)StmtT;
119 
120   isl_ctx *Ctx;
121 
122   if (!Stmt || !Build)
123     return NULL;
124 
125   Ctx = isl_ast_build_get_ctx(Build);
126   isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);
127 
128   for (MemoryAccess *Acc : *Stmt) {
129     isl_map *AddrFunc = Acc->getAddressFunction();
130     AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
131     isl_id *RefId = Acc->getId();
132     isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
133     isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
134     MPA = isl_multi_pw_aff_coalesce(MPA);
135     MPA = FunctionIndex(MPA, RefId, UserIndex);
136     isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
137     Access = FunctionExpr(Access, RefId, UserExpr);
138     RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
139   }
140 
141   return RefToExpr;
142 }
143 
144 /// Generate code for a GPU specific isl AST.
145 ///
146 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
147 /// generates code for general-prupose AST nodes, with special functionality
148 /// for generating GPU specific user nodes.
149 ///
150 /// @see GPUNodeBuilder::createUser
151 class GPUNodeBuilder : public IslNodeBuilder {
152 public:
153   GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
154                  const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
155                  DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
156                  gpu_prog *Prog)
157       : IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
158         Prog(Prog) {
159     getExprBuilder().setIDToSAI(&IDToSAI);
160   }
161 
162   /// Create after-run-time-check initialization code.
163   void initializeAfterRTH();
164 
165   /// Finalize the generated scop.
166   virtual void finalize();
167 
168   /// Track if the full build process was successful.
169   ///
170   /// This value is set to false, if throughout the build process an error
171   /// occurred which prevents us from generating valid GPU code.
172   bool BuildSuccessful = true;
173 
174   /// The maximal number of loops surrounding a sequential kernel.
175   unsigned DeepestSequential = 0;
176 
177   /// The maximal number of loops surrounding a parallel kernel.
178   unsigned DeepestParallel = 0;
179 
180 private:
181   /// A vector of array base pointers for which a new ScopArrayInfo was created.
182   ///
183   /// This vector is used to delete the ScopArrayInfo when it is not needed any
184   /// more.
185   std::vector<Value *> LocalArrays;
186 
187   /// A map from ScopArrays to their corresponding device allocations.
188   std::map<ScopArrayInfo *, Value *> DeviceAllocations;
189 
190   /// The current GPU context.
191   Value *GPUContext;
192 
193   /// The set of isl_ids allocated in the kernel
194   std::vector<isl_id *> KernelIds;
195 
196   /// A module containing GPU code.
197   ///
198   /// This pointer is only set in case we are currently generating GPU code.
199   std::unique_ptr<Module> GPUModule;
200 
201   /// The GPU program we generate code for.
202   gpu_prog *Prog;
203 
204   /// Class to free isl_ids.
205   class IslIdDeleter {
206   public:
207     void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
208   };
209 
210   /// A set containing all isl_ids allocated in a GPU kernel.
211   ///
212   /// By releasing this set all isl_ids will be freed.
213   std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;
214 
215   IslExprBuilder::IDToScopArrayInfoTy IDToSAI;
216 
217   /// Create code for user-defined AST nodes.
218   ///
219   /// These AST nodes can be of type:
220   ///
221   ///   - ScopStmt:      A computational statement (TODO)
222   ///   - Kernel:        A GPU kernel call (TODO)
223   ///   - Data-Transfer: A GPU <-> CPU data-transfer
224   ///   - In-kernel synchronization
225   ///   - In-kernel memory copy statement
226   ///
227   /// @param UserStmt The ast node to generate code for.
228   virtual void createUser(__isl_take isl_ast_node *UserStmt);
229 
230   enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };
231 
232   /// Create code for a data transfer statement
233   ///
234   /// @param TransferStmt The data transfer statement.
235   /// @param Direction The direction in which to transfer data.
236   void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
237                           enum DataDirection Direction);
238 
239   /// Find llvm::Values referenced in GPU kernel.
240   ///
241   /// @param Kernel The kernel to scan for llvm::Values
242   ///
243   /// @returns A set of values referenced by the kernel.
244   SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
245 
246   /// Compute the sizes of the execution grid for a given kernel.
247   ///
248   /// @param Kernel The kernel to compute grid sizes for.
249   ///
250   /// @returns A tuple with grid sizes for X and Y dimension
251   std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
252 
253   /// Creates a array that can be sent to the kernel on the device using a
254   /// host pointer. This is required for managed memory, when we directly send
255   /// host pointers to the device.
256   /// \note
257   /// This is to be used only with managed memory
258   Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
259                                        ScopArrayInfo *ArrayInfo);
260 
261   /// Compute the sizes of the thread blocks for a given kernel.
262   ///
263   /// @param Kernel The kernel to compute thread block sizes for.
264   ///
265   /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
266   std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
267 
268   /// Create kernel launch parameters.
269   ///
270   /// @param Kernel        The kernel to create parameters for.
271   /// @param F             The kernel function that has been created.
272   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
273   ///
274   /// @returns A stack allocated array with pointers to the parameter
275   ///          values that are passed to the kernel.
276   Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
277                                 SetVector<Value *> SubtreeValues);
278 
279   /// Create declarations for kernel variable.
280   ///
281   /// This includes shared memory declarations.
282   ///
283   /// @param Kernel        The kernel definition to create variables for.
284   /// @param FN            The function into which to generate the variables.
285   void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
286 
287   /// Add CUDA annotations to module.
288   ///
289   /// Add a set of CUDA annotations that declares the maximal block dimensions
290   /// that will be used to execute the CUDA kernel. This allows the NVIDIA
291   /// PTX compiler to bound the number of allocated registers to ensure the
292   /// resulting kernel is known to run with up to as many block dimensions
293   /// as specified here.
294   ///
295   /// @param M         The module to add the annotations to.
296   /// @param BlockDimX The size of block dimension X.
297   /// @param BlockDimY The size of block dimension Y.
298   /// @param BlockDimZ The size of block dimension Z.
299   void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
300                           Value *BlockDimZ);
301 
302   /// Create GPU kernel.
303   ///
304   /// Code generate the kernel described by @p KernelStmt.
305   ///
306   /// @param KernelStmt The ast node to generate kernel code for.
307   void createKernel(__isl_take isl_ast_node *KernelStmt);
308 
309   /// Generate code that computes the size of an array.
310   ///
311   /// @param Array The array for which to compute a size.
312   Value *getArraySize(gpu_array_info *Array);
313 
314   /// Generate code to compute the minimal offset at which an array is accessed.
315   ///
316   /// The offset of an array is the minimal array location accessed in a scop.
317   ///
318   /// Example:
319   ///
320   ///   for (long i = 0; i < 100; i++)
321   ///     A[i + 42] += ...
322   ///
323   ///   getArrayOffset(A) results in 42.
324   ///
325   /// @param Array The array for which to compute the offset.
326   /// @returns An llvm::Value that contains the offset of the array.
327   Value *getArrayOffset(gpu_array_info *Array);
328 
329   /// Prepare the kernel arguments for kernel code generation
330   ///
331   /// @param Kernel The kernel to generate code for.
332   /// @param FN     The function created for the kernel.
333   void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);
334 
335   /// Create kernel function.
336   ///
337   /// Create a kernel function located in a newly created module that can serve
338   /// as target for device code generation. Set the Builder to point to the
339   /// start block of this newly created function.
340   ///
341   /// @param Kernel The kernel to generate code for.
342   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
343   void createKernelFunction(ppcg_kernel *Kernel,
344                             SetVector<Value *> &SubtreeValues);
345 
346   /// Create the declaration of a kernel function.
347   ///
348   /// The kernel function takes as arguments:
349   ///
350   ///   - One i8 pointer for each external array reference used in the kernel.
351   ///   - Host iterators
352   ///   - Parameters
353   ///   - Other LLVM Value references (TODO)
354   ///
355   /// @param Kernel The kernel to generate the function declaration for.
356   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
357   ///
358   /// @returns The newly declared function.
359   Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
360                                      SetVector<Value *> &SubtreeValues);
361 
362   /// Insert intrinsic functions to obtain thread and block ids.
363   ///
364   /// @param The kernel to generate the intrinsic functions for.
365   void insertKernelIntrinsics(ppcg_kernel *Kernel);
366 
367   /// Create a global-to-shared or shared-to-global copy statement.
368   ///
369   /// @param CopyStmt The copy statement to generate code for
370   void createKernelCopy(ppcg_kernel_stmt *CopyStmt);
371 
372   /// Create code for a ScopStmt called in @p Expr.
373   ///
374   /// @param Expr The expression containing the call.
375   /// @param KernelStmt The kernel statement referenced in the call.
376   void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
377 
378   /// Create an in-kernel synchronization call.
379   void createKernelSync();
380 
381   /// Create a PTX assembly string for the current GPU kernel.
382   ///
383   /// @returns A string containing the corresponding PTX assembly code.
384   std::string createKernelASM();
385 
386   /// Remove references from the dominator tree to the kernel function @p F.
387   ///
388   /// @param F The function to remove references to.
389   void clearDominators(Function *F);
390 
391   /// Remove references from scalar evolution to the kernel function @p F.
392   ///
393   /// @param F The function to remove references to.
394   void clearScalarEvolution(Function *F);
395 
396   /// Remove references from loop info to the kernel function @p F.
397   ///
398   /// @param F The function to remove references to.
399   void clearLoops(Function *F);
400 
401   /// Finalize the generation of the kernel function.
402   ///
403   /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
404   /// dump its IR to stderr.
405   ///
406   /// @returns The Assembly string of the kernel.
407   std::string finalizeKernelFunction();
408 
409   /// Finalize the generation of the kernel arguments.
410   ///
411   /// This function ensures that not-read-only scalars used in a kernel are
412   /// stored back to the global memory location they ared backed up with before
413   /// the kernel terminates.
414   ///
415   /// @params Kernel The kernel to finalize kernel arguments for.
416   void finalizeKernelArguments(ppcg_kernel *Kernel);
417 
418   /// Create code that allocates memory to store arrays on device.
419   void allocateDeviceArrays();
420 
421   /// Free all allocated device arrays.
422   void freeDeviceArrays();
423 
424   /// Create a call to initialize the GPU context.
425   ///
426   /// @returns A pointer to the newly initialized context.
427   Value *createCallInitContext();
428 
429   /// Create a call to get the device pointer for a kernel allocation.
430   ///
431   /// @param Allocation The Polly GPU allocation
432   ///
433   /// @returns The device parameter corresponding to this allocation.
434   Value *createCallGetDevicePtr(Value *Allocation);
435 
436   /// Create a call to free the GPU context.
437   ///
438   /// @param Context A pointer to an initialized GPU context.
439   void createCallFreeContext(Value *Context);
440 
441   /// Create a call to allocate memory on the device.
442   ///
443   /// @param Size The size of memory to allocate
444   ///
445   /// @returns A pointer that identifies this allocation.
446   Value *createCallAllocateMemoryForDevice(Value *Size);
447 
448   /// Create a call to free a device array.
449   ///
450   /// @param Array The device array to free.
451   void createCallFreeDeviceMemory(Value *Array);
452 
453   /// Create a call to copy data from host to device.
454   ///
455   /// @param HostPtr A pointer to the host data that should be copied.
456   /// @param DevicePtr A device pointer specifying the location to copy to.
457   void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
458                                       Value *Size);
459 
460   /// Create a call to copy data from device to host.
461   ///
462   /// @param DevicePtr A pointer to the device data that should be copied.
463   /// @param HostPtr A host pointer specifying the location to copy to.
464   void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
465                                       Value *Size);
466 
467   /// Create a call to synchronize Host & Device.
468   /// \note
469   /// This is to be used only with managed memory.
470   void createCallSynchronizeDevice();
471 
472   /// Create a call to get a kernel from an assembly string.
473   ///
474   /// @param Buffer The string describing the kernel.
475   /// @param Entry  The name of the kernel function to call.
476   ///
477   /// @returns A pointer to a kernel object
478   Value *createCallGetKernel(Value *Buffer, Value *Entry);
479 
480   /// Create a call to free a GPU kernel.
481   ///
482   /// @param GPUKernel THe kernel to free.
483   void createCallFreeKernel(Value *GPUKernel);
484 
485   /// Create a call to launch a GPU kernel.
486   ///
487   /// @param GPUKernel  The kernel to launch.
488   /// @param GridDimX   The size of the first grid dimension.
489   /// @param GridDimY   The size of the second grid dimension.
490   /// @param GridBlockX The size of the first block dimension.
491   /// @param GridBlockY The size of the second block dimension.
492   /// @param GridBlockZ The size of the third block dimension.
493   /// @param Paramters  A pointer to an array that contains itself pointers to
494   ///                   the parameter values passed for each kernel argument.
495   void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
496                               Value *GridDimY, Value *BlockDimX,
497                               Value *BlockDimY, Value *BlockDimZ,
498                               Value *Parameters);
499 };
500 
501 void GPUNodeBuilder::initializeAfterRTH() {
502   BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
503                                  &*Builder.GetInsertPoint(), &DT, &LI);
504   NewBB->setName("polly.acc.initialize");
505   Builder.SetInsertPoint(&NewBB->front());
506 
507   GPUContext = createCallInitContext();
508 
509   if (!ManagedMemory)
510     allocateDeviceArrays();
511 }
512 
513 void GPUNodeBuilder::finalize() {
514   if (!ManagedMemory)
515     freeDeviceArrays();
516 
517   createCallFreeContext(GPUContext);
518   IslNodeBuilder::finalize();
519 }
520 
521 void GPUNodeBuilder::allocateDeviceArrays() {
522   assert(!ManagedMemory && "Managed memory will directly send host pointers "
523                            "to the kernel. There is no need for device arrays");
524   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
525 
526   for (int i = 0; i < Prog->n_array; ++i) {
527     gpu_array_info *Array = &Prog->array[i];
528     auto *ScopArray = (ScopArrayInfo *)Array->user;
529     std::string DevArrayName("p_dev_array_");
530     DevArrayName.append(Array->name);
531 
532     Value *ArraySize = getArraySize(Array);
533     Value *Offset = getArrayOffset(Array);
534     if (Offset)
535       ArraySize = Builder.CreateSub(
536           ArraySize,
537           Builder.CreateMul(Offset,
538                             Builder.getInt64(ScopArray->getElemSizeInBytes())));
539     Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
540     DevArray->setName(DevArrayName);
541     DeviceAllocations[ScopArray] = DevArray;
542   }
543 
544   isl_ast_build_free(Build);
545 }
546 
547 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
548                                         Value *BlockDimY, Value *BlockDimZ) {
549   auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
550 
551   for (auto &F : *M) {
552     if (F.getCallingConv() != CallingConv::PTX_Kernel)
553       continue;
554 
555     Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};
556 
557     Metadata *Elements[] = {
558         ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
559         ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
560         ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
561         ValueAsMetadata::get(V[2]),
562     };
563     MDNode *Node = MDNode::get(M->getContext(), Elements);
564     AnnotationNode->addOperand(Node);
565   }
566 }
567 
568 void GPUNodeBuilder::freeDeviceArrays() {
569   assert(!ManagedMemory && "Managed memory does not use device arrays");
570   for (auto &Array : DeviceAllocations)
571     createCallFreeDeviceMemory(Array.second);
572 }
573 
574 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
575   const char *Name = "polly_getKernel";
576   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
577   Function *F = M->getFunction(Name);
578 
579   // If F is not available, declare it.
580   if (!F) {
581     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
582     std::vector<Type *> Args;
583     Args.push_back(Builder.getInt8PtrTy());
584     Args.push_back(Builder.getInt8PtrTy());
585     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
586     F = Function::Create(Ty, Linkage, Name, M);
587   }
588 
589   return Builder.CreateCall(F, {Buffer, Entry});
590 }
591 
592 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
593   const char *Name = "polly_getDevicePtr";
594   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
595   Function *F = M->getFunction(Name);
596 
597   // If F is not available, declare it.
598   if (!F) {
599     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
600     std::vector<Type *> Args;
601     Args.push_back(Builder.getInt8PtrTy());
602     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
603     F = Function::Create(Ty, Linkage, Name, M);
604   }
605 
606   return Builder.CreateCall(F, {Allocation});
607 }
608 
609 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
610                                             Value *GridDimY, Value *BlockDimX,
611                                             Value *BlockDimY, Value *BlockDimZ,
612                                             Value *Parameters) {
613   const char *Name = "polly_launchKernel";
614   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
615   Function *F = M->getFunction(Name);
616 
617   // If F is not available, declare it.
618   if (!F) {
619     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
620     std::vector<Type *> Args;
621     Args.push_back(Builder.getInt8PtrTy());
622     Args.push_back(Builder.getInt32Ty());
623     Args.push_back(Builder.getInt32Ty());
624     Args.push_back(Builder.getInt32Ty());
625     Args.push_back(Builder.getInt32Ty());
626     Args.push_back(Builder.getInt32Ty());
627     Args.push_back(Builder.getInt8PtrTy());
628     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
629     F = Function::Create(Ty, Linkage, Name, M);
630   }
631 
632   Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
633                          BlockDimZ, Parameters});
634 }
635 
636 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
637   const char *Name = "polly_freeKernel";
638   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
639   Function *F = M->getFunction(Name);
640 
641   // If F is not available, declare it.
642   if (!F) {
643     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
644     std::vector<Type *> Args;
645     Args.push_back(Builder.getInt8PtrTy());
646     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
647     F = Function::Create(Ty, Linkage, Name, M);
648   }
649 
650   Builder.CreateCall(F, {GPUKernel});
651 }
652 
653 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
654   assert(!ManagedMemory && "Managed memory does not allocate or free memory "
655                            "for device");
656   const char *Name = "polly_freeDeviceMemory";
657   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
658   Function *F = M->getFunction(Name);
659 
660   // If F is not available, declare it.
661   if (!F) {
662     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
663     std::vector<Type *> Args;
664     Args.push_back(Builder.getInt8PtrTy());
665     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
666     F = Function::Create(Ty, Linkage, Name, M);
667   }
668 
669   Builder.CreateCall(F, {Array});
670 }
671 
672 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
673   assert(!ManagedMemory && "Managed memory does not allocate or free memory "
674                            "for device");
675   const char *Name = "polly_allocateMemoryForDevice";
676   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
677   Function *F = M->getFunction(Name);
678 
679   // If F is not available, declare it.
680   if (!F) {
681     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
682     std::vector<Type *> Args;
683     Args.push_back(Builder.getInt64Ty());
684     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
685     F = Function::Create(Ty, Linkage, Name, M);
686   }
687 
688   return Builder.CreateCall(F, {Size});
689 }
690 
691 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
692                                                     Value *DeviceData,
693                                                     Value *Size) {
694   assert(!ManagedMemory && "Managed memory does not transfer memory between "
695                            "device and host");
696   const char *Name = "polly_copyFromHostToDevice";
697   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
698   Function *F = M->getFunction(Name);
699 
700   // If F is not available, declare it.
701   if (!F) {
702     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
703     std::vector<Type *> Args;
704     Args.push_back(Builder.getInt8PtrTy());
705     Args.push_back(Builder.getInt8PtrTy());
706     Args.push_back(Builder.getInt64Ty());
707     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
708     F = Function::Create(Ty, Linkage, Name, M);
709   }
710 
711   Builder.CreateCall(F, {HostData, DeviceData, Size});
712 }
713 
714 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
715                                                     Value *HostData,
716                                                     Value *Size) {
717   assert(!ManagedMemory && "Managed memory does not transfer memory between "
718                            "device and host");
719   const char *Name = "polly_copyFromDeviceToHost";
720   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
721   Function *F = M->getFunction(Name);
722 
723   // If F is not available, declare it.
724   if (!F) {
725     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
726     std::vector<Type *> Args;
727     Args.push_back(Builder.getInt8PtrTy());
728     Args.push_back(Builder.getInt8PtrTy());
729     Args.push_back(Builder.getInt64Ty());
730     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
731     F = Function::Create(Ty, Linkage, Name, M);
732   }
733 
734   Builder.CreateCall(F, {DeviceData, HostData, Size});
735 }
736 
737 void GPUNodeBuilder::createCallSynchronizeDevice() {
738   assert(ManagedMemory && "explicit synchronization is only necessary for "
739                           "managed memory");
740   const char *Name = "polly_synchronizeDevice";
741   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
742   Function *F = M->getFunction(Name);
743 
744   // If F is not available, declare it.
745   if (!F) {
746     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
747     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
748     F = Function::Create(Ty, Linkage, Name, M);
749   }
750 
751   Builder.CreateCall(F);
752 }
753 
754 Value *GPUNodeBuilder::createCallInitContext() {
755   const char *Name = "polly_initContext";
756   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
757   Function *F = M->getFunction(Name);
758 
759   // If F is not available, declare it.
760   if (!F) {
761     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
762     std::vector<Type *> Args;
763     FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
764     F = Function::Create(Ty, Linkage, Name, M);
765   }
766 
767   return Builder.CreateCall(F, {});
768 }
769 
770 void GPUNodeBuilder::createCallFreeContext(Value *Context) {
771   const char *Name = "polly_freeContext";
772   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
773   Function *F = M->getFunction(Name);
774 
775   // If F is not available, declare it.
776   if (!F) {
777     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
778     std::vector<Type *> Args;
779     Args.push_back(Builder.getInt8PtrTy());
780     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
781     F = Function::Create(Ty, Linkage, Name, M);
782   }
783 
784   Builder.CreateCall(F, {Context});
785 }
786 
787 /// Check if one string is a prefix of another.
788 ///
789 /// @param String The string in which to look for the prefix.
790 /// @param Prefix The prefix to look for.
791 static bool isPrefix(std::string String, std::string Prefix) {
792   return String.find(Prefix) == 0;
793 }
794 
795 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
796   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
797   Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);
798 
799   if (!gpu_array_is_scalar(Array)) {
800     auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
801     isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);
802 
803     for (unsigned int i = 1; i < Array->n_index; i++) {
804       isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
805       isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
806       Res = isl_ast_expr_mul(Res, Expr);
807     }
808 
809     Value *NumElements = ExprBuilder.create(Res);
810     if (NumElements->getType() != ArraySize->getType())
811       NumElements = Builder.CreateSExt(NumElements, ArraySize->getType());
812     ArraySize = Builder.CreateMul(ArraySize, NumElements);
813   }
814   isl_ast_build_free(Build);
815   return ArraySize;
816 }
817 
818 Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
819   if (gpu_array_is_scalar(Array))
820     return nullptr;
821 
822   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
823 
824   isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));
825 
826   isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));
827 
828   for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
829     ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);
830 
831   if (isl_set_is_subset(Min, ZeroSet)) {
832     isl_set_free(Min);
833     isl_set_free(ZeroSet);
834     isl_ast_build_free(Build);
835     return nullptr;
836   }
837   isl_set_free(ZeroSet);
838 
839   isl_ast_expr *Result =
840       isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));
841 
842   for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
843     if (i > 0) {
844       isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
845       isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
846       Result = isl_ast_expr_mul(Result, BExpr);
847     }
848     isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
849     isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
850     Result = isl_ast_expr_add(Result, MExpr);
851   }
852 
853   Value *ResultValue = ExprBuilder.create(Result);
854   isl_set_free(Min);
855   isl_ast_build_free(Build);
856 
857   return ResultValue;
858 }
859 
860 Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
861                                                      ScopArrayInfo *ArrayInfo) {
862 
863   assert(ManagedMemory && "Only used when you wish to get a host "
864                           "pointer for sending data to the kernel, "
865                           "with managed memory");
866   std::map<ScopArrayInfo *, Value *>::iterator it;
867   if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
868     return it->second;
869   } else {
870     Value *HostPtr;
871 
872     if (gpu_array_is_scalar(Array))
873       HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
874     else
875       HostPtr = ArrayInfo->getBasePtr();
876 
877     Value *Offset = getArrayOffset(Array);
878     if (Offset) {
879       HostPtr = Builder.CreatePointerCast(
880           HostPtr, ArrayInfo->getElementType()->getPointerTo());
881       HostPtr = Builder.CreateGEP(HostPtr, Offset);
882     }
883 
884     HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
885     DeviceAllocations[ArrayInfo] = HostPtr;
886     return HostPtr;
887   }
888 }
889 
890 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
891                                         enum DataDirection Direction) {
892   assert(!ManagedMemory && "Managed memory needs no data transfers");
893   isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
894   isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
895   isl_id *Id = isl_ast_expr_get_id(Arg);
896   auto Array = (gpu_array_info *)isl_id_get_user(Id);
897   auto ScopArray = (ScopArrayInfo *)(Array->user);
898 
899   Value *Size = getArraySize(Array);
900   Value *Offset = getArrayOffset(Array);
901   Value *DevPtr = DeviceAllocations[ScopArray];
902 
903   Value *HostPtr;
904 
905   if (gpu_array_is_scalar(Array))
906     HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
907   else
908     HostPtr = ScopArray->getBasePtr();
909 
910   if (Offset) {
911     HostPtr = Builder.CreatePointerCast(
912         HostPtr, ScopArray->getElementType()->getPointerTo());
913     HostPtr = Builder.CreateGEP(HostPtr, Offset);
914   }
915 
916   HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
917 
918   if (Offset) {
919     Size = Builder.CreateSub(
920         Size, Builder.CreateMul(
921                   Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
922   }
923 
924   if (Direction == HOST_TO_DEVICE)
925     createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
926   else
927     createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);
928 
929   isl_id_free(Id);
930   isl_ast_expr_free(Arg);
931   isl_ast_expr_free(Expr);
932   isl_ast_node_free(TransferStmt);
933 }
934 
935 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
936   isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
937   isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
938   isl_id *Id = isl_ast_expr_get_id(StmtExpr);
939   isl_id_free(Id);
940   isl_ast_expr_free(StmtExpr);
941 
942   const char *Str = isl_id_get_name(Id);
943   if (!strcmp(Str, "kernel")) {
944     createKernel(UserStmt);
945     isl_ast_expr_free(Expr);
946     return;
947   }
948 
949   if (isPrefix(Str, "to_device")) {
950     if (!ManagedMemory)
951       createDataTransfer(UserStmt, HOST_TO_DEVICE);
952     else
953       isl_ast_node_free(UserStmt);
954 
955     isl_ast_expr_free(Expr);
956     return;
957   }
958 
959   if (isPrefix(Str, "from_device")) {
960     if (!ManagedMemory) {
961       createDataTransfer(UserStmt, DEVICE_TO_HOST);
962     } else {
963       createCallSynchronizeDevice();
964       isl_ast_node_free(UserStmt);
965     }
966     isl_ast_expr_free(Expr);
967     return;
968   }
969 
970   isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
971   struct ppcg_kernel_stmt *KernelStmt =
972       (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
973   isl_id_free(Anno);
974 
975   switch (KernelStmt->type) {
976   case ppcg_kernel_domain:
977     createScopStmt(Expr, KernelStmt);
978     isl_ast_node_free(UserStmt);
979     return;
980   case ppcg_kernel_copy:
981     createKernelCopy(KernelStmt);
982     isl_ast_expr_free(Expr);
983     isl_ast_node_free(UserStmt);
984     return;
985   case ppcg_kernel_sync:
986     createKernelSync();
987     isl_ast_expr_free(Expr);
988     isl_ast_node_free(UserStmt);
989     return;
990   }
991 
992   isl_ast_expr_free(Expr);
993   isl_ast_node_free(UserStmt);
994   return;
995 }
996 void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
997   isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
998   LocalIndex = isl_ast_expr_address_of(LocalIndex);
999   Value *LocalAddr = ExprBuilder.create(LocalIndex);
1000   isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
1001   Index = isl_ast_expr_address_of(Index);
1002   Value *GlobalAddr = ExprBuilder.create(Index);
1003 
1004   if (KernelStmt->u.c.read) {
1005     LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read");
1006     Builder.CreateStore(Load, LocalAddr);
1007   } else {
1008     LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write");
1009     Builder.CreateStore(Load, GlobalAddr);
1010   }
1011 }
1012 
1013 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
1014                                     ppcg_kernel_stmt *KernelStmt) {
1015   auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
1016   isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;
1017 
1018   LoopToScevMapT LTS;
1019   LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());
1020 
1021   createSubstitutions(Expr, Stmt, LTS);
1022 
1023   if (Stmt->isBlockStmt())
1024     BlockGen.copyStmt(*Stmt, LTS, Indexes);
1025   else
1026     RegionGen.copyStmt(*Stmt, LTS, Indexes);
1027 }
1028 
1029 void GPUNodeBuilder::createKernelSync() {
1030   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
1031   auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
1032   Builder.CreateCall(Sync, {});
1033 }
1034 
1035 /// Collect llvm::Values referenced from @p Node
1036 ///
1037 /// This function only applies to isl_ast_nodes that are user_nodes referring
1038 /// to a ScopStmt. All other node types are ignore.
1039 ///
1040 /// @param Node The node to collect references for.
1041 /// @param User A user pointer used as storage for the data that is collected.
1042 ///
1043 /// @returns isl_bool_true if data could be collected successfully.
1044 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
1045   if (isl_ast_node_get_type(Node) != isl_ast_node_user)
1046     return isl_bool_true;
1047 
1048   isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
1049   isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
1050   isl_id *Id = isl_ast_expr_get_id(StmtExpr);
1051   const char *Str = isl_id_get_name(Id);
1052   isl_id_free(Id);
1053   isl_ast_expr_free(StmtExpr);
1054   isl_ast_expr_free(Expr);
1055 
1056   if (!isPrefix(Str, "Stmt"))
1057     return isl_bool_true;
1058 
1059   Id = isl_ast_node_get_annotation(Node);
1060   auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
1061   auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
1062   isl_id_free(Id);
1063 
1064   addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);
1065 
1066   return isl_bool_true;
1067 }
1068 
1069 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
1070   SetVector<Value *> SubtreeValues;
1071   SetVector<const SCEV *> SCEVs;
1072   SetVector<const Loop *> Loops;
1073   SubtreeReferences References = {
1074       LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};
1075 
1076   for (const auto &I : IDToValue)
1077     SubtreeValues.insert(I.second);
1078 
1079   isl_ast_node_foreach_descendant_top_down(
1080       Kernel->tree, collectReferencesInGPUStmt, &References);
1081 
1082   for (const SCEV *Expr : SCEVs)
1083     findValues(Expr, SE, SubtreeValues);
1084 
1085   for (auto &SAI : S.arrays())
1086     SubtreeValues.remove(SAI->getBasePtr());
1087 
1088   isl_space *Space = S.getParamSpace();
1089   for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
1090     isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
1091     assert(IDToValue.count(Id));
1092     Value *Val = IDToValue[Id];
1093     SubtreeValues.remove(Val);
1094     isl_id_free(Id);
1095   }
1096   isl_space_free(Space);
1097 
1098   for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
1099     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
1100     assert(IDToValue.count(Id));
1101     Value *Val = IDToValue[Id];
1102     SubtreeValues.remove(Val);
1103     isl_id_free(Id);
1104   }
1105 
1106   return SubtreeValues;
1107 }
1108 
1109 void GPUNodeBuilder::clearDominators(Function *F) {
1110   DomTreeNode *N = DT.getNode(&F->getEntryBlock());
1111   std::vector<BasicBlock *> Nodes;
1112   for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
1113     Nodes.push_back(I->getBlock());
1114 
1115   for (BasicBlock *BB : Nodes)
1116     DT.eraseNode(BB);
1117 }
1118 
1119 void GPUNodeBuilder::clearScalarEvolution(Function *F) {
1120   for (BasicBlock &BB : *F) {
1121     Loop *L = LI.getLoopFor(&BB);
1122     if (L)
1123       SE.forgetLoop(L);
1124   }
1125 }
1126 
1127 void GPUNodeBuilder::clearLoops(Function *F) {
1128   for (BasicBlock &BB : *F) {
1129     Loop *L = LI.getLoopFor(&BB);
1130     if (L)
1131       SE.forgetLoop(L);
1132     LI.removeBlock(&BB);
1133   }
1134 }
1135 
1136 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
1137   std::vector<Value *> Sizes;
1138   isl_ast_build *Context = isl_ast_build_from_context(S.getContext());
1139 
1140   for (long i = 0; i < Kernel->n_grid; i++) {
1141     isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i);
1142     isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size);
1143     Value *Res = ExprBuilder.create(GridSize);
1144     Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
1145     Sizes.push_back(Res);
1146   }
1147   isl_ast_build_free(Context);
1148 
1149   for (long i = Kernel->n_grid; i < 3; i++)
1150     Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
1151 
1152   return std::make_tuple(Sizes[0], Sizes[1]);
1153 }
1154 
1155 std::tuple<Value *, Value *, Value *>
1156 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
1157   std::vector<Value *> Sizes;
1158 
1159   for (long i = 0; i < Kernel->n_block; i++) {
1160     Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
1161     Sizes.push_back(Res);
1162   }
1163 
1164   for (long i = Kernel->n_block; i < 3; i++)
1165     Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
1166 
1167   return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
1168 }
1169 
1170 Value *
1171 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
1172                                        SetVector<Value *> SubtreeValues) {
1173   Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(),
1174                                  std::distance(F->arg_begin(), F->arg_end()));
1175 
1176   BasicBlock *EntryBlock =
1177       &Builder.GetInsertBlock()->getParent()->getEntryBlock();
1178   auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
1179   std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
1180   Instruction *Parameters = new AllocaInst(
1181       ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());
1182 
1183   int Index = 0;
1184   for (long i = 0; i < Prog->n_array; i++) {
1185     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1186       continue;
1187 
1188     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1189     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
1190 
1191     Value *DevArray = nullptr;
1192     if (ManagedMemory) {
1193       DevArray = getOrCreateManagedDeviceArray(
1194           &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
1195     } else {
1196       DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
1197       DevArray = createCallGetDevicePtr(DevArray);
1198     }
1199     assert(DevArray != nullptr && "Array to be offloaded to device not "
1200                                   "initialized");
1201     Value *Offset = getArrayOffset(&Prog->array[i]);
1202 
1203     if (Offset) {
1204       DevArray = Builder.CreatePointerCast(
1205           DevArray, SAI->getElementType()->getPointerTo());
1206       DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
1207       DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
1208     }
1209     Value *Slot = Builder.CreateGEP(
1210         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1211 
1212     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
1213       Value *ValPtr = nullptr;
1214       if (ManagedMemory)
1215         ValPtr = DevArray;
1216       else
1217         ValPtr = BlockGen.getOrCreateAlloca(SAI);
1218 
1219       assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
1220                                   " to be stored into Parameters");
1221       Value *ValPtrCast =
1222           Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
1223       Builder.CreateStore(ValPtrCast, Slot);
1224     } else {
1225       Instruction *Param =
1226           new AllocaInst(Builder.getInt8PtrTy(), AddressSpace,
1227                          Launch + "_param_" + std::to_string(Index),
1228                          EntryBlock->getTerminator());
1229       Builder.CreateStore(DevArray, Param);
1230       Value *ParamTyped =
1231           Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1232       Builder.CreateStore(ParamTyped, Slot);
1233     }
1234     Index++;
1235   }
1236 
1237   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
1238 
1239   for (long i = 0; i < NumHostIters; i++) {
1240     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
1241     Value *Val = IDToValue[Id];
1242     isl_id_free(Id);
1243     Instruction *Param =
1244         new AllocaInst(Val->getType(), AddressSpace,
1245                        Launch + "_param_" + std::to_string(Index),
1246                        EntryBlock->getTerminator());
1247     Builder.CreateStore(Val, Param);
1248     Value *Slot = Builder.CreateGEP(
1249         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1250     Value *ParamTyped =
1251         Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1252     Builder.CreateStore(ParamTyped, Slot);
1253     Index++;
1254   }
1255 
1256   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
1257 
1258   for (long i = 0; i < NumVars; i++) {
1259     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
1260     Value *Val = IDToValue[Id];
1261     isl_id_free(Id);
1262     Instruction *Param =
1263         new AllocaInst(Val->getType(), AddressSpace,
1264                        Launch + "_param_" + std::to_string(Index),
1265                        EntryBlock->getTerminator());
1266     Builder.CreateStore(Val, Param);
1267     Value *Slot = Builder.CreateGEP(
1268         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1269     Value *ParamTyped =
1270         Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1271     Builder.CreateStore(ParamTyped, Slot);
1272     Index++;
1273   }
1274 
1275   for (auto Val : SubtreeValues) {
1276     Instruction *Param =
1277         new AllocaInst(Val->getType(), AddressSpace,
1278                        Launch + "_param_" + std::to_string(Index),
1279                        EntryBlock->getTerminator());
1280     Builder.CreateStore(Val, Param);
1281     Value *Slot = Builder.CreateGEP(
1282         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
1283     Value *ParamTyped =
1284         Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
1285     Builder.CreateStore(ParamTyped, Slot);
1286     Index++;
1287   }
1288 
1289   auto Location = EntryBlock->getTerminator();
1290   return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
1291                          Launch + "_params_i8ptr", Location);
1292 }
1293 
1294 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
1295   isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
1296   ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
1297   isl_id_free(Id);
1298   isl_ast_node_free(KernelStmt);
1299 
1300   if (Kernel->n_grid > 1)
1301     DeepestParallel =
1302         std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set));
1303   else
1304     DeepestSequential =
1305         std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set));
1306 
1307   Value *BlockDimX, *BlockDimY, *BlockDimZ;
1308   std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
1309 
1310   SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
1311 
1312   assert(Kernel->tree && "Device AST of kernel node is empty");
1313 
1314   Instruction &HostInsertPoint = *Builder.GetInsertPoint();
1315   IslExprBuilder::IDToValueTy HostIDs = IDToValue;
1316   ValueMapT HostValueMap = ValueMap;
1317   BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap;
1318   ScalarMap.clear();
1319 
1320   SetVector<const Loop *> Loops;
1321 
1322   // Create for all loops we depend on values that contain the current loop
1323   // iteration. These values are necessary to generate code for SCEVs that
1324   // depend on such loops. As a result we need to pass them to the subfunction.
1325   for (const Loop *L : Loops) {
1326     const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
1327                                             SE.getUnknown(Builder.getInt64(1)),
1328                                             L, SCEV::FlagAnyWrap);
1329     Value *V = generateSCEV(OuterLIV);
1330     OutsideLoopIterations[L] = SE.getUnknown(V);
1331     SubtreeValues.insert(V);
1332   }
1333 
1334   createKernelFunction(Kernel, SubtreeValues);
1335 
1336   create(isl_ast_node_copy(Kernel->tree));
1337 
1338   finalizeKernelArguments(Kernel);
1339   Function *F = Builder.GetInsertBlock()->getParent();
1340   addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
1341   clearDominators(F);
1342   clearScalarEvolution(F);
1343   clearLoops(F);
1344 
1345   IDToValue = HostIDs;
1346 
1347   ValueMap = std::move(HostValueMap);
1348   ScalarMap = std::move(HostScalarMap);
1349   EscapeMap.clear();
1350   IDToSAI.clear();
1351   Annotator.resetAlternativeAliasBases();
1352   for (auto &BasePtr : LocalArrays)
1353     S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array);
1354   LocalArrays.clear();
1355 
1356   std::string ASMString = finalizeKernelFunction();
1357   Builder.SetInsertPoint(&HostInsertPoint);
1358   Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);
1359 
1360   std::string Name = "kernel_" + std::to_string(Kernel->id);
1361   Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
1362   Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
1363   Value *GPUKernel = createCallGetKernel(KernelString, NameString);
1364 
1365   Value *GridDimX, *GridDimY;
1366   std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
1367 
1368   createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
1369                          BlockDimZ, Parameters);
1370   createCallFreeKernel(GPUKernel);
1371 
1372   for (auto Id : KernelIds)
1373     isl_id_free(Id);
1374 
1375   KernelIds.clear();
1376 }
1377 
1378 /// Compute the DataLayout string for the NVPTX backend.
1379 ///
1380 /// @param is64Bit Are we looking for a 64 bit architecture?
1381 static std::string computeNVPTXDataLayout(bool is64Bit) {
1382   std::string Ret = "";
1383 
1384   if (!is64Bit) {
1385     Ret += "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
1386            "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
1387            "64-v128:128:128-n16:32:64";
1388   } else {
1389     Ret += "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
1390            "64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
1391            "64-v128:128:128-n16:32:64";
1392   }
1393 
1394   return Ret;
1395 }
1396 
1397 Function *
1398 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
1399                                          SetVector<Value *> &SubtreeValues) {
1400   std::vector<Type *> Args;
1401   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
1402 
1403   for (long i = 0; i < Prog->n_array; i++) {
1404     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1405       continue;
1406 
1407     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
1408       isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1409       const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
1410       Args.push_back(SAI->getElementType());
1411     } else {
1412       static const int UseGlobalMemory = 1;
1413       Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
1414     }
1415   }
1416 
1417   int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
1418 
1419   for (long i = 0; i < NumHostIters; i++)
1420     Args.push_back(Builder.getInt64Ty());
1421 
1422   int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
1423 
1424   for (long i = 0; i < NumVars; i++) {
1425     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
1426     Value *Val = IDToValue[Id];
1427     isl_id_free(Id);
1428     Args.push_back(Val->getType());
1429   }
1430 
1431   for (auto *V : SubtreeValues)
1432     Args.push_back(V->getType());
1433 
1434   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
1435   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
1436                               GPUModule.get());
1437   FN->setCallingConv(CallingConv::PTX_Kernel);
1438 
1439   auto Arg = FN->arg_begin();
1440   for (long i = 0; i < Kernel->n_array; i++) {
1441     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1442       continue;
1443 
1444     Arg->setName(Kernel->array[i].array->name);
1445 
1446     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1447     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
1448     Type *EleTy = SAI->getElementType();
1449     Value *Val = &*Arg;
1450     SmallVector<const SCEV *, 4> Sizes;
1451     isl_ast_build *Build =
1452         isl_ast_build_from_context(isl_set_copy(Prog->context));
1453     Sizes.push_back(nullptr);
1454     for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
1455       isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
1456           Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
1457       auto V = ExprBuilder.create(DimSize);
1458       Sizes.push_back(SE.getSCEV(V));
1459     }
1460     const ScopArrayInfo *SAIRep =
1461         S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array);
1462     LocalArrays.push_back(Val);
1463 
1464     isl_ast_build_free(Build);
1465     KernelIds.push_back(Id);
1466     IDToSAI[Id] = SAIRep;
1467     Arg++;
1468   }
1469 
1470   for (long i = 0; i < NumHostIters; i++) {
1471     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
1472     Arg->setName(isl_id_get_name(Id));
1473     IDToValue[Id] = &*Arg;
1474     KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
1475     Arg++;
1476   }
1477 
1478   for (long i = 0; i < NumVars; i++) {
1479     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
1480     Arg->setName(isl_id_get_name(Id));
1481     Value *Val = IDToValue[Id];
1482     ValueMap[Val] = &*Arg;
1483     IDToValue[Id] = &*Arg;
1484     KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
1485     Arg++;
1486   }
1487 
1488   for (auto *V : SubtreeValues) {
1489     Arg->setName(V->getName());
1490     ValueMap[V] = &*Arg;
1491     Arg++;
1492   }
1493 
1494   return FN;
1495 }
1496 
1497 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
1498   Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
1499                                    Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
1500 
1501   Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
1502                                    Intrinsic::nvvm_read_ptx_sreg_tid_y,
1503                                    Intrinsic::nvvm_read_ptx_sreg_tid_z};
1504 
1505   auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
1506     std::string Name = isl_id_get_name(Id);
1507     Module *M = Builder.GetInsertBlock()->getParent()->getParent();
1508     Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
1509     Value *Val = Builder.CreateCall(IntrinsicFn, {});
1510     Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
1511     IDToValue[Id] = Val;
1512     KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
1513   };
1514 
1515   for (int i = 0; i < Kernel->n_grid; ++i) {
1516     isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
1517     addId(Id, IntrinsicsBID[i]);
1518   }
1519 
1520   for (int i = 0; i < Kernel->n_block; ++i) {
1521     isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
1522     addId(Id, IntrinsicsTID[i]);
1523   }
1524 }
1525 
1526 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
1527   auto Arg = FN->arg_begin();
1528   for (long i = 0; i < Kernel->n_array; i++) {
1529     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1530       continue;
1531 
1532     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1533     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
1534     isl_id_free(Id);
1535 
1536     if (SAI->getNumberOfDimensions() > 0) {
1537       Arg++;
1538       continue;
1539     }
1540 
1541     Value *Val = &*Arg;
1542 
1543     if (!gpu_array_is_read_only_scalar(&Prog->array[i])) {
1544       Type *TypePtr = SAI->getElementType()->getPointerTo();
1545       Value *TypedArgPtr = Builder.CreatePointerCast(Val, TypePtr);
1546       Val = Builder.CreateLoad(TypedArgPtr);
1547     }
1548 
1549     Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
1550     Builder.CreateStore(Val, Alloca);
1551 
1552     Arg++;
1553   }
1554 }
1555 
1556 void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
1557   auto *FN = Builder.GetInsertBlock()->getParent();
1558   auto Arg = FN->arg_begin();
1559 
1560   bool StoredScalar = false;
1561   for (long i = 0; i < Kernel->n_array; i++) {
1562     if (!ppcg_kernel_requires_array_argument(Kernel, i))
1563       continue;
1564 
1565     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
1566     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
1567     isl_id_free(Id);
1568 
1569     if (SAI->getNumberOfDimensions() > 0) {
1570       Arg++;
1571       continue;
1572     }
1573 
1574     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
1575       Arg++;
1576       continue;
1577     }
1578 
1579     Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
1580     Value *ArgPtr = &*Arg;
1581     Type *TypePtr = SAI->getElementType()->getPointerTo();
1582     Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
1583     Value *Val = Builder.CreateLoad(Alloca);
1584     Builder.CreateStore(Val, TypedArgPtr);
1585     StoredScalar = true;
1586 
1587     Arg++;
1588   }
1589 
1590   if (StoredScalar)
1591     /// In case more than one thread contains scalar stores, the generated
1592     /// code might be incorrect, if we only store at the end of the kernel.
1593     /// To support this case we need to store these scalars back at each
1594     /// memory store or at least before each kernel barrier.
1595     if (Kernel->n_block != 0 || Kernel->n_grid != 0)
1596       BuildSuccessful = 0;
1597 }
1598 
1599 void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
1600   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
1601 
1602   for (int i = 0; i < Kernel->n_var; ++i) {
1603     struct ppcg_kernel_var &Var = Kernel->var[i];
1604     isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
1605     Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType();
1606 
1607     Type *ArrayTy = EleTy;
1608     SmallVector<const SCEV *, 4> Sizes;
1609 
1610     Sizes.push_back(nullptr);
1611     for (unsigned int j = 1; j < Var.array->n_index; ++j) {
1612       isl_val *Val = isl_vec_get_element_val(Var.size, j);
1613       long Bound = isl_val_get_num_si(Val);
1614       isl_val_free(Val);
1615       Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
1616     }
1617 
1618     for (int j = Var.array->n_index - 1; j >= 0; --j) {
1619       isl_val *Val = isl_vec_get_element_val(Var.size, j);
1620       long Bound = isl_val_get_num_si(Val);
1621       isl_val_free(Val);
1622       ArrayTy = ArrayType::get(ArrayTy, Bound);
1623     }
1624 
1625     const ScopArrayInfo *SAI;
1626     Value *Allocation;
1627     if (Var.type == ppcg_access_shared) {
1628       auto GlobalVar = new GlobalVariable(
1629           *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
1630           nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
1631       GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
1632       GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));
1633 
1634       Allocation = GlobalVar;
1635     } else if (Var.type == ppcg_access_private) {
1636       Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
1637     } else {
1638       llvm_unreachable("unknown variable type");
1639     }
1640     SAI =
1641         S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
1642     Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
1643     IDToValue[Id] = Allocation;
1644     LocalArrays.push_back(Allocation);
1645     KernelIds.push_back(Id);
1646     IDToSAI[Id] = SAI;
1647   }
1648 }
1649 
1650 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
1651                                           SetVector<Value *> &SubtreeValues) {
1652 
1653   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
1654   GPUModule.reset(new Module(Identifier, Builder.getContext()));
1655   GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
1656   GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
1657 
1658   Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
1659 
1660   BasicBlock *PrevBlock = Builder.GetInsertBlock();
1661   auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);
1662 
1663   DT.addNewBlock(EntryBlock, PrevBlock);
1664 
1665   Builder.SetInsertPoint(EntryBlock);
1666   Builder.CreateRetVoid();
1667   Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());
1668 
1669   ScopDetection::markFunctionAsInvalid(FN);
1670 
1671   prepareKernelArguments(Kernel, FN);
1672   createKernelVariables(Kernel, FN);
1673   insertKernelIntrinsics(Kernel);
1674 }
1675 
1676 std::string GPUNodeBuilder::createKernelASM() {
1677   llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
1678   std::string ErrMsg;
1679   auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
1680 
1681   if (!GPUTarget) {
1682     errs() << ErrMsg << "\n";
1683     return "";
1684   }
1685 
1686   TargetOptions Options;
1687   Options.UnsafeFPMath = FastMath;
1688   std::unique_ptr<TargetMachine> TargetM(
1689       GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
1690                                      Options, Optional<Reloc::Model>()));
1691 
1692   SmallString<0> ASMString;
1693   raw_svector_ostream ASMStream(ASMString);
1694   llvm::legacy::PassManager PM;
1695 
1696   PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));
1697 
1698   if (TargetM->addPassesToEmitFile(
1699           PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
1700     errs() << "The target does not support generation of this file type!\n";
1701     return "";
1702   }
1703 
1704   PM.run(*GPUModule);
1705 
1706   return ASMStream.str();
1707 }
1708 
1709 std::string GPUNodeBuilder::finalizeKernelFunction() {
1710   if (verifyModule(*GPUModule)) {
1711     BuildSuccessful = false;
1712     return "";
1713   }
1714 
1715   if (DumpKernelIR)
1716     outs() << *GPUModule << "\n";
1717 
1718   // Optimize module.
1719   llvm::legacy::PassManager OptPasses;
1720   PassManagerBuilder PassBuilder;
1721   PassBuilder.OptLevel = 3;
1722   PassBuilder.SizeLevel = 0;
1723   PassBuilder.populateModulePassManager(OptPasses);
1724   OptPasses.run(*GPUModule);
1725 
1726   std::string Assembly = createKernelASM();
1727 
1728   if (DumpKernelASM)
1729     outs() << Assembly << "\n";
1730 
1731   GPUModule.release();
1732   KernelIDs.clear();
1733 
1734   return Assembly;
1735 }
1736 
1737 namespace {
1738 class PPCGCodeGeneration : public ScopPass {
1739 public:
1740   static char ID;
1741 
1742   /// The scop that is currently processed.
1743   Scop *S;
1744 
1745   LoopInfo *LI;
1746   DominatorTree *DT;
1747   ScalarEvolution *SE;
1748   const DataLayout *DL;
1749   RegionInfo *RI;
1750 
1751   PPCGCodeGeneration() : ScopPass(ID) {}
1752 
1753   /// Construct compilation options for PPCG.
1754   ///
1755   /// @returns The compilation options.
1756   ppcg_options *createPPCGOptions() {
1757     auto DebugOptions =
1758         (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
1759     auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));
1760 
1761     DebugOptions->dump_schedule_constraints = false;
1762     DebugOptions->dump_schedule = false;
1763     DebugOptions->dump_final_schedule = false;
1764     DebugOptions->dump_sizes = false;
1765     DebugOptions->verbose = false;
1766 
1767     Options->debug = DebugOptions;
1768 
1769     Options->reschedule = true;
1770     Options->scale_tile_loops = false;
1771     Options->wrap = false;
1772 
1773     Options->non_negative_parameters = false;
1774     Options->ctx = nullptr;
1775     Options->sizes = nullptr;
1776 
1777     Options->tile_size = 32;
1778 
1779     Options->use_private_memory = PrivateMemory;
1780     Options->use_shared_memory = SharedMemory;
1781     Options->max_shared_memory = 48 * 1024;
1782 
1783     Options->target = PPCG_TARGET_CUDA;
1784     Options->openmp = false;
1785     Options->linearize_device_arrays = true;
1786     Options->live_range_reordering = false;
1787 
1788     Options->opencl_compiler_options = nullptr;
1789     Options->opencl_use_gpu = false;
1790     Options->opencl_n_include_file = 0;
1791     Options->opencl_include_files = nullptr;
1792     Options->opencl_print_kernel_types = false;
1793     Options->opencl_embed_kernel_code = false;
1794 
1795     Options->save_schedule_file = nullptr;
1796     Options->load_schedule_file = nullptr;
1797 
1798     return Options;
1799   }
1800 
1801   /// Get a tagged access relation containing all accesses of type @p AccessTy.
1802   ///
1803   /// Instead of a normal access of the form:
1804   ///
1805   ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
1806   ///
1807   /// a tagged access has the form
1808   ///
1809   ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
1810   ///
1811   /// where 'id' is an additional space that references the memory access that
1812   /// triggered the access.
1813   ///
1814   /// @param AccessTy The type of the memory accesses to collect.
1815   ///
1816   /// @return The relation describing all tagged memory accesses.
1817   isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
1818     isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());
1819 
1820     for (auto &Stmt : *S)
1821       for (auto &Acc : Stmt)
1822         if (Acc->getType() == AccessTy) {
1823           isl_map *Relation = Acc->getAccessRelation();
1824           Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());
1825 
1826           isl_space *Space = isl_map_get_space(Relation);
1827           Space = isl_space_range(Space);
1828           Space = isl_space_from_range(Space);
1829           Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
1830           isl_map *Universe = isl_map_universe(Space);
1831           Relation = isl_map_domain_product(Relation, Universe);
1832           Accesses = isl_union_map_add_map(Accesses, Relation);
1833         }
1834 
1835     return Accesses;
1836   }
1837 
1838   /// Get the set of all read accesses, tagged with the access id.
1839   ///
1840   /// @see getTaggedAccesses
1841   isl_union_map *getTaggedReads() {
1842     return getTaggedAccesses(MemoryAccess::READ);
1843   }
1844 
1845   /// Get the set of all may (and must) accesses, tagged with the access id.
1846   ///
1847   /// @see getTaggedAccesses
1848   isl_union_map *getTaggedMayWrites() {
1849     return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
1850                                getTaggedAccesses(MemoryAccess::MUST_WRITE));
1851   }
1852 
1853   /// Get the set of all must accesses, tagged with the access id.
1854   ///
1855   /// @see getTaggedAccesses
1856   isl_union_map *getTaggedMustWrites() {
1857     return getTaggedAccesses(MemoryAccess::MUST_WRITE);
1858   }
1859 
1860   /// Collect parameter and array names as isl_ids.
1861   ///
1862   /// To reason about the different parameters and arrays used, ppcg requires
1863   /// a list of all isl_ids in use. As PPCG traditionally performs
1864   /// source-to-source compilation each of these isl_ids is mapped to the
1865   /// expression that represents it. As we do not have a corresponding
1866   /// expression in Polly, we just map each id to a 'zero' expression to match
1867   /// the data format that ppcg expects.
1868   ///
1869   /// @returns Retun a map from collected ids to 'zero' ast expressions.
1870   __isl_give isl_id_to_ast_expr *getNames() {
1871     auto *Names = isl_id_to_ast_expr_alloc(
1872         S->getIslCtx(),
1873         S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
1874     auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
1875     auto *Space = S->getParamSpace();
1876 
1877     for (int I = 0, E = S->getNumParams(); I < E; ++I) {
1878       isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
1879       Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
1880     }
1881 
1882     for (auto &Array : S->arrays()) {
1883       auto Id = Array->getBasePtrId();
1884       Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
1885     }
1886 
1887     isl_space_free(Space);
1888     isl_ast_expr_free(Zero);
1889 
1890     return Names;
1891   }
1892 
1893   /// Create a new PPCG scop from the current scop.
1894   ///
1895   /// The PPCG scop is initialized with data from the current polly::Scop. From
1896   /// this initial data, the data-dependences in the PPCG scop are initialized.
1897   /// We do not use Polly's dependence analysis for now, to ensure we match
1898   /// the PPCG default behaviour more closely.
1899   ///
1900   /// @returns A new ppcg scop.
1901   ppcg_scop *createPPCGScop() {
1902     auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
1903 
1904     PPCGScop->options = createPPCGOptions();
1905 
1906     PPCGScop->start = 0;
1907     PPCGScop->end = 0;
1908 
1909     PPCGScop->context = S->getContext();
1910     PPCGScop->domain = S->getDomains();
1911     PPCGScop->call = nullptr;
1912     PPCGScop->tagged_reads = getTaggedReads();
1913     PPCGScop->reads = S->getReads();
1914     PPCGScop->live_in = nullptr;
1915     PPCGScop->tagged_may_writes = getTaggedMayWrites();
1916     PPCGScop->may_writes = S->getWrites();
1917     PPCGScop->tagged_must_writes = getTaggedMustWrites();
1918     PPCGScop->must_writes = S->getMustWrites();
1919     PPCGScop->live_out = nullptr;
1920     PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
1921     PPCGScop->tagger = nullptr;
1922 
1923     PPCGScop->independence = nullptr;
1924     PPCGScop->dep_flow = nullptr;
1925     PPCGScop->tagged_dep_flow = nullptr;
1926     PPCGScop->dep_false = nullptr;
1927     PPCGScop->dep_forced = nullptr;
1928     PPCGScop->dep_order = nullptr;
1929     PPCGScop->tagged_dep_order = nullptr;
1930 
1931     PPCGScop->schedule = S->getScheduleTree();
1932     PPCGScop->names = getNames();
1933 
1934     PPCGScop->pet = nullptr;
1935 
1936     compute_tagger(PPCGScop);
1937     compute_dependences(PPCGScop);
1938 
1939     return PPCGScop;
1940   }
1941 
1942   /// Collect the array acesses in a statement.
1943   ///
1944   /// @param Stmt The statement for which to collect the accesses.
1945   ///
1946   /// @returns A list of array accesses.
1947   gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
1948     gpu_stmt_access *Accesses = nullptr;
1949 
1950     for (MemoryAccess *Acc : Stmt) {
1951       auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
1952       Access->read = Acc->isRead();
1953       Access->write = Acc->isWrite();
1954       Access->access = Acc->getAccessRelation();
1955       isl_space *Space = isl_map_get_space(Access->access);
1956       Space = isl_space_range(Space);
1957       Space = isl_space_from_range(Space);
1958       Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
1959       isl_map *Universe = isl_map_universe(Space);
1960       Access->tagged_access =
1961           isl_map_domain_product(Acc->getAccessRelation(), Universe);
1962       Access->exact_write = !Acc->isMayWrite();
1963       Access->ref_id = Acc->getId();
1964       Access->next = Accesses;
1965       Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
1966       Accesses = Access;
1967     }
1968 
1969     return Accesses;
1970   }
1971 
1972   /// Collect the list of GPU statements.
1973   ///
1974   /// Each statement has an id, a pointer to the underlying data structure,
1975   /// as well as a list with all memory accesses.
1976   ///
1977   /// TODO: Initialize the list of memory accesses.
1978   ///
1979   /// @returns A linked-list of statements.
1980   gpu_stmt *getStatements() {
1981     gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
1982                                        std::distance(S->begin(), S->end()));
1983 
1984     int i = 0;
1985     for (auto &Stmt : *S) {
1986       gpu_stmt *GPUStmt = &Stmts[i];
1987 
1988       GPUStmt->id = Stmt.getDomainId();
1989 
1990       // We use the pet stmt pointer to keep track of the Polly statements.
1991       GPUStmt->stmt = (pet_stmt *)&Stmt;
1992       GPUStmt->accesses = getStmtAccesses(Stmt);
1993       i++;
1994     }
1995 
1996     return Stmts;
1997   }
1998 
1999   /// Derive the extent of an array.
2000   ///
2001   /// The extent of an array is the set of elements that are within the
2002   /// accessed array. For the inner dimensions, the extent constraints are
2003   /// 0 and the size of the corresponding array dimension. For the first
2004   /// (outermost) dimension, the extent constraints are the minimal and maximal
2005   /// subscript value for the first dimension.
2006   ///
2007   /// @param Array The array to derive the extent for.
2008   ///
2009   /// @returns An isl_set describing the extent of the array.
2010   __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
2011     unsigned NumDims = Array->getNumberOfDimensions();
2012     isl_union_map *Accesses = S->getAccesses();
2013     Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
2014     Accesses = isl_union_map_detect_equalities(Accesses);
2015     isl_union_set *AccessUSet = isl_union_map_range(Accesses);
2016     AccessUSet = isl_union_set_coalesce(AccessUSet);
2017     AccessUSet = isl_union_set_detect_equalities(AccessUSet);
2018     AccessUSet = isl_union_set_coalesce(AccessUSet);
2019 
2020     if (isl_union_set_is_empty(AccessUSet)) {
2021       isl_union_set_free(AccessUSet);
2022       return isl_set_empty(Array->getSpace());
2023     }
2024 
2025     if (Array->getNumberOfDimensions() == 0) {
2026       isl_union_set_free(AccessUSet);
2027       return isl_set_universe(Array->getSpace());
2028     }
2029 
2030     isl_set *AccessSet =
2031         isl_union_set_extract_set(AccessUSet, Array->getSpace());
2032 
2033     isl_union_set_free(AccessUSet);
2034     isl_local_space *LS = isl_local_space_from_space(Array->getSpace());
2035 
2036     isl_pw_aff *Val =
2037         isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0));
2038 
2039     isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0);
2040     isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0);
2041     OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in,
2042                                    isl_pw_aff_dim(Val, isl_dim_in));
2043     OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in,
2044                                    isl_pw_aff_dim(Val, isl_dim_in));
2045     OuterMin =
2046         isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId());
2047     OuterMax =
2048         isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId());
2049 
2050     isl_set *Extent = isl_set_universe(Array->getSpace());
2051 
2052     Extent = isl_set_intersect(
2053         Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val)));
2054     Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val));
2055 
2056     for (unsigned i = 1; i < NumDims; ++i)
2057       Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0);
2058 
2059     for (unsigned i = 1; i < NumDims; ++i) {
2060       isl_pw_aff *PwAff =
2061           const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i));
2062       isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(
2063           isl_local_space_from_space(Array->getSpace()), isl_dim_set, i));
2064       PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in,
2065                                   isl_pw_aff_dim(Val, isl_dim_in));
2066       PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in,
2067                                       isl_pw_aff_get_tuple_id(Val, isl_dim_in));
2068       auto *Set = isl_pw_aff_gt_set(PwAff, Val);
2069       Extent = isl_set_intersect(Set, Extent);
2070     }
2071 
2072     return Extent;
2073   }
2074 
2075   /// Derive the bounds of an array.
2076   ///
2077   /// For the first dimension we derive the bound of the array from the extent
2078   /// of this dimension. For inner dimensions we obtain their size directly from
2079   /// ScopArrayInfo.
2080   ///
2081   /// @param PPCGArray The array to compute bounds for.
2082   /// @param Array The polly array from which to take the information.
2083   void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
2084     if (PPCGArray.n_index > 0) {
2085       if (isl_set_is_empty(PPCGArray.extent)) {
2086         isl_set *Dom = isl_set_copy(PPCGArray.extent);
2087         isl_local_space *LS = isl_local_space_from_space(
2088             isl_space_params(isl_set_get_space(Dom)));
2089         isl_set_free(Dom);
2090         isl_aff *Zero = isl_aff_zero_on_domain(LS);
2091         PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
2092       } else {
2093         isl_set *Dom = isl_set_copy(PPCGArray.extent);
2094         Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
2095         isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
2096         isl_set_free(Dom);
2097         Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
2098         isl_local_space *LS =
2099             isl_local_space_from_space(isl_set_get_space(Dom));
2100         isl_aff *One = isl_aff_zero_on_domain(LS);
2101         One = isl_aff_add_constant_si(One, 1);
2102         Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
2103         Bound = isl_pw_aff_gist(Bound, S->getContext());
2104         PPCGArray.bound[0] = Bound;
2105       }
2106     }
2107 
2108     for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
2109       isl_pw_aff *Bound = Array->getDimensionSizePw(i);
2110       auto LS = isl_pw_aff_get_domain_space(Bound);
2111       auto Aff = isl_multi_aff_zero(LS);
2112       Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
2113       PPCGArray.bound[i] = Bound;
2114     }
2115   }
2116 
2117   /// Create the arrays for @p PPCGProg.
2118   ///
2119   /// @param PPCGProg The program to compute the arrays for.
2120   void createArrays(gpu_prog *PPCGProg) {
2121     int i = 0;
2122     for (auto &Array : S->arrays()) {
2123       std::string TypeName;
2124       raw_string_ostream OS(TypeName);
2125 
2126       OS << *Array->getElementType();
2127       TypeName = OS.str();
2128 
2129       gpu_array_info &PPCGArray = PPCGProg->array[i];
2130 
2131       PPCGArray.space = Array->getSpace();
2132       PPCGArray.type = strdup(TypeName.c_str());
2133       PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
2134       PPCGArray.name = strdup(Array->getName().c_str());
2135       PPCGArray.extent = nullptr;
2136       PPCGArray.n_index = Array->getNumberOfDimensions();
2137       PPCGArray.bound =
2138           isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
2139       PPCGArray.extent = getExtent(Array);
2140       PPCGArray.n_ref = 0;
2141       PPCGArray.refs = nullptr;
2142       PPCGArray.accessed = true;
2143       PPCGArray.read_only_scalar =
2144           Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
2145       PPCGArray.has_compound_element = false;
2146       PPCGArray.local = false;
2147       PPCGArray.declare_local = false;
2148       PPCGArray.global = false;
2149       PPCGArray.linearize = false;
2150       PPCGArray.dep_order = nullptr;
2151       PPCGArray.user = Array;
2152 
2153       setArrayBounds(PPCGArray, Array);
2154       i++;
2155 
2156       collect_references(PPCGProg, &PPCGArray);
2157     }
2158   }
2159 
2160   /// Create an identity map between the arrays in the scop.
2161   ///
2162   /// @returns An identity map between the arrays in the scop.
2163   isl_union_map *getArrayIdentity() {
2164     isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());
2165 
2166     for (auto &Array : S->arrays()) {
2167       isl_space *Space = Array->getSpace();
2168       Space = isl_space_map_from_set(Space);
2169       isl_map *Identity = isl_map_identity(Space);
2170       Maps = isl_union_map_add_map(Maps, Identity);
2171     }
2172 
2173     return Maps;
2174   }
2175 
2176   /// Create a default-initialized PPCG GPU program.
2177   ///
2178   /// @returns A new gpu grogram description.
2179   gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {
2180 
2181     if (!PPCGScop)
2182       return nullptr;
2183 
2184     auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);
2185 
2186     PPCGProg->ctx = S->getIslCtx();
2187     PPCGProg->scop = PPCGScop;
2188     PPCGProg->context = isl_set_copy(PPCGScop->context);
2189     PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
2190     PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
2191     PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
2192     PPCGProg->tagged_must_kill =
2193         isl_union_map_copy(PPCGScop->tagged_must_kills);
2194     PPCGProg->to_inner = getArrayIdentity();
2195     PPCGProg->to_outer = getArrayIdentity();
2196     PPCGProg->any_to_outer = nullptr;
2197     PPCGProg->array_order = nullptr;
2198     PPCGProg->n_stmts = std::distance(S->begin(), S->end());
2199     PPCGProg->stmts = getStatements();
2200     PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
2201     PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
2202                                        PPCGProg->n_array);
2203 
2204     createArrays(PPCGProg);
2205 
2206     PPCGProg->may_persist = compute_may_persist(PPCGProg);
2207 
2208     return PPCGProg;
2209   }
2210 
2211   struct PrintGPUUserData {
2212     struct cuda_info *CudaInfo;
2213     struct gpu_prog *PPCGProg;
2214     std::vector<ppcg_kernel *> Kernels;
2215   };
2216 
2217   /// Print a user statement node in the host code.
2218   ///
2219   /// We use ppcg's printing facilities to print the actual statement and
2220   /// additionally build up a list of all kernels that are encountered in the
2221   /// host ast.
2222   ///
2223   /// @param P The printer to print to
2224   /// @param Options The printing options to use
2225   /// @param Node The node to print
2226   /// @param User A user pointer to carry additional data. This pointer is
2227   ///             expected to be of type PrintGPUUserData.
2228   ///
2229   /// @returns A printer to which the output has been printed.
2230   static __isl_give isl_printer *
2231   printHostUser(__isl_take isl_printer *P,
2232                 __isl_take isl_ast_print_options *Options,
2233                 __isl_take isl_ast_node *Node, void *User) {
2234     auto Data = (struct PrintGPUUserData *)User;
2235     auto Id = isl_ast_node_get_annotation(Node);
2236 
2237     if (Id) {
2238       bool IsUser = !strcmp(isl_id_get_name(Id), "user");
2239 
2240       // If this is a user statement, format it ourselves as ppcg would
2241       // otherwise try to call pet functionality that is not available in
2242       // Polly.
2243       if (IsUser) {
2244         P = isl_printer_start_line(P);
2245         P = isl_printer_print_ast_node(P, Node);
2246         P = isl_printer_end_line(P);
2247         isl_id_free(Id);
2248         isl_ast_print_options_free(Options);
2249         return P;
2250       }
2251 
2252       auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
2253       isl_id_free(Id);
2254       Data->Kernels.push_back(Kernel);
2255     }
2256 
2257     return print_host_user(P, Options, Node, User);
2258   }
2259 
2260   /// Print C code corresponding to the control flow in @p Kernel.
2261   ///
2262   /// @param Kernel The kernel to print
2263   void printKernel(ppcg_kernel *Kernel) {
2264     auto *P = isl_printer_to_str(S->getIslCtx());
2265     P = isl_printer_set_output_format(P, ISL_FORMAT_C);
2266     auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
2267     P = isl_ast_node_print(Kernel->tree, P, Options);
2268     char *String = isl_printer_get_str(P);
2269     printf("%s\n", String);
2270     free(String);
2271     isl_printer_free(P);
2272   }
2273 
2274   /// Print C code corresponding to the GPU code described by @p Tree.
2275   ///
2276   /// @param Tree An AST describing GPU code
2277   /// @param PPCGProg The PPCG program from which @Tree has been constructed.
2278   void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
2279     auto *P = isl_printer_to_str(S->getIslCtx());
2280     P = isl_printer_set_output_format(P, ISL_FORMAT_C);
2281 
2282     PrintGPUUserData Data;
2283     Data.PPCGProg = PPCGProg;
2284 
2285     auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
2286     Options =
2287         isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
2288     P = isl_ast_node_print(Tree, P, Options);
2289     char *String = isl_printer_get_str(P);
2290     printf("# host\n");
2291     printf("%s\n", String);
2292     free(String);
2293     isl_printer_free(P);
2294 
2295     for (auto Kernel : Data.Kernels) {
2296       printf("# kernel%d\n", Kernel->id);
2297       printKernel(Kernel);
2298     }
2299   }
2300 
2301   // Generate a GPU program using PPCG.
2302   //
2303   // GPU mapping consists of multiple steps:
2304   //
2305   //  1) Compute new schedule for the program.
2306   //  2) Map schedule to GPU (TODO)
2307   //  3) Generate code for new schedule (TODO)
2308   //
2309   // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer
2310   // is mostly CPU specific. Instead, we use PPCG's GPU code generation
2311   // strategy directly from this pass.
2312   gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {
2313 
2314     auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);
2315 
2316     PPCGGen->ctx = S->getIslCtx();
2317     PPCGGen->options = PPCGScop->options;
2318     PPCGGen->print = nullptr;
2319     PPCGGen->print_user = nullptr;
2320     PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
2321     PPCGGen->prog = PPCGProg;
2322     PPCGGen->tree = nullptr;
2323     PPCGGen->types.n = 0;
2324     PPCGGen->types.name = nullptr;
2325     PPCGGen->sizes = nullptr;
2326     PPCGGen->used_sizes = nullptr;
2327     PPCGGen->kernel_id = 0;
2328 
2329     // Set scheduling strategy to same strategy PPCG is using.
2330     isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
2331     isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
2332     isl_options_set_schedule_whole_component(PPCGGen->ctx, false);
2333 
2334     isl_schedule *Schedule = get_schedule(PPCGGen);
2335 
2336     int has_permutable = has_any_permutable_node(Schedule);
2337 
2338     if (!has_permutable || has_permutable < 0) {
2339       Schedule = isl_schedule_free(Schedule);
2340     } else {
2341       Schedule = map_to_device(PPCGGen, Schedule);
2342       PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
2343     }
2344 
2345     if (DumpSchedule) {
2346       isl_printer *P = isl_printer_to_str(S->getIslCtx());
2347       P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
2348       P = isl_printer_print_str(P, "Schedule\n");
2349       P = isl_printer_print_str(P, "========\n");
2350       if (Schedule)
2351         P = isl_printer_print_schedule(P, Schedule);
2352       else
2353         P = isl_printer_print_str(P, "No schedule found\n");
2354 
2355       printf("%s\n", isl_printer_get_str(P));
2356       isl_printer_free(P);
2357     }
2358 
2359     if (DumpCode) {
2360       printf("Code\n");
2361       printf("====\n");
2362       if (PPCGGen->tree)
2363         printGPUTree(PPCGGen->tree, PPCGProg);
2364       else
2365         printf("No code generated\n");
2366     }
2367 
2368     isl_schedule_free(Schedule);
2369 
2370     return PPCGGen;
2371   }
2372 
2373   /// Free gpu_gen structure.
2374   ///
2375   /// @param PPCGGen The ppcg_gen object to free.
2376   void freePPCGGen(gpu_gen *PPCGGen) {
2377     isl_ast_node_free(PPCGGen->tree);
2378     isl_union_map_free(PPCGGen->sizes);
2379     isl_union_map_free(PPCGGen->used_sizes);
2380     free(PPCGGen);
2381   }
2382 
2383   /// Free the options in the ppcg scop structure.
2384   ///
2385   /// ppcg is not freeing these options for us. To avoid leaks we do this
2386   /// ourselves.
2387   ///
2388   /// @param PPCGScop The scop referencing the options to free.
2389   void freeOptions(ppcg_scop *PPCGScop) {
2390     free(PPCGScop->options->debug);
2391     PPCGScop->options->debug = nullptr;
2392     free(PPCGScop->options);
2393     PPCGScop->options = nullptr;
2394   }
2395 
2396   /// Approximate the number of points in the set.
2397   ///
2398   /// This function returns an ast expression that overapproximates the number
2399   /// of points in an isl set through the rectangular hull surrounding this set.
2400   ///
2401   /// @param Set   The set to count.
2402   /// @param Build The isl ast build object to use for creating the ast
2403   ///              expression.
2404   ///
2405   /// @returns An approximation of the number of points in the set.
2406   __isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
2407                                              __isl_keep isl_ast_build *Build) {
2408 
2409     isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
2410     auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));
2411 
2412     isl_space *Space = isl_set_get_space(Set);
2413     Space = isl_space_params(Space);
2414     auto *Univ = isl_set_universe(Space);
2415     isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);
2416 
2417     for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
2418       isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
2419       isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
2420       isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
2421       DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
2422       auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
2423       Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
2424     }
2425 
2426     isl_set_free(Set);
2427     isl_pw_aff_free(OneAff);
2428 
2429     return Expr;
2430   }
2431 
2432   /// Approximate a number of dynamic instructions executed by a given
2433   /// statement.
2434   ///
2435   /// @param Stmt  The statement for which to compute the number of dynamic
2436   ///              instructions.
2437   /// @param Build The isl ast build object to use for creating the ast
2438   ///              expression.
2439   /// @returns An approximation of the number of dynamic instructions executed
2440   ///          by @p Stmt.
2441   __isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
2442                                              __isl_keep isl_ast_build *Build) {
2443     auto Iterations = approxPointsInSet(Stmt.getDomain(), Build);
2444 
2445     long InstCount = 0;
2446 
2447     if (Stmt.isBlockStmt()) {
2448       auto *BB = Stmt.getBasicBlock();
2449       InstCount = std::distance(BB->begin(), BB->end());
2450     } else {
2451       auto *R = Stmt.getRegion();
2452 
2453       for (auto *BB : R->blocks()) {
2454         InstCount += std::distance(BB->begin(), BB->end());
2455       }
2456     }
2457 
2458     isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
2459     auto *InstExpr = isl_ast_expr_from_val(InstVal);
2460     return isl_ast_expr_mul(InstExpr, Iterations);
2461   }
2462 
2463   /// Approximate dynamic instructions executed in scop.
2464   ///
2465   /// @param S     The scop for which to approximate dynamic instructions.
2466   /// @param Build The isl ast build object to use for creating the ast
2467   ///              expression.
2468   /// @returns An approximation of the number of dynamic instructions executed
2469   ///          in @p S.
2470   __isl_give isl_ast_expr *
2471   getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
2472     isl_ast_expr *Instructions;
2473 
2474     isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
2475     Instructions = isl_ast_expr_from_val(Zero);
2476 
2477     for (ScopStmt &Stmt : S) {
2478       isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
2479       Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
2480     }
2481     return Instructions;
2482   }
2483 
2484   /// Create a check that ensures sufficient compute in scop.
2485   ///
2486   /// @param S     The scop for which to ensure sufficient compute.
2487   /// @param Build The isl ast build object to use for creating the ast
2488   ///              expression.
2489   /// @returns An expression that evaluates to TRUE in case of sufficient
2490   ///          compute and to FALSE, otherwise.
2491   __isl_give isl_ast_expr *
2492   createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
2493     auto Iterations = getNumberOfIterations(S, Build);
2494     auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
2495     auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
2496     return isl_ast_expr_ge(Iterations, MinComputeExpr);
2497   }
2498 
2499   /// Generate code for a given GPU AST described by @p Root.
2500   ///
2501   /// @param Root An isl_ast_node pointing to the root of the GPU AST.
2502   /// @param Prog The GPU Program to generate code for.
2503   void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
2504     ScopAnnotator Annotator;
2505     Annotator.buildAliasScopes(*S);
2506 
2507     Region *R = &S->getRegion();
2508 
2509     simplifyRegion(R, DT, LI, RI);
2510 
2511     BasicBlock *EnteringBB = R->getEnteringBlock();
2512 
2513     PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);
2514 
2515     // Only build the run-time condition and parameters _after_ having
2516     // introduced the conditional branch. This is important as the conditional
2517     // branch will guard the original scop from new induction variables that
2518     // the SCEVExpander may introduce while code generating the parameters and
2519     // which may introduce scalar dependences that prevent us from correctly
2520     // code generating this scop.
2521     BasicBlock *StartBlock =
2522         executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
2523 
2524     GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
2525                                StartBlock, Prog);
2526 
2527     // TODO: Handle LICM
2528     auto SplitBlock = StartBlock->getSinglePredecessor();
2529     Builder.SetInsertPoint(SplitBlock->getTerminator());
2530     NodeBuilder.addParameters(S->getContext());
2531 
2532     isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
2533     isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build);
2534     isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
2535     Condition = isl_ast_expr_and(Condition, SufficientCompute);
2536     isl_ast_build_free(Build);
2537 
2538     Value *RTC = NodeBuilder.createRTC(Condition);
2539     Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
2540 
2541     Builder.SetInsertPoint(&*StartBlock->begin());
2542 
2543     NodeBuilder.initializeAfterRTH();
2544     NodeBuilder.create(Root);
2545     NodeBuilder.finalize();
2546 
2547     /// In case a sequential kernel has more surrounding loops as any parallel
2548     /// kernel, the SCoP is probably mostly sequential. Hence, there is no
2549     /// point in running it on a GPU.
2550     if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
2551       SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());
2552 
2553     if (!NodeBuilder.BuildSuccessful)
2554       SplitBlock->getTerminator()->setOperand(0, Builder.getFalse());
2555   }
2556 
2557   bool runOnScop(Scop &CurrentScop) override {
2558     S = &CurrentScop;
2559     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2560     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2561     SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2562     DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
2563     RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
2564 
2565     // We currently do not support scops with invariant loads.
2566     if (S->hasInvariantAccesses())
2567       return false;
2568 
2569     auto PPCGScop = createPPCGScop();
2570     auto PPCGProg = createPPCGProg(PPCGScop);
2571     auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
2572 
2573     if (PPCGGen->tree)
2574       generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
2575 
2576     freeOptions(PPCGScop);
2577     freePPCGGen(PPCGGen);
2578     gpu_prog_free(PPCGProg);
2579     ppcg_scop_free(PPCGScop);
2580 
2581     return true;
2582   }
2583 
2584   void printScop(raw_ostream &, Scop &) const override {}
2585 
2586   void getAnalysisUsage(AnalysisUsage &AU) const override {
2587     AU.addRequired<DominatorTreeWrapperPass>();
2588     AU.addRequired<RegionInfoPass>();
2589     AU.addRequired<ScalarEvolutionWrapperPass>();
2590     AU.addRequired<ScopDetection>();
2591     AU.addRequired<ScopInfoRegionPass>();
2592     AU.addRequired<LoopInfoWrapperPass>();
2593 
2594     AU.addPreserved<AAResultsWrapperPass>();
2595     AU.addPreserved<BasicAAWrapperPass>();
2596     AU.addPreserved<LoopInfoWrapperPass>();
2597     AU.addPreserved<DominatorTreeWrapperPass>();
2598     AU.addPreserved<GlobalsAAWrapperPass>();
2599     AU.addPreserved<ScopDetection>();
2600     AU.addPreserved<ScalarEvolutionWrapperPass>();
2601     AU.addPreserved<SCEVAAWrapperPass>();
2602 
2603     // FIXME: We do not yet add regions for the newly generated code to the
2604     //        region tree.
2605     AU.addPreserved<RegionInfoPass>();
2606     AU.addPreserved<ScopInfoRegionPass>();
2607   }
2608 };
2609 } // namespace
2610 
2611 char PPCGCodeGeneration::ID = 1;
2612 
2613 Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }
2614 
2615 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
2616                       "Polly - Apply PPCG translation to SCOP", false, false)
2617 INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
2618 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
2619 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
2620 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
2621 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
2622 INITIALIZE_PASS_DEPENDENCY(ScopDetection);
2623 INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
2624                     "Polly - Apply PPCG translation to SCOP", false, false)
2625