//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"

#include "isl/union_map.h"

extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback used to generate the ast expressions for each
/// of the scheduled ScopStmts.
static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
    void *StmtT, isl_ast_build *Build,
    isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                       isl_id *Id, void *User),
    void *UserIndex,
    isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
    void *UserExpr) {

  ScopStmt *Stmt = (ScopStmt *)StmtT;

  isl_ctx *Ctx;

  if (!Stmt || !Build)
    return NULL;

  Ctx = isl_ast_build_get_ctx(Build);
  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);

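  // For each memory access of the statement, build an AST expression for its
  // (scheduled) index expression and map the access id to that expression.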
  for (MemoryAccess *Acc : *Stmt) {
    isl_map *AddrFunc = Acc->getAddressFunction();
    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
    isl_id *RefId = Acc->getId();
    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
    MPA = isl_multi_pw_aff_coalesce(MPA);
    MPA = FunctionIndex(MPA, RefId, UserIndex);
    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
    Access = FunctionExpr(Access, RefId, UserExpr);
    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
  }

  return RefToExpr;
}

/// Generate code for a GPU specific isl AST.
///
/// The GPUNodeBuilder augments the existing IslNodeBuilder, which generates
/// code for general-purpose AST nodes, with special functionality for
/// generating GPU specific user nodes.
///
/// @see GPUNodeBuilder::createUser
class GPUNodeBuilder : public IslNodeBuilder {
public:
  GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                 DominatorTree &DT, Scop &S, gpu_prog *Prog)
      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {
    getExprBuilder().setIDToSAI(&IDToSAI);
  }

private:
  /// A vector of array base pointers for which a new ScopArrayInfo was created.
  ///
  /// This vector is used to delete the ScopArrayInfo when it is not needed any
  /// more.
  std::vector<Value *> LocalArrays;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  ///   - ScopStmt:      A computational statement
  ///   - Kernel:        A GPU kernel call (TODO)
  ///   - Data-Transfer: A GPU <-> CPU data-transfer (TODO)
  ///   - In-kernel synchronization
  ///   - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  /// Find llvm::Values referenced in GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values
  ///
  /// @returns A set of values referenced by the kernel.
  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);

  /// Create GPU kernel.
  ///
  /// Code generate the kernel described by @p KernelStmt.
  ///
  /// @param KernelStmt The ast node to generate kernel code for.
  void createKernel(__isl_take isl_ast_node *KernelStmt);

  /// Create kernel function.
  ///
  /// Create a kernel function located in a newly created module that can serve
  /// as target for device code generation. Set the Builder to point to the
  /// start block of this newly created function.
  ///
  /// @param Kernel The kernel to generate code for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  void createKernelFunction(ppcg_kernel *Kernel,
                            SetVector<Value *> &SubtreeValues);

  /// Create the declaration of a kernel function.
  ///
  /// The kernel function takes as arguments:
  ///
  ///   - One i8 pointer for each external array reference used in the kernel.
  ///   - Host iterators
  ///   - Parameters
  ///   - Other LLVM Value references (TODO)
  ///
  /// @param Kernel The kernel to generate the function declaration for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns The newly declared function.
  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
                                     SetVector<Value *> &SubtreeValues);

  /// Insert intrinsic functions to obtain thread and block ids.
  ///
  /// @param Kernel The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Create code for a ScopStmt called in @p Expr.
  ///
  /// @param Expr The expression containing the call.
  /// @param KernelStmt The kernel statement referenced in the call.
  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);

  /// Create an in-kernel synchronization call.
  void createKernelSync();

  /// Create a PTX assembly string for the current GPU kernel.
  ///
  /// @returns A string containing the corresponding PTX assembly code.
  std::string createKernelASM();

  /// Remove references from the dominator tree to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearDominators(Function *F);

  /// Remove references from scalar evolution to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearScalarEvolution(Function *F);

  /// Remove references from loop info to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearLoops(Function *F);

  /// Finalize the generation of the kernel function.
  ///
  /// Verify the LLVM-IR module of the kernel, generate its assembly and -- if
  /// requested -- dump both to stdout. Afterwards, release the module.
  void finalizeKernelFunction();
};

/// Check if @p Prefix is a prefix of @p String.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
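///
/// For example, isPrefix("to_device_A", "to_device") returns true.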
static bool isPrefix(std::string String, std::string Prefix) {
  return String.find(Prefix) == 0;
}

void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);

  if (!strcmp(Str, "kernel")) {
    createKernel(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "to_device") || isPrefix(Str, "from_device")) {
    // TODO: Insert memory copies
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    // TODO: Create kernel copy stmt
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
  return;
}

void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LTS;
  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LTS);

  if (Stmt->isBlockStmt())
    BlockGen.copyStmt(*Stmt, LTS, Indexes);
  else
    assert(0 && "Region statement not supported\n");
}

void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
  Builder.CreateCall(Sync, {});
}

/// Collect llvm::Values referenced from @p Node.
///
/// This function only applies to isl_ast_nodes that are user_nodes referring
/// to a ScopStmt. All other node types are ignored.
///
/// @param Node The node to collect references for.
/// @param User A user pointer used as storage for the data that is collected.
///
/// @returns isl_bool_true if data could be collected successfully.
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User);

  return isl_bool_true;
}

SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI.second->getBasePtr());

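  // Parameters and host iterators of the kernel are passed to the kernel as
  // separate, explicit arguments (see createKernelFunctionDecl), so they are
  // removed from the set of subtree values.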
  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  return SubtreeValues;
}

void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

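  // Remember the state of the host code generation so that it can be restored
  // after the kernel function has been built.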
  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;

  SetVector<const Loop *> Loops;

  // For each loop the kernel depends on, create a value that contains the
  // current loop iteration. These values are necessary to generate code for
  // SCEVs that depend on such loops. As a result we need to pass them to the
  // kernel.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues);

  create(isl_ast_node_copy(Kernel->tree));

  Function *F = Builder.GetInsertBlock()->getParent();
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  Builder.SetInsertPoint(&HostInsertPoint);
  IDToValue = HostIDs;

  ValueMap = HostValueMap;
  ScalarMap.clear();
  PHIOpMap.clear();
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

  finalizeKernelFunction();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
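///
/// For a 64-bit target the resulting layout string is
/// "e-i64:64-v16:16-v32:32-n16:32:64".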
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

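  // The argument order follows the documentation of this function: first one
  // i8 pointer per array used in the kernel, then the host iterators, then
  // the parameters, and finally the remaining referenced llvm::Values.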
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Args.push_back(Builder.getInt8PtrTy());
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++)
    Args.push_back(Builder.getInt64Ty());

  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
  FN->setCallingConv(CallingConv::PTX_Kernel);

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    IDToSAI[Id] = SAIRep;
    isl_id_free(Id);
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}

void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};

  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

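  // Map the block ids to the NVVM block-id intrinsics (ctaid.x/y) and the
  // thread ids to the corresponding thread-id intrinsics (tid.x/y/z).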
  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}

void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {

  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));
  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  DT.addNewBlock(EntryBlock, PrevBlock);

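  // Terminate the entry block with a return and position the builder right
  // before that return, so the kernel body is emitted ahead of it.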
  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;
  std::unique_ptr<TargetMachine> TargetM(
      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
                                     Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str();
}

void GPUNodeBuilder::finalizeKernelFunction() {
  // Verify module.
  llvm::legacy::PassManager Passes;
  Passes.add(createVerifierPass());
  Passes.run(*GPUModule);

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  GPUModule.release();
  KernelIDs.clear();
}

namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  /// The scop that is currently processed.
  Scop *S;

  LoopInfo *LI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  const DataLayout *DL;
  RegionInfo *RI;

  PPCGCodeGeneration() : ScopPass(ID) {}

  /// Construct compilation options for PPCG.
  ///
  /// @returns The compilation options.
  ppcg_options *createPPCGOptions() {
    auto DebugOptions =
        (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
    auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));

    DebugOptions->dump_schedule_constraints = false;
    DebugOptions->dump_schedule = false;
    DebugOptions->dump_final_schedule = false;
    DebugOptions->dump_sizes = false;

    Options->debug = DebugOptions;

    Options->reschedule = true;
    Options->scale_tile_loops = false;
    Options->wrap = false;

    Options->non_negative_parameters = false;
    Options->ctx = nullptr;
    Options->sizes = nullptr;

    Options->tile_size = 32;

    Options->use_private_memory = false;
    Options->use_shared_memory = false;
    Options->max_shared_memory = 0;

    Options->target = PPCG_TARGET_CUDA;
    Options->openmp = false;
    Options->linearize_device_arrays = true;
    Options->live_range_reordering = false;

    Options->opencl_compiler_options = nullptr;
    Options->opencl_use_gpu = false;
    Options->opencl_n_include_file = 0;
    Options->opencl_include_files = nullptr;
    Options->opencl_print_kernel_types = false;
    Options->opencl_embed_kernel_code = false;

    Options->save_schedule_file = nullptr;
    Options->load_schedule_file = nullptr;

    return Options;
  }

  /// Get a tagged access relation containing all accesses of type @p AccessTy.
  ///
  /// Instead of a normal access of the form:
  ///
  ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// a tagged access has the form
  ///
  ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// where 'id' is an additional space that references the memory access that
  /// triggered the access.
  ///
  /// @param AccessTy The type of the memory accesses to collect.
  ///
  /// @return The relation describing all tagged memory accesses.
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }

  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) write accesses, tagged with the access
  /// id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }

  /// Get the set of all must-write accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation, each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns A map from the collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array.second->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop. From
  /// this initial data, the data-dependences in the PPCG scop are initialized.
  /// We do not use Polly's dependence analysis for now, to ensure we match
  /// the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
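      // Additionally build the tagged access of the form
      // [Stmt[...] -> ref_id[]] -> Array[...], which ppcg uses to relate an
      // access relation to the reference that triggered it.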
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = Acc->isWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// @returns A linked-list of statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

  /// Derive the extent of an array.
  ///
  /// The extent of an array is defined by the set of memory locations for
  /// which a memory access in the iteration domain exists.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
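  ///
  /// For example, if an array A is only ever accessed as A[i] with
  /// 0 <= i < 100, its extent is the set { A[o0] : 0 <= o0 <= 99 }.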
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());
    isl_union_set_free(AccessUSet);

    return AccessSet;
  }

  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly from
  /// ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array The polly array from which to take the information.
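  ///
  /// For example, given an extent { A[o0, o1] : 0 <= o0 < n and 0 <= o1 < 64 },
  /// the bound of the outermost dimension becomes the piecewise affine
  /// expression 'n' (the maximal index plus one), while the bound of the inner
  /// dimension is taken from the ScopArrayInfo size, here 64.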
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      isl_set *Dom = isl_set_copy(PPCGArray.extent);
      Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
      isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
      isl_set_free(Dom);
      Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
      isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
      isl_aff *One = isl_aff_zero_on_domain(LS);
      One = isl_aff_add_constant_si(One, 1);
      Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
      Bound = isl_pw_aff_gist(Bound, S->getContext());
      PPCGArray.bound[0] = Bound;
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Element : S->arrays()) {
      ScopArrayInfo *Array = Element.second.get();

      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar = false;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
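  ///
  /// For example, for arrays A[][] and B[] the result is the union map
  /// { A[o0, o1] -> A[o0, o1]; B[o0] -> B[o0] }.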
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Item : S->arrays()) {
      ScopArrayInfo *Array = Item.second.get();
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new GPU program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->may_persist = compute_may_persist(PPCGProg);
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    return PPCGProg;
  }

  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P The printer to print to
  /// @param Options The printing options to use
  /// @param Node The node to print
  /// @param User A user pointer to carry additional data. This pointer is
  ///             expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

  /// Print C code corresponding to the GPU code described by @p Tree.
  ///
  /// @param Tree An AST describing GPU code
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  //  1) Compute new schedule for the program.
  //  2) Map schedule to GPU (TODO)
  //  3) Generate code for new schedule (TODO)
  //
  // We do not use the Polly ScheduleOptimizer here, as it is mostly CPU
  // specific. Instead, we use PPCG's GPU code generation strategy directly
  // from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set the scheduling strategy to the same strategy PPCG is using.
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);

    isl_schedule *Schedule = get_schedule(PPCGGen);

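    // Only map the schedule to the device and generate a GPU AST if it
    // contains at least one permutable band; otherwise no code is generated.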
    int has_permutable = has_any_permutable_node(Schedule);

    if (!has_permutable || has_permutable < 0) {
      Schedule = isl_schedule_free(Schedule);
    } else {
      Schedule = map_to_device(PPCGGen, Schedule);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }

    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");

      printf("%s\n", isl_printer_get_str(P));
      isl_printer_free(P);
    }

    if (DumpCode) {
      printf("Code\n");
      printf("====\n");
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        printf("No code generated\n");
    }

    isl_schedule_free(Schedule);

    return PPCGGen;
  }

  /// Free gpu_gen structure.
  ///
  /// @param PPCGGen The ppcg_gen object to free.
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }

  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg is not freeing these options for us. To avoid leaks we do this
  /// ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU Program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S,
                               Prog);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important as the conditional
    // branch will guard the original scop from new induction variables that
    // the SCEVExpander may introduce while code generating the parameters and
    // which may introduce scalar dependences that prevent us from correctly
    // code generating this scop.
    BasicBlock *StartBlock =
        executeScopConditionally(*S, this, Builder.getTrue());

    // TODO: Handle LICM
    // TODO: Verify run-time checks
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());
    Builder.SetInsertPoint(&*StartBlock->begin());
    NodeBuilder.create(Root);
    NodeBuilder.finalizeSCoP(*S);
  }

  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support scops with invariant loads.
    if (S->hasInvariantAccesses())
      return false;

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    if (PPCGGen->tree)
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    //        region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
}

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)