1 //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // OpenMP specific optimizations:
10 //
11 // - Deduplication of runtime calls, e.g., omp_get_thread_num.
12 // - Replacing globalized device memory with stack memory.
13 // - Replacing globalized device memory with shared memory.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "llvm/Transforms/IPO/OpenMPOpt.h"
18 
19 #include "llvm/ADT/EnumeratedArray.h"
20 #include "llvm/ADT/PostOrderIterator.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/CallGraph.h"
23 #include "llvm/Analysis/CallGraphSCCPass.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/ValueTracking.h"
26 #include "llvm/Frontend/OpenMP/OMPConstants.h"
27 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
28 #include "llvm/IR/IntrinsicInst.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/IR/IntrinsicsNVPTX.h"
31 #include "llvm/IR/PatternMatch.h"
32 #include "llvm/InitializePasses.h"
33 #include "llvm/Support/CommandLine.h"
34 #include "llvm/Transforms/IPO.h"
35 #include "llvm/Transforms/IPO/Attributor.h"
36 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
37 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
38 #include "llvm/Transforms/Utils/CodeExtractor.h"
39 
40 using namespace llvm::PatternMatch;
41 using namespace llvm;
42 using namespace omp;
43 
44 #define DEBUG_TYPE "openmp-opt"
45 
46 static cl::opt<bool> DisableOpenMPOptimizations(
47     "openmp-opt-disable", cl::ZeroOrMore,
48     cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
49     cl::init(false));
50 
51 static cl::opt<bool> EnableParallelRegionMerging(
52     "openmp-opt-enable-merging", cl::ZeroOrMore,
53     cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
54     cl::init(false));
55 
56 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
57                                     cl::Hidden);
58 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
59                                         cl::init(false), cl::Hidden);
60 
61 static cl::opt<bool> HideMemoryTransferLatency(
62     "openmp-hide-memory-transfer-latency",
63     cl::desc("[WIP] Tries to hide the latency of host to device memory"
64              " transfers"),
65     cl::Hidden, cl::init(false));
66 
67 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
68           "Number of OpenMP runtime calls deduplicated");
69 STATISTIC(NumOpenMPParallelRegionsDeleted,
70           "Number of OpenMP parallel regions deleted");
71 STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
72           "Number of OpenMP runtime functions identified");
73 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
74           "Number of OpenMP runtime function uses identified");
75 STATISTIC(NumOpenMPTargetRegionKernels,
76           "Number of OpenMP target region entry points (=kernels) identified");
77 STATISTIC(
78     NumOpenMPParallelRegionsReplacedInGPUStateMachine,
79     "Number of OpenMP parallel regions replaced with ID in GPU state machines");
80 STATISTIC(NumOpenMPParallelRegionsMerged,
81           "Number of OpenMP parallel regions merged");
82 STATISTIC(NumBytesMovedToSharedMemory,
83           "Amount of memory pushed to shared memory");
84 
85 #if !defined(NDEBUG)
86 static constexpr auto TAG = "[" DEBUG_TYPE "]";
87 #endif
88 
89 namespace {
90 
91 enum class AddressSpace : unsigned {
92   Generic = 0,
93   Global = 1,
94   Shared = 3,
95   Constant = 4,
96   Local = 5,
97 };
98 
99 struct AAHeapToShared;
100 
101 struct AAICVTracker;
102 
/// OpenMP specific information. For now, it stores the RFIs and ICVs that are
/// also needed for Attributor runs.
105 struct OMPInformationCache : public InformationCache {
106   OMPInformationCache(Module &M, AnalysisGetter &AG,
107                       BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
108                       SmallPtrSetImpl<Kernel> &Kernels)
109       : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
110         Kernels(Kernels) {
111 
112     OMPBuilder.initialize();
113     initializeRuntimeFunctions();
114     initializeInternalControlVars();
115   }
116 
117   /// Generic information that describes an internal control variable.
118   struct InternalControlVarInfo {
119     /// The kind, as described by InternalControlVar enum.
120     InternalControlVar Kind;
121 
122     /// The name of the ICV.
123     StringRef Name;
124 
125     /// Environment variable associated with this ICV.
126     StringRef EnvVarName;
127 
128     /// Initial value kind.
129     ICVInitValue InitKind;
130 
131     /// Initial value.
132     ConstantInt *InitValue;
133 
134     /// Setter RTL function associated with this ICV.
135     RuntimeFunction Setter;
136 
137     /// Getter RTL function associated with this ICV.
138     RuntimeFunction Getter;
139 
    /// RTL function corresponding to the override clause of this ICV.
141     RuntimeFunction Clause;
142   };
143 
  /// Generic information that describes a runtime function.
145   struct RuntimeFunctionInfo {
146 
147     /// The kind, as described by the RuntimeFunction enum.
148     RuntimeFunction Kind;
149 
150     /// The name of the function.
151     StringRef Name;
152 
153     /// Flag to indicate a variadic function.
154     bool IsVarArg;
155 
156     /// The return type of the function.
157     Type *ReturnType;
158 
159     /// The argument types of the function.
160     SmallVector<Type *, 8> ArgumentTypes;
161 
162     /// The declaration if available.
163     Function *Declaration = nullptr;
164 
165     /// Uses of this runtime function per function containing the use.
166     using UseVector = SmallVector<Use *, 16>;
167 
168     /// Clear UsesMap for runtime function.
169     void clearUsesMap() { UsesMap.clear(); }
170 
171     /// Boolean conversion that is true if the runtime function was found.
172     operator bool() const { return Declaration; }
173 
174     /// Return the vector of uses in function \p F.
175     UseVector &getOrCreateUseVector(Function *F) {
176       std::shared_ptr<UseVector> &UV = UsesMap[F];
177       if (!UV)
178         UV = std::make_shared<UseVector>();
179       return *UV;
180     }
181 
182     /// Return the vector of uses in function \p F or `nullptr` if there are
183     /// none.
184     const UseVector *getUseVector(Function &F) const {
185       auto I = UsesMap.find(&F);
186       if (I != UsesMap.end())
187         return I->second.get();
188       return nullptr;
189     }
190 
191     /// Return how many functions contain uses of this runtime function.
192     size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
193 
194     /// Return the number of arguments (or the minimal number for variadic
195     /// functions).
196     size_t getNumArgs() const { return ArgumentTypes.size(); }
197 
198     /// Run the callback \p CB on each use and forget the use if the result is
199     /// true. The callback will be fed the function in which the use was
    /// encountered as the second argument.
201     void foreachUse(SmallVectorImpl<Function *> &SCC,
202                     function_ref<bool(Use &, Function &)> CB) {
203       for (Function *F : SCC)
204         foreachUse(CB, F);
205     }
206 
207     /// Run the callback \p CB on each use within the function \p F and forget
208     /// the use if the result is true.
209     void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;
212 
213       unsigned Idx = 0;
214       UseVector &UV = getOrCreateUseVector(F);
215 
216       for (Use *U : UV) {
217         if (CB(*U, *F))
218           ToBeDeleted.push_back(Idx);
219         ++Idx;
220       }
221 
222       // Remove the to-be-deleted indices in reverse order as prior
223       // modifications will not modify the smaller indices.
224       while (!ToBeDeleted.empty()) {
225         unsigned Idx = ToBeDeleted.pop_back_val();
226         UV[Idx] = UV.back();
227         UV.pop_back();
228       }
229     }
230 
231   private:
232     /// Map from functions to all uses of this runtime function contained in
233     /// them.
234     DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
235   };
236 
237   /// An OpenMP-IR-Builder instance
238   OpenMPIRBuilder OMPBuilder;
239 
240   /// Map from runtime function kind to the runtime function description.
241   EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
242                   RuntimeFunction::OMPRTL___last>
243       RFIs;
244 
245   /// Map from ICV kind to the ICV description.
246   EnumeratedArray<InternalControlVarInfo, InternalControlVar,
247                   InternalControlVar::ICV___last>
248       ICVs;
249 
250   /// Helper to initialize all internal control variable information for those
251   /// defined in OMPKinds.def.
252   void initializeInternalControlVars() {
253 #define ICV_RT_SET(_Name, RTL)                                                 \
254   {                                                                            \
255     auto &ICV = ICVs[_Name];                                                   \
256     ICV.Setter = RTL;                                                          \
257   }
258 #define ICV_RT_GET(Name, RTL)                                                  \
259   {                                                                            \
260     auto &ICV = ICVs[Name];                                                    \
261     ICV.Getter = RTL;                                                          \
262   }
263 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
264   {                                                                            \
265     auto &ICV = ICVs[Enum];                                                    \
266     ICV.Name = _Name;                                                          \
267     ICV.Kind = Enum;                                                           \
268     ICV.InitKind = Init;                                                       \
269     ICV.EnvVarName = _EnvVarName;                                              \
270     switch (ICV.InitKind) {                                                    \
271     case ICV_IMPLEMENTATION_DEFINED:                                           \
272       ICV.InitValue = nullptr;                                                 \
273       break;                                                                   \
274     case ICV_ZERO:                                                             \
275       ICV.InitValue = ConstantInt::get(                                        \
276           Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
277       break;                                                                   \
278     case ICV_FALSE:                                                            \
279       ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
280       break;                                                                   \
281     case ICV_LAST:                                                             \
282       break;                                                                   \
283     }                                                                          \
284   }
285 #include "llvm/Frontend/OpenMP/OMPKinds.def"
286   }
287 
288   /// Returns true if the function declaration \p F matches the runtime
289   /// function types, that is, return type \p RTFRetType, and argument types
290   /// \p RTFArgTypes.
291   static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
292                                   SmallVector<Type *, 8> &RTFArgTypes) {
293     // TODO: We should output information to the user (under debug output
294     //       and via remarks).
295 
296     if (!F)
297       return false;
298     if (F->getReturnType() != RTFRetType)
299       return false;
300     if (F->arg_size() != RTFArgTypes.size())
301       return false;
302 
303     auto RTFTyIt = RTFArgTypes.begin();
304     for (Argument &Arg : F->args()) {
305       if (Arg.getType() != *RTFTyIt)
306         return false;
307 
308       ++RTFTyIt;
309     }
310 
311     return true;
312   }
313 
314   // Helper to collect all uses of the declaration in the UsesMap.
315   unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
316     unsigned NumUses = 0;
317     if (!RFI.Declaration)
318       return NumUses;
319     OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
320 
321     if (CollectStats) {
322       NumOpenMPRuntimeFunctionsIdentified += 1;
323       NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
324     }
325 
326     // TODO: We directly convert uses into proper calls and unknown uses.
327     for (Use &U : RFI.Declaration->uses()) {
328       if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
329         if (ModuleSlice.count(UserI->getFunction())) {
330           RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
331           ++NumUses;
332         }
333       } else {
334         RFI.getOrCreateUseVector(nullptr).push_back(&U);
335         ++NumUses;
336       }
337     }
338     return NumUses;
339   }
340 
341   // Helper function to recollect uses of a runtime function.
342   void recollectUsesForFunction(RuntimeFunction RTF) {
343     auto &RFI = RFIs[RTF];
344     RFI.clearUsesMap();
345     collectUses(RFI, /*CollectStats*/ false);
346   }
347 
348   // Helper function to recollect uses of all runtime functions.
349   void recollectUses() {
350     for (int Idx = 0; Idx < RFIs.size(); ++Idx)
351       recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
352   }
353 
354   /// Helper to initialize all runtime function information for those defined
  /// in OMPKinds.def.
356   void initializeRuntimeFunctions() {
357     Module &M = *((*ModuleSlice.begin())->getParent());
358 
359     // Helper macros for handling __VA_ARGS__ in OMP_RTL
360 #define OMP_TYPE(VarName, ...)                                                 \
361   Type *VarName = OMPBuilder.VarName;                                          \
362   (void)VarName;
363 
364 #define OMP_ARRAY_TYPE(VarName, ...)                                           \
365   ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
366   (void)VarName##Ty;                                                           \
367   PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
368   (void)VarName##PtrTy;
369 
370 #define OMP_FUNCTION_TYPE(VarName, ...)                                        \
371   FunctionType *VarName = OMPBuilder.VarName;                                  \
372   (void)VarName;                                                               \
373   PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
374   (void)VarName##Ptr;
375 
376 #define OMP_STRUCT_TYPE(VarName, ...)                                          \
377   StructType *VarName = OMPBuilder.VarName;                                    \
378   (void)VarName;                                                               \
379   PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
380   (void)VarName##Ptr;
381 
382 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
383   {                                                                            \
384     SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
385     Function *F = M.getFunction(_Name);                                        \
386     if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
387       auto &RFI = RFIs[_Enum];                                                 \
388       RFI.Kind = _Enum;                                                        \
389       RFI.Name = _Name;                                                        \
390       RFI.IsVarArg = _IsVarArg;                                                \
391       RFI.ReturnType = OMPBuilder._ReturnType;                                 \
392       RFI.ArgumentTypes = std::move(ArgsTypes);                                \
393       RFI.Declaration = F;                                                     \
394       unsigned NumUses = collectUses(RFI);                                     \
395       (void)NumUses;                                                           \
396       LLVM_DEBUG({                                                             \
397         dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")           \
398                << " found\n";                                                  \
399         if (RFI.Declaration)                                                   \
400           dbgs() << TAG << "-> got " << NumUses << " uses in "                 \
401                  << RFI.getNumFunctionsWithUses()                              \
402                  << " different functions.\n";                                 \
403       });                                                                      \
404     }                                                                          \
405   }
406 #include "llvm/Frontend/OpenMP/OMPKinds.def"
407 
408     // TODO: We should attach the attributes defined in OMPKinds.def.
409   }
410 
411   /// Collection of known kernels (\see Kernel) in the module.
412   SmallPtrSetImpl<Kernel> &Kernels;
413 };
414 
/// Used to map the values physically stored in an offload array (in the IR)
/// to a vector in memory.
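/// As an illustrative sketch (hypothetical IR names, assuming all stores
/// happen in the same basic block before the inspected instruction), for
///   %array = alloca [2 x i8*]
///   store i8* %a, i8** %gep0   ; element 0 of %array
///   store i8* %b, i8** %gep1   ; element 1 of %array
/// StoredValues would hold the underlying objects of %a and %b, and
/// LastAccesses the two store instructions.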
417 struct OffloadArray {
418   /// Physical array (in the IR).
419   AllocaInst *Array = nullptr;
420   /// Mapped values.
421   SmallVector<Value *, 8> StoredValues;
422   /// Last stores made in the offload array.
423   SmallVector<StoreInst *, 8> LastAccesses;
424 
425   OffloadArray() = default;
426 
427   /// Initializes the OffloadArray with the values stored in \p Array before
428   /// instruction \p Before is reached. Returns false if the initialization
429   /// fails.
430   /// This MUST be used immediately after the construction of the object.
431   bool initialize(AllocaInst &Array, Instruction &Before) {
432     if (!Array.getAllocatedType()->isArrayTy())
433       return false;
434 
435     if (!getValues(Array, Before))
436       return false;
437 
438     this->Array = &Array;
439     return true;
440   }
441 
442   static const unsigned DeviceIDArgNum = 1;
443   static const unsigned BasePtrsArgNum = 3;
444   static const unsigned PtrsArgNum = 4;
445   static const unsigned SizesArgNum = 5;
446 
447 private:
448   /// Traverses the BasicBlock where \p Array is, collecting the stores made to
449   /// \p Array, leaving StoredValues with the values stored before the
450   /// instruction \p Before is reached.
451   bool getValues(AllocaInst &Array, Instruction &Before) {
452     // Initialize container.
453     const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
454     StoredValues.assign(NumValues, nullptr);
455     LastAccesses.assign(NumValues, nullptr);
456 
457     // TODO: This assumes the instruction \p Before is in the same
458     //  BasicBlock as Array. Make it general, for any control flow graph.
459     BasicBlock *BB = Array.getParent();
460     if (BB != Before.getParent())
461       return false;
462 
463     const DataLayout &DL = Array.getModule()->getDataLayout();
464     const unsigned int PointerSize = DL.getPointerSize();
465 
466     for (Instruction &I : *BB) {
467       if (&I == &Before)
468         break;
469 
470       if (!isa<StoreInst>(&I))
471         continue;
472 
473       auto *S = cast<StoreInst>(&I);
474       int64_t Offset = -1;
475       auto *Dst =
476           GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
477       if (Dst == &Array) {
478         int64_t Idx = Offset / PointerSize;
479         StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
480         LastAccesses[Idx] = S;
481       }
482     }
483 
484     return isFilled();
485   }
486 
487   /// Returns true if all values in StoredValues and
488   /// LastAccesses are not nullptrs.
489   bool isFilled() {
490     const unsigned NumValues = StoredValues.size();
491     for (unsigned I = 0; I < NumValues; ++I) {
492       if (!StoredValues[I] || !LastAccesses[I])
493         return false;
494     }
495 
496     return true;
497   }
498 };
499 
500 struct OpenMPOpt {
501 
502   using OptimizationRemarkGetter =
503       function_ref<OptimizationRemarkEmitter &(Function *)>;
504 
505   OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
506             OptimizationRemarkGetter OREGetter,
507             OMPInformationCache &OMPInfoCache, Attributor &A)
508       : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
509         OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
510 
511   /// Check if any remarks are enabled for openmp-opt
512   bool remarksEnabled() {
513     auto &Ctx = M.getContext();
514     return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
515   }
516 
517   /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
518   bool run(bool IsModulePass) {
519     if (SCC.empty())
520       return false;
521 
522     bool Changed = false;
523 
524     LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
525                       << " functions in a slice with "
526                       << OMPInfoCache.ModuleSlice.size() << " functions\n");
527 
528     if (IsModulePass) {
529       Changed |= runAttributor();
530 
531       // Recollect uses, in case Attributor deleted any.
532       OMPInfoCache.recollectUses();
533 
534       if (remarksEnabled())
535         analysisGlobalization();
536     } else {
537       if (PrintICVValues)
538         printICVs();
539       if (PrintOpenMPKernels)
540         printKernels();
541 
542       Changed |= rewriteDeviceCodeStateMachine();
543 
544       Changed |= runAttributor();
545 
546       // Recollect uses, in case Attributor deleted any.
547       OMPInfoCache.recollectUses();
548 
549       Changed |= deleteParallelRegions();
550       if (HideMemoryTransferLatency)
551         Changed |= hideMemTransfersLatency();
552       Changed |= deduplicateRuntimeCalls();
553       if (EnableParallelRegionMerging) {
554         if (mergeParallelRegions()) {
555           deduplicateRuntimeCalls();
556           Changed = true;
557         }
558       }
559     }
560 
561     return Changed;
562   }
563 
564   /// Print initial ICV values for testing.
565   /// FIXME: This should be done from the Attributor once it is added.
566   void printICVs() const {
567     InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
568                                  ICV_proc_bind};
569 
570     for (Function *F : OMPInfoCache.ModuleSlice) {
571       for (auto ICV : ICVs) {
572         auto ICVInfo = OMPInfoCache.ICVs[ICV];
573         auto Remark = [&](OptimizationRemarkAnalysis ORA) {
574           return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
575                      << " Value: "
576                      << (ICVInfo.InitValue
577                              ? toString(ICVInfo.InitValue->getValue(), 10, true)
578                              : "IMPLEMENTATION_DEFINED");
579         };
580 
581         emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);
582       }
583     }
584   }
585 
586   /// Print OpenMP GPU kernels for testing.
587   void printKernels() const {
588     for (Function *F : SCC) {
589       if (!OMPInfoCache.Kernels.count(F))
590         continue;
591 
592       auto Remark = [&](OptimizationRemarkAnalysis ORA) {
593         return ORA << "OpenMP GPU kernel "
594                    << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
595       };
596 
597       emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);
598     }
599   }
600 
601   /// Return the call if \p U is a callee use in a regular call. If \p RFI is
  /// given, it has to be the callee or nullptr is returned.
603   static CallInst *getCallIfRegularCall(
604       Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
605     CallInst *CI = dyn_cast<CallInst>(U.getUser());
606     if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
607         (!RFI || CI->getCalledFunction() == RFI->Declaration))
608       return CI;
609     return nullptr;
610   }
611 
  /// Return the call if \p V is a regular call. If \p RFI is given, it has to
  /// be the callee or nullptr is returned.
614   static CallInst *getCallIfRegularCall(
615       Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
616     CallInst *CI = dyn_cast<CallInst>(&V);
617     if (CI && !CI->hasOperandBundles() &&
618         (!RFI || CI->getCalledFunction() == RFI->Declaration))
619       return CI;
620     return nullptr;
621   }
622 
623 private:
624   /// Merge parallel regions when it is safe.
625   bool mergeParallelRegions() {
626     const unsigned CallbackCalleeOperand = 2;
627     const unsigned CallbackFirstArgOperand = 3;
628     using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
629 
630     // Check if there are any __kmpc_fork_call calls to merge.
631     OMPInformationCache::RuntimeFunctionInfo &RFI =
632         OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
633 
634     if (!RFI.Declaration)
635       return false;
636 
637     // Unmergable calls that prevent merging a parallel region.
638     OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
639         OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
640         OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
641     };
642 
643     bool Changed = false;
644     LoopInfo *LI = nullptr;
645     DominatorTree *DT = nullptr;
646 
647     SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
648 
649     BasicBlock *StartBB = nullptr, *EndBB = nullptr;
650     auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
651                          BasicBlock &ContinuationIP) {
652       BasicBlock *CGStartBB = CodeGenIP.getBlock();
653       BasicBlock *CGEndBB =
654           SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
655       assert(StartBB != nullptr && "StartBB should not be null");
656       CGStartBB->getTerminator()->setSuccessor(0, StartBB);
657       assert(EndBB != nullptr && "EndBB should not be null");
658       EndBB->getTerminator()->setSuccessor(0, CGEndBB);
659     };
660 
661     auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
662                       Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
663       ReplacementValue = &Inner;
664       return CodeGenIP;
665     };
666 
667     auto FiniCB = [&](InsertPointTy CodeGenIP) {};
668 
669     /// Create a sequential execution region within a merged parallel region,
670     /// encapsulated in a master construct with a barrier for synchronization.
671     auto CreateSequentialRegion = [&](Function *OuterFn,
672                                       BasicBlock *OuterPredBB,
673                                       Instruction *SeqStartI,
674                                       Instruction *SeqEndI) {
675       // Isolate the instructions of the sequential region to a separate
676       // block.
677       BasicBlock *ParentBB = SeqStartI->getParent();
678       BasicBlock *SeqEndBB =
679           SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
680       BasicBlock *SeqAfterBB =
681           SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
682       BasicBlock *SeqStartBB =
683           SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
684 
685       assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
686              "Expected a different CFG");
687       const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
688       ParentBB->getTerminator()->eraseFromParent();
689 
690       auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
691                            BasicBlock &ContinuationIP) {
692         BasicBlock *CGStartBB = CodeGenIP.getBlock();
693         BasicBlock *CGEndBB =
694             SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
695         assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
696         CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
697         assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
698         SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
699       };
700       auto FiniCB = [&](InsertPointTy CodeGenIP) {};
701 
702       // Find outputs from the sequential region to outside users and
703       // broadcast their values to them.
704       for (Instruction &I : *SeqStartBB) {
705         SmallPtrSet<Instruction *, 4> OutsideUsers;
706         for (User *Usr : I.users()) {
707           Instruction &UsrI = *cast<Instruction>(Usr);
          // Ignore outputs to lifetime intrinsics; code extraction for the
          // merged parallel region will fix them.
710           if (UsrI.isLifetimeStartOrEnd())
711             continue;
712 
713           if (UsrI.getParent() != SeqStartBB)
714             OutsideUsers.insert(&UsrI);
715         }
716 
717         if (OutsideUsers.empty())
718           continue;
719 
720         // Emit an alloca in the outer region to store the broadcasted
721         // value.
722         const DataLayout &DL = M.getDataLayout();
723         AllocaInst *AllocaI = new AllocaInst(
724             I.getType(), DL.getAllocaAddrSpace(), nullptr,
725             I.getName() + ".seq.output.alloc", &OuterFn->front().front());
726 
727         // Emit a store instruction in the sequential BB to update the
728         // value.
729         new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
730 
731         // Emit a load instruction and replace the use of the output value
732         // with it.
733         for (Instruction *UsrI : OutsideUsers) {
734           LoadInst *LoadI = new LoadInst(
735               I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI);
736           UsrI->replaceUsesOfWith(&I, LoadI);
737         }
738       }
739 
740       OpenMPIRBuilder::LocationDescription Loc(
741           InsertPointTy(ParentBB, ParentBB->end()), DL);
742       InsertPointTy SeqAfterIP =
743           OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
744 
745       OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
746 
747       BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
748 
749       LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
750                         << "\n");
751     };
752 
753     // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
754     // contained in BB and only separated by instructions that can be
755     // redundantly executed in parallel. The block BB is split before the first
756     // call (in MergableCIs) and after the last so the entire region we merge
757     // into a single parallel region is contained in a single basic block
758     // without any other instructions. We use the OpenMPIRBuilder to outline
759     // that block and call the resulting function via __kmpc_fork_call.
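    // As a rough sketch (simplified, hypothetical callback names), two
    // adjacent calls
    //   call void @__kmpc_fork_call(%ident, 0, @cb1)
    //   call void @__kmpc_fork_call(%ident, 0, @cb2)
    // end up inside a single merged parallel region in which @cb1 and @cb2 are
    // called directly, separated by an explicit barrier that replaces the
    // implicit fork-join barrier between the original regions.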
760     auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
      // TODO: Change the interface to allow single CIs to be expanded, e.g.,
      // to include an outer loop.
763       assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
764 
765       auto Remark = [&](OptimizationRemark OR) {
766         OR << "Parallel region at "
767            << ore::NV("OpenMPParallelMergeFront",
768                       MergableCIs.front()->getDebugLoc())
769            << " merged with parallel regions at ";
770         for (auto *CI : llvm::drop_begin(MergableCIs)) {
771           OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
772           if (CI != MergableCIs.back())
773             OR << ", ";
774         }
775         return OR;
776       };
777 
778       emitRemark<OptimizationRemark>(MergableCIs.front(),
779                                      "OpenMPParallelRegionMerging", Remark);
780 
781       Function *OriginalFn = BB->getParent();
782       LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
783                         << " parallel regions in " << OriginalFn->getName()
784                         << "\n");
785 
786       // Isolate the calls to merge in a separate block.
787       EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
788       BasicBlock *AfterBB =
789           SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
790       StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
791                            "omp.par.merged");
792 
793       assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
794       const DebugLoc DL = BB->getTerminator()->getDebugLoc();
795       BB->getTerminator()->eraseFromParent();
796 
797       // Create sequential regions for sequential instructions that are
798       // in-between mergable parallel regions.
799       for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
800            It != End; ++It) {
801         Instruction *ForkCI = *It;
802         Instruction *NextForkCI = *(It + 1);
803 
        // Continue if there are no in-between instructions.
805         if (ForkCI->getNextNode() == NextForkCI)
806           continue;
807 
808         CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
809                                NextForkCI->getPrevNode());
810       }
811 
812       OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
813                                                DL);
814       IRBuilder<>::InsertPoint AllocaIP(
815           &OriginalFn->getEntryBlock(),
816           OriginalFn->getEntryBlock().getFirstInsertionPt());
817       // Create the merged parallel region with default proc binding, to
818       // avoid overriding binding settings, and without explicit cancellation.
819       InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
820           Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
821           OMP_PROC_BIND_default, /* IsCancellable */ false);
822       BranchInst::Create(AfterBB, AfterIP.getBlock());
823 
824       // Perform the actual outlining.
825       OMPInfoCache.OMPBuilder.finalize(OriginalFn,
826                                        /* AllowExtractorSinking */ true);
827 
828       Function *OutlinedFn = MergableCIs.front()->getCaller();
829 
830       // Replace the __kmpc_fork_call calls with direct calls to the outlined
831       // callbacks.
832       SmallVector<Value *, 8> Args;
833       for (auto *CI : MergableCIs) {
834         Value *Callee =
835             CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
836         FunctionType *FT =
837             cast<FunctionType>(Callee->getType()->getPointerElementType());
838         Args.clear();
839         Args.push_back(OutlinedFn->getArg(0));
840         Args.push_back(OutlinedFn->getArg(1));
841         for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
842              U < E; ++U)
843           Args.push_back(CI->getArgOperand(U));
844 
845         CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
846         if (CI->getDebugLoc())
847           NewCI->setDebugLoc(CI->getDebugLoc());
848 
849         // Forward parameter attributes from the callback to the callee.
850         for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
851              U < E; ++U)
852           for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
853             NewCI->addParamAttr(
854                 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
855 
856         // Emit an explicit barrier to replace the implicit fork-join barrier.
857         if (CI != MergableCIs.back()) {
858           // TODO: Remove barrier if the merged parallel region includes the
859           // 'nowait' clause.
860           OMPInfoCache.OMPBuilder.createBarrier(
861               InsertPointTy(NewCI->getParent(),
862                             NewCI->getNextNode()->getIterator()),
863               OMPD_parallel);
864         }
865 
866         auto Remark = [&](OptimizationRemark OR) {
867           return OR << "Parallel region at "
868                     << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
869                     << " merged with "
870                     << ore::NV("OpenMPParallelMergeFront",
871                                MergableCIs.front()->getDebugLoc());
872         };
873         if (CI != MergableCIs.front())
874           emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
875                                          Remark);
876 
877         CI->eraseFromParent();
878       }
879 
880       assert(OutlinedFn != OriginalFn && "Outlining failed");
881       CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
882       CGUpdater.reanalyzeFunction(*OriginalFn);
883 
884       NumOpenMPParallelRegionsMerged += MergableCIs.size();
885 
886       return true;
887     };
888 
    // Helper function that identifies sequences of
890     // __kmpc_fork_call uses in a basic block.
891     auto DetectPRsCB = [&](Use &U, Function &F) {
892       CallInst *CI = getCallIfRegularCall(U, &RFI);
893       BB2PRMap[CI->getParent()].insert(CI);
894 
895       return false;
896     };
897 
898     BB2PRMap.clear();
899     RFI.foreachUse(SCC, DetectPRsCB);
900     SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
901     // Find mergable parallel regions within a basic block that are
    // safe to merge, that is, any in-between instructions can safely
903     // execute in parallel after merging.
904     // TODO: support merging across basic-blocks.
905     for (auto &It : BB2PRMap) {
906       auto &CIs = It.getSecond();
907       if (CIs.size() < 2)
908         continue;
909 
910       BasicBlock *BB = It.getFirst();
911       SmallVector<CallInst *, 4> MergableCIs;
912 
913       /// Returns true if the instruction is mergable, false otherwise.
914       /// A terminator instruction is unmergable by definition since merging
915       /// works within a BB. Instructions before the mergable region are
916       /// mergable if they are not calls to OpenMP runtime functions that may
917       /// set different execution parameters for subsequent parallel regions.
918       /// Instructions in-between parallel regions are mergable if they are not
919       /// calls to any non-intrinsic function since that may call a non-mergable
920       /// OpenMP runtime function.
921       auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
922         // We do not merge across BBs, hence return false (unmergable) if the
923         // instruction is a terminator.
924         if (I.isTerminator())
925           return false;
926 
927         if (!isa<CallInst>(&I))
928           return true;
929 
930         CallInst *CI = cast<CallInst>(&I);
931         if (IsBeforeMergableRegion) {
932           Function *CalledFunction = CI->getCalledFunction();
933           if (!CalledFunction)
934             return false;
935           // Return false (unmergable) if the call before the parallel
936           // region calls an explicit affinity (proc_bind) or number of
937           // threads (num_threads) compiler-generated function. Those settings
938           // may be incompatible with following parallel regions.
939           // TODO: ICV tracking to detect compatibility.
940           for (const auto &RFI : UnmergableCallsInfo) {
941             if (CalledFunction == RFI.Declaration)
942               return false;
943           }
944         } else {
945           // Return false (unmergable) if there is a call instruction
946           // in-between parallel regions when it is not an intrinsic. It
947           // may call an unmergable OpenMP runtime function in its callpath.
948           // TODO: Keep track of possible OpenMP calls in the callpath.
949           if (!isa<IntrinsicInst>(CI))
950             return false;
951         }
952 
953         return true;
954       };
955       // Find maximal number of parallel region CIs that are safe to merge.
956       for (auto It = BB->begin(), End = BB->end(); It != End;) {
957         Instruction &I = *It;
958         ++It;
959 
960         if (CIs.count(&I)) {
961           MergableCIs.push_back(cast<CallInst>(&I));
962           continue;
963         }
964 
965         // Continue expanding if the instruction is mergable.
966         if (IsMergable(I, MergableCIs.empty()))
967           continue;
968 
969         // Forward the instruction iterator to skip the next parallel region
970         // since there is an unmergable instruction which can affect it.
971         for (; It != End; ++It) {
972           Instruction &SkipI = *It;
973           if (CIs.count(&SkipI)) {
974             LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
975                               << " due to " << I << "\n");
976             ++It;
977             break;
978           }
979         }
980 
981         // Store mergable regions found.
982         if (MergableCIs.size() > 1) {
983           MergableCIsVector.push_back(MergableCIs);
984           LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
985                             << " parallel regions in block " << BB->getName()
986                             << " of function " << BB->getParent()->getName()
987                             << "\n";);
988         }
989 
990         MergableCIs.clear();
991       }
992 
993       if (!MergableCIsVector.empty()) {
994         Changed = true;
995 
996         for (auto &MergableCIs : MergableCIsVector)
997           Merge(MergableCIs, BB);
998         MergableCIsVector.clear();
999       }
1000     }
1001 
1002     if (Changed) {
1003       /// Re-collect use for fork calls, emitted barrier calls, and
1004       /// any emitted master/end_master calls.
1005       OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1006       OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1007       OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1008       OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1009     }
1010 
1011     return Changed;
1012   }
1013 
1014   /// Try to delete parallel regions if possible.
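  /// Conceptually (simplified, hypothetical callback name), a fork call whose
  /// outlined callback only reads memory and carries the `willreturn`
  /// attribute has no observable effect, so it can be erased:
  ///   call void @__kmpc_fork_call(%ident, 0, @readonly_cb)  ; deleted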
1015   bool deleteParallelRegions() {
1016     const unsigned CallbackCalleeOperand = 2;
1017 
1018     OMPInformationCache::RuntimeFunctionInfo &RFI =
1019         OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1020 
1021     if (!RFI.Declaration)
1022       return false;
1023 
1024     bool Changed = false;
1025     auto DeleteCallCB = [&](Use &U, Function &) {
1026       CallInst *CI = getCallIfRegularCall(U);
1027       if (!CI)
1028         return false;
1029       auto *Fn = dyn_cast<Function>(
1030           CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
1031       if (!Fn)
1032         return false;
1033       if (!Fn->onlyReadsMemory())
1034         return false;
1035       if (!Fn->hasFnAttribute(Attribute::WillReturn))
1036         return false;
1037 
1038       LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
1039                         << CI->getCaller()->getName() << "\n");
1040 
1041       auto Remark = [&](OptimizationRemark OR) {
1042         return OR << "Parallel region in "
1043                   << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
1044                   << " deleted";
1045       };
1046       emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
1047                                      Remark);
1048 
1049       CGUpdater.removeCallSite(*CI);
1050       CI->eraseFromParent();
1051       Changed = true;
1052       ++NumOpenMPParallelRegionsDeleted;
1053       return true;
1054     };
1055 
1056     RFI.foreachUse(SCC, DeleteCallCB);
1057 
1058     return Changed;
1059   }
1060 
1061   /// Try to eliminate runtime calls by reusing existing ones.
1062   bool deduplicateRuntimeCalls() {
1063     bool Changed = false;
1064 
1065     RuntimeFunction DeduplicableRuntimeCallIDs[] = {
1066         OMPRTL_omp_get_num_threads,
1067         OMPRTL_omp_in_parallel,
1068         OMPRTL_omp_get_cancellation,
1069         OMPRTL_omp_get_thread_limit,
1070         OMPRTL_omp_get_supported_active_levels,
1071         OMPRTL_omp_get_level,
1072         OMPRTL_omp_get_ancestor_thread_num,
1073         OMPRTL_omp_get_team_size,
1074         OMPRTL_omp_get_active_level,
1075         OMPRTL_omp_in_final,
1076         OMPRTL_omp_get_proc_bind,
1077         OMPRTL_omp_get_num_places,
1078         OMPRTL_omp_get_num_procs,
1079         OMPRTL_omp_get_place_num,
1080         OMPRTL_omp_get_partition_num_places,
1081         OMPRTL_omp_get_partition_place_nums};
1082 
1083     // Global-tid is handled separately.
1084     SmallSetVector<Value *, 16> GTIdArgs;
1085     collectGlobalThreadIdArguments(GTIdArgs);
1086     LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
1087                       << " global thread ID arguments\n");
1088 
1089     for (Function *F : SCC) {
1090       for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1091         Changed |= deduplicateRuntimeCalls(
1092             *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1093 
1094       // __kmpc_global_thread_num is special as we can replace it with an
1095       // argument in enough cases to make it worth trying.
1096       Value *GTIdArg = nullptr;
1097       for (Argument &Arg : F->args())
1098         if (GTIdArgs.count(&Arg)) {
1099           GTIdArg = &Arg;
1100           break;
1101         }
1102       Changed |= deduplicateRuntimeCalls(
1103           *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1104     }
1105 
1106     return Changed;
1107   }
1108 
  /// Tries to hide the latency of runtime calls that involve host-to-device
  /// memory transfers by splitting them into their "issue" and "wait"
  /// versions. The "issue" is moved upwards as much as possible. The "wait" is
  /// moved downwards as much as possible. The "issue" issues the memory
  /// transfer asynchronously, returning a handle. The "wait" waits on the
  /// returned handle for the memory transfer to finish.
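  /// As an illustrative sketch (argument lists simplified), a call
  ///   call void @__tgt_target_data_begin_mapper(...)
  /// is rewritten into
  ///   call void @__tgt_target_data_begin_mapper_issue(..., %handle)
  ///   ; ... independent code the wait can be moved past ...
  ///   call void @__tgt_target_data_begin_mapper_wait(%device_id, %handle)
  /// where %handle is a stack-allocated __tgt_async_info.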
1115   bool hideMemTransfersLatency() {
1116     auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1117     bool Changed = false;
1118     auto SplitMemTransfers = [&](Use &U, Function &Decl) {
1119       auto *RTCall = getCallIfRegularCall(U, &RFI);
1120       if (!RTCall)
1121         return false;
1122 
1123       OffloadArray OffloadArrays[3];
1124       if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1125         return false;
1126 
1127       LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1128 
1129       // TODO: Check if can be moved upwards.
1130       bool WasSplit = false;
1131       Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1132       if (WaitMovementPoint)
1133         WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1134 
1135       Changed |= WasSplit;
1136       return WasSplit;
1137     };
1138     RFI.foreachUse(SCC, SplitMemTransfers);
1139 
1140     return Changed;
1141   }
1142 
1143   void analysisGlobalization() {
1144     auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1145 
1146     auto CheckGlobalization = [&](Use &U, Function &Decl) {
1147       if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1148         auto Remark = [&](OptimizationRemarkMissed ORM) {
1149           return ORM
1150                  << "Found thread data sharing on the GPU. "
1151                  << "Expect degraded performance due to data globalization.";
1152         };
1153         emitRemark<OptimizationRemarkMissed>(CI, "OpenMPGlobalization", Remark);
1154       }
1155 
1156       return false;
1157     };
1158 
1159     RFI.foreachUse(SCC, CheckGlobalization);
1160   }
1161 
1162   /// Maps the values stored in the offload arrays passed as arguments to
1163   /// \p RuntimeCall into the offload arrays in \p OAs.
1164   bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1165                                 MutableArrayRef<OffloadArray> OAs) {
1166     assert(OAs.size() == 3 && "Need space for three offload arrays!");
1167 
1168     // A runtime call that involves memory offloading looks something like:
1169     // call void @__tgt_target_data_begin_mapper(arg0, arg1,
1170     //   i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
1171     // ...)
1172     // So, the idea is to access the allocas that allocate space for these
1173     // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
1174     // Therefore:
1175     // i8** %offload_baseptrs.
1176     Value *BasePtrsArg =
1177         RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
1178     // i8** %offload_ptrs.
1179     Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
1180     // i8** %offload_sizes.
1181     Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
1182 
1183     // Get values stored in **offload_baseptrs.
1184     auto *V = getUnderlyingObject(BasePtrsArg);
1185     if (!isa<AllocaInst>(V))
1186       return false;
1187     auto *BasePtrsArray = cast<AllocaInst>(V);
1188     if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
1189       return false;
1190 
    // Get values stored in **offload_ptrs.
1192     V = getUnderlyingObject(PtrsArg);
1193     if (!isa<AllocaInst>(V))
1194       return false;
1195     auto *PtrsArray = cast<AllocaInst>(V);
1196     if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
1197       return false;
1198 
1199     // Get values stored in **offload_sizes.
1200     V = getUnderlyingObject(SizesArg);
1201     // If it's a [constant] global array don't analyze it.
1202     if (isa<GlobalValue>(V))
1203       return isa<Constant>(V);
1204     if (!isa<AllocaInst>(V))
1205       return false;
1206 
1207     auto *SizesArray = cast<AllocaInst>(V);
1208     if (!OAs[2].initialize(*SizesArray, RuntimeCall))
1209       return false;
1210 
1211     return true;
1212   }
1213 
1214   /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
1215   /// For now this is a way to test that the function getValuesInOffloadArrays
1216   /// is working properly.
1217   /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
1218   void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
1219     assert(OAs.size() == 3 && "There are three offload arrays to debug!");
1220 
1221     LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
1222     std::string ValuesStr;
1223     raw_string_ostream Printer(ValuesStr);
1224     std::string Separator = " --- ";
1225 
1226     for (auto *BP : OAs[0].StoredValues) {
1227       BP->print(Printer);
1228       Printer << Separator;
1229     }
1230     LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
1231     ValuesStr.clear();
1232 
1233     for (auto *P : OAs[1].StoredValues) {
1234       P->print(Printer);
1235       Printer << Separator;
1236     }
1237     LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
1238     ValuesStr.clear();
1239 
1240     for (auto *S : OAs[2].StoredValues) {
1241       S->print(Printer);
1242       Printer << Separator;
1243     }
1244     LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
1245   }
1246 
1247   /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
1248   /// moved. Returns nullptr if the movement is not possible, or not worth it.
1249   Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1250     // FIXME: This traverses only the BasicBlock where RuntimeCall is.
1251     //  Make it traverse the CFG.
1252 
1253     Instruction *CurrentI = &RuntimeCall;
1254     bool IsWorthIt = false;
1255     while ((CurrentI = CurrentI->getNextNode())) {
1256 
1257       // TODO: Once we detect the regions to be offloaded we should use the
1258       //  alias analysis manager to check if CurrentI may modify one of
1259       //  the offloaded regions.
1260       if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
1261         if (IsWorthIt)
1262           return CurrentI;
1263 
1264         return nullptr;
1265       }
1266 
      // FIXME: For now, moving the call over anything without side effects is
      //  considered worth it.
1269       IsWorthIt = true;
1270     }
1271 
1272     // Return end of BasicBlock.
1273     return RuntimeCall.getParent()->getTerminator();
1274   }
1275 
1276   /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
1277   bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1278                                Instruction &WaitMovementPoint) {
    // Create a stack-allocated handle (__tgt_async_info) at the beginning of
    // the function. It stores information about the asynchronous transfer so
    // the transfer can be waited on later.
1282     auto &IRBuilder = OMPInfoCache.OMPBuilder;
1283     auto *F = RuntimeCall.getCaller();
1284     Instruction *FirstInst = &(F->getEntryBlock().front());
1285     AllocaInst *Handle = new AllocaInst(
1286         IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
1287 
1288     // Add "issue" runtime call declaration:
1289     // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
1290     //   i8**, i8**, i64*, i64*)
1291     FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
1292         M, OMPRTL___tgt_target_data_begin_mapper_issue);
1293 
1294     // Change RuntimeCall call site for its asynchronous version.
1295     SmallVector<Value *, 16> Args;
1296     for (auto &Arg : RuntimeCall.args())
1297       Args.push_back(Arg.get());
1298     Args.push_back(Handle);
1299 
1300     CallInst *IssueCallsite =
1301         CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
1302     RuntimeCall.eraseFromParent();
1303 
1304     // Add "wait" runtime call declaration:
1305     // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
1306     FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
1307         M, OMPRTL___tgt_target_data_begin_mapper_wait);
1308 
1309     Value *WaitParams[2] = {
1310         IssueCallsite->getArgOperand(
1311             OffloadArray::DeviceIDArgNum), // device_id.
1312         Handle                             // handle to wait on.
1313     };
1314     CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
1315 
1316     return true;
1317   }
1318 
1319   static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
1320                                     bool GlobalOnly, bool &SingleChoice) {
1321     if (CurrentIdent == NextIdent)
1322       return CurrentIdent;
1323 
1324     // TODO: Figure out how to actually combine multiple debug locations. For
1325     //       now we just keep an existing one if there is a single choice.
1326     if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
1327       SingleChoice = !CurrentIdent;
1328       return NextIdent;
1329     }
1330     return nullptr;
1331   }
1332 
  /// Return a `struct ident_t*` value that represents the ones used in the
1334   /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
1335   /// return a local `struct ident_t*`. For now, if we cannot find a suitable
1336   /// return value we create one from scratch. We also do not yet combine
1337   /// information, e.g., the source locations, see combinedIdentStruct.
1338   Value *
1339   getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1340                                  Function &F, bool GlobalOnly) {
1341     bool SingleChoice = true;
1342     Value *Ident = nullptr;
1343     auto CombineIdentStruct = [&](Use &U, Function &Caller) {
1344       CallInst *CI = getCallIfRegularCall(U, &RFI);
1345       if (!CI || &F != &Caller)
1346         return false;
1347       Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
1348                                   /* GlobalOnly */ true, SingleChoice);
1349       return false;
1350     };
1351     RFI.foreachUse(SCC, CombineIdentStruct);
1352 
1353     if (!Ident || !SingleChoice) {
1354       // The IRBuilder uses the insertion block to get to the module, this is
1355       // unfortunate but we work around it for now.
1356       if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1357         OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
1358             &F.getEntryBlock(), F.getEntryBlock().begin()));
1359       // Create a fallback location if none was found.
1360       // TODO: Use the debug locations of the calls instead.
1361       Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
1362       Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
1363     }
1364     return Ident;
1365   }
1366 
1367   /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
1368   /// \p ReplVal if given.
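  ///
  /// As a sketch, two calls to @omp_get_thread_num() in \p F are collapsed
  /// into a single (movable) call hoisted to the entry block; its result then
  /// replaces the remaining call sites.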
1369   bool deduplicateRuntimeCalls(Function &F,
1370                                OMPInformationCache::RuntimeFunctionInfo &RFI,
1371                                Value *ReplVal = nullptr) {
1372     auto *UV = RFI.getUseVector(F);
1373     if (!UV || UV->size() + (ReplVal != nullptr) < 2)
1374       return false;
1375 
1376     LLVM_DEBUG(
1377         dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
1378                << (ReplVal ? " with an existing value" : "") << "\n");
1379 
1380     assert((!ReplVal || (isa<Argument>(ReplVal) &&
1381                          cast<Argument>(ReplVal)->getParent() == &F)) &&
1382            "Unexpected replacement value!");
1383 
1384     // TODO: Use dominance to find a good position instead.
1385     auto CanBeMoved = [this](CallBase &CB) {
1386       unsigned NumArgs = CB.getNumArgOperands();
1387       if (NumArgs == 0)
1388         return true;
1389       if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1390         return false;
1391       for (unsigned u = 1; u < NumArgs; ++u)
1392         if (isa<Instruction>(CB.getArgOperand(u)))
1393           return false;
1394       return true;
1395     };
1396 
1397     if (!ReplVal) {
1398       for (Use *U : *UV)
1399         if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1400           if (!CanBeMoved(*CI))
1401             continue;
1402 
1403           auto Remark = [&](OptimizationRemark OR) {
1404             return OR << "OpenMP runtime call "
1405                       << ore::NV("OpenMPOptRuntime", RFI.Name)
1406                       << " moved to beginning of OpenMP region";
1407           };
1408           emitRemark<OptimizationRemark>(&F, "OpenMPRuntimeCodeMotion", Remark);
1409 
1410           CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
1411           ReplVal = CI;
1412           break;
1413         }
1414       if (!ReplVal)
1415         return false;
1416     }
1417 
1418     // If we use a call as a replacement value we need to make sure the ident is
1419     // valid at the new location. For now we just pick a global one, either
1420     // existing and used by one of the calls, or created from scratch.
1421     if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1422       if (CI->getNumArgOperands() > 0 &&
1423           CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
1424         Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1425                                                       /* GlobalOnly */ true);
1426         CI->setArgOperand(0, Ident);
1427       }
1428     }
1429 
1430     bool Changed = false;
1431     auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
1432       CallInst *CI = getCallIfRegularCall(U, &RFI);
1433       if (!CI || CI == ReplVal || &F != &Caller)
1434         return false;
1435       assert(CI->getCaller() == &F && "Unexpected call!");
1436 
1437       auto Remark = [&](OptimizationRemark OR) {
1438         return OR << "OpenMP runtime call "
1439                   << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
1440       };
1441       emitRemark<OptimizationRemark>(&F, "OpenMPRuntimeDeduplicated", Remark);
1442 
1443       CGUpdater.removeCallSite(*CI);
1444       CI->replaceAllUsesWith(ReplVal);
1445       CI->eraseFromParent();
1446       ++NumOpenMPRuntimeCallsDeduplicated;
1447       Changed = true;
1448       return true;
1449     };
1450     RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1451 
1452     return Changed;
1453   }
1454 
1455   /// Collect arguments that represent the global thread id in \p GTIdArgs.
1456   void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
1457     // TODO: Below we basically perform a fixpoint iteration with a pessimistic
1458     //       initialization. We could define an AbstractAttribute instead and
1459     //       run the Attributor here once it can be run as an SCC pass.
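
    // Illustrative pattern (sketch): the argument of @foo becomes a known GTId
    // argument because @foo has local linkage and every call site passes a
    // __kmpc_global_thread_num result (or an already-known GTId argument):
    //   %gtid = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
    //   call void @foo(i32 %gtid)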
1460 
1461     // Helper to check the argument \p ArgNo at all call sites of \p F for
1462     // a GTId.
1463     auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1464       if (!F.hasLocalLinkage())
1465         return false;
1466       for (Use &U : F.uses()) {
1467         if (CallInst *CI = getCallIfRegularCall(U)) {
1468           Value *ArgOp = CI->getArgOperand(ArgNo);
1469           if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
1470               getCallIfRegularCall(
1471                   *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1472             continue;
1473         }
1474         return false;
1475       }
1476       return true;
1477     };
1478 
1479     // Helper to identify uses of a GTId as GTId arguments.
1480     auto AddUserArgs = [&](Value &GTId) {
1481       for (Use &U : GTId.uses())
1482         if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
1483           if (CI->isArgOperand(&U))
1484             if (Function *Callee = CI->getCalledFunction())
1485               if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
1486                 GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
1487     };
1488 
1489     // Call arguments that receive a __kmpc_global_thread_num result are GTIds.
1490     OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1491         OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1492 
1493     GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
1494       if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1495         AddUserArgs(*CI);
1496       return false;
1497     });
1498 
1499     // Transitively search for more arguments by looking at the users of the
1500     // ones we know already. During the search the GTIdArgs vector is extended
1501     // so we cannot cache the size nor use a range-based for loop.
1502     for (unsigned u = 0; u < GTIdArgs.size(); ++u)
1503       AddUserArgs(*GTIdArgs[u]);
1504   }
1505 
1506   /// Kernel (=GPU) optimizations and utility functions
1507   ///
1508   ///{{
1509 
1510   /// Check if \p F is a kernel, hence entry point for target offloading.
1511   bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
1512 
1513   /// Cache to remember the unique kernel for a function.
1514   DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
1515 
1516   /// Find the unique kernel that will execute \p F, if any.
1517   Kernel getUniqueKernelFor(Function &F);
1518 
1519   /// Find the unique kernel that will execute \p I, if any.
1520   Kernel getUniqueKernelFor(Instruction &I) {
1521     return getUniqueKernelFor(*I.getFunction());
1522   }
1523 
1524   /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
1525   /// the cases where we can avoid taking the address of a function.
1526   bool rewriteDeviceCodeStateMachine();
1527 
1528   ///
1529   ///}}
1530 
1531   /// Emit a remark generically
1532   ///
1533   /// This template function can be used to generically emit a remark. The
1534   /// RemarkKind should be one of the following:
1535   ///   - OptimizationRemark to indicate a successful optimization attempt
1536   ///   - OptimizationRemarkMissed to report a failed optimization attempt
1537   ///   - OptimizationRemarkAnalysis to provide additional information about an
1538   ///     optimization attempt
1539   ///
1540   /// The remark is built using a callback function provided by the caller that
1541   /// takes a RemarkKind as input and returns a RemarkKind.
1542   template <typename RemarkKind, typename RemarkCallBack>
1543   void emitRemark(Instruction *I, StringRef RemarkName,
1544                   RemarkCallBack &&RemarkCB) const {
1545     Function *F = I->getParent()->getParent();
1546     auto &ORE = OREGetter(F);
1547 
1548     ORE.emit([&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });
1549   }
1550 
1551   /// Emit a remark on a function.
1552   template <typename RemarkKind, typename RemarkCallBack>
1553   void emitRemark(Function *F, StringRef RemarkName,
1554                   RemarkCallBack &&RemarkCB) const {
1555     auto &ORE = OREGetter(F);
1556 
1557     ORE.emit([&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
1558   }
1559 
1560   /// The underlying module.
1561   Module &M;
1562 
1563   /// The SCC we are operating on.
1564   SmallVectorImpl<Function *> &SCC;
1565 
1566   /// Callback to update the call graph, the first argument is a removed call,
1567   /// the second an optional replacement call.
1568   CallGraphUpdater &CGUpdater;
1569 
1570   /// Callback to get an OptimizationRemarkEmitter from a Function *
1571   OptimizationRemarkGetter OREGetter;
1572 
1573   /// OpenMP-specific information cache. Also used for Attributor runs.
1574   OMPInformationCache &OMPInfoCache;
1575 
1576   /// Attributor instance.
1577   Attributor &A;
1578 
1579   /// Helper function to run Attributor on SCC.
1580   bool runAttributor() {
1581     if (SCC.empty())
1582       return false;
1583 
1584     registerAAs();
1585 
1586     ChangeStatus Changed = A.run();
1587 
1588     LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
1589                       << " functions, result: " << Changed << ".\n");
1590 
1591     return Changed == ChangeStatus::CHANGED;
1592   }
1593 
1594   /// Populate the Attributor with abstract attribute opportunities in the
1595   /// function.
1596   void registerAAs() {
1597     if (SCC.empty())
1598       return;
1599 
1600     // Create CallSite AA for all Getters.
1601     for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
1602       auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
1603 
1604       auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
1605 
1606       auto CreateAA = [&](Use &U, Function &Caller) {
1607         CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
1608         if (!CI)
1609           return false;
1610 
1611         auto &CB = cast<CallBase>(*CI);
1612 
1613         IRPosition CBPos = IRPosition::callsite_function(CB);
1614         A.getOrCreateAAFor<AAICVTracker>(CBPos);
1615         return false;
1616       };
1617 
1618       GetterRFI.foreachUse(SCC, CreateAA);
1619     }
1620     auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1621     auto CreateAA = [&](Use &U, Function &F) {
1622       A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
1623       return false;
1624     };
1625     GlobalizationRFI.foreachUse(SCC, CreateAA);
1626 
1627     // Create an ExecutionDomain AA for every function and, if this is an
1628     // OpenMP device module, a HeapToStack AA for every function as well.
1629     for (auto *F : SCC) {
1630       if (!F->isDeclaration())
1631         A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
1632       if (isOpenMPDevice(M))
1633         A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
1634     }
1635   }
1636 };
1637 
1638 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
1639   if (!OMPInfoCache.ModuleSlice.count(&F))
1640     return nullptr;
1641 
1642   // Use a scope to keep the lifetime of the CachedKernel short.
1643   {
1644     Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
1645     if (CachedKernel)
1646       return *CachedKernel;
1647 
1648     // TODO: We should use an AA to create an (optimistic and callback
1649     //       call-aware) call graph. For now we stick to simple patterns that
1650     //       are less powerful, basically the worst fixpoint.
1651     if (isKernel(F)) {
1652       CachedKernel = Kernel(&F);
1653       return *CachedKernel;
1654     }
1655 
1656     CachedKernel = nullptr;
1657     if (!F.hasLocalLinkage()) {
1658 
1659       // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
1660       auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1661         return ORA
1662                << "[OMP100] Potentially unknown OpenMP target region caller";
1663       };
1664       emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);
1665 
1666       return nullptr;
1667     }
1668   }
1669 
1670   auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
1671     if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
1672       // Allow use in equality comparisons.
1673       if (Cmp->isEquality())
1674         return getUniqueKernelFor(*Cmp);
1675       return nullptr;
1676     }
1677     if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
1678       // Allow direct calls.
1679       if (CB->isCallee(&U))
1680         return getUniqueKernelFor(*CB);
1681 
1682       OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1683           OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1684       // Allow the use in __kmpc_parallel_51 calls.
1685       if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
1686         return getUniqueKernelFor(*CB);
1687       return nullptr;
1688     }
1689     // Disallow every other use.
1690     return nullptr;
1691   };
1692 
1693   // TODO: In the future we want to track more than just a unique kernel.
1694   SmallPtrSet<Kernel, 2> PotentialKernels;
1695   OMPInformationCache::foreachUse(F, [&](const Use &U) {
1696     PotentialKernels.insert(GetUniqueKernelForUse(U));
1697   });
1698 
1699   Kernel K = nullptr;
1700   if (PotentialKernels.size() == 1)
1701     K = *PotentialKernels.begin();
1702 
1703   // Cache the result.
1704   UniqueKernelMap[&F] = K;
1705 
1706   return K;
1707 }
1708 
1709 bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
1710   OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1711       OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1712 
1713   bool Changed = false;
1714   if (!KernelParallelRFI)
1715     return Changed;
1716 
1717   for (Function *F : SCC) {
1718 
1719     // Check if the function is used in a __kmpc_parallel_51 call at all.
1721     bool UnknownUse = false;
1722     bool KernelParallelUse = false;
1723     unsigned NumDirectCalls = 0;
1724 
1725     SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
1726     OMPInformationCache::foreachUse(*F, [&](Use &U) {
1727       if (auto *CB = dyn_cast<CallBase>(U.getUser()))
1728         if (CB->isCallee(&U)) {
1729           ++NumDirectCalls;
1730           return;
1731         }
1732 
1733       if (isa<ICmpInst>(U.getUser())) {
1734         ToBeReplacedStateMachineUses.push_back(&U);
1735         return;
1736       }
1737 
1738       // Find wrapper functions that represent parallel kernels.
1739       CallInst *CI =
1740           OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
1741       const unsigned int WrapperFunctionArgNo = 6;
1742       if (!KernelParallelUse && CI &&
1743           CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {
1744         KernelParallelUse = true;
1745         ToBeReplacedStateMachineUses.push_back(&U);
1746         return;
1747       }
1748       UnknownUse = true;
1749     });
1750 
1751     // Do not emit a remark if we haven't seen a __kmpc_parallel_51 use.
1753     if (!KernelParallelUse)
1754       continue;
1755 
1756     {
1757       auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1758         return ORA << "Found a parallel region that is called in a target "
1759                       "region but not part of a combined target construct nor "
1760                       "nested inside a target construct without intermediate "
1761                       "code. This can lead to excessive register usage for "
1762                       "unrelated target regions in the same translation unit "
1763                       "due to spurious call edges assumed by ptxas.";
1764       };
1765       emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPParallelRegionInNonSPMD",
1766                                              Remark);
1767     }
1768 
1769     // If this ever hits, we should investigate.
1770     // TODO: Checking the number of uses is not a necessary restriction and
1771     // should be lifted.
1772     if (UnknownUse || NumDirectCalls != 1 ||
1773         ToBeReplacedStateMachineUses.size() != 2) {
1774       {
1775         auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1776           return ORA << "Parallel region is used in "
1777                      << (UnknownUse ? "unknown" : "unexpected")
1778                      << " ways; will not attempt to rewrite the state machine.";
1779         };
1780         emitRemark<OptimizationRemarkAnalysis>(
1781             F, "OpenMPParallelRegionInNonSPMD", Remark);
1782       }
1783       continue;
1784     }
1785 
1786     // Even if we have __kmpc_parallel_51 calls, we (for now) give
1787     // up if the function is not called from a unique kernel.
1788     Kernel K = getUniqueKernelFor(*F);
1789     if (!K) {
1790       {
1791         auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1792           return ORA << "Parallel region is not known to be called from a "
1793                         "unique single target region (maybe the surrounding "
1794                         "function has external linkage?); will not attempt to "
1795                         "rewrite the state machine use.";
1796         };
1797         emitRemark<OptimizationRemarkAnalysis>(
1798             F, "OpenMPParallelRegionInMultipleKernels", Remark);
1799       }
1800       continue;
1801     }
1802 
1803     // We now know F is a parallel body function called only from the kernel K.
1804     // We also identified the state machine uses in which we replace the
1805     // function pointer by a new global symbol for identification purposes. This
1806     // ensures only direct calls to the function are left.
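    // Sketch of the rewrite below (IR names illustrative; the new symbol may
    // need a bitcast to the use's type):
    //   @F.ID = private constant i8 undef
    //   call void @__kmpc_parallel_51(..., i8* @F.ID, ...) ; was: ptr to @F
    //   %isF = icmp eq i8* %wrapper.id, @F.ID              ; was: ptr to @F
    // Only the two identified state machine uses are rewritten; the single
    // direct call to @F is left untouched.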
1807 
1808     {
1809       auto RemarkParallelRegion = [&](OptimizationRemarkAnalysis ORA) {
1810         return ORA << "Specialize parallel region that is only reached from a "
1811                       "single target region to avoid spurious call edges and "
1812                       "excessive register usage in other target regions. "
1813                       "(parallel region ID: "
1814                    << ore::NV("OpenMPParallelRegion", F->getName())
1815                    << ", kernel ID: "
1816                    << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1817       };
1818       emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPParallelRegionInNonSPMD",
1819                                              RemarkParallelRegion);
1820       auto RemarkKernel = [&](OptimizationRemarkAnalysis ORA) {
1821         return ORA << "Target region containing the parallel region that is "
1822                       "specialized. (parallel region ID: "
1823                    << ore::NV("OpenMPParallelRegion", F->getName())
1824                    << ", kernel ID: "
1825                    << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1826       };
1827       emitRemark<OptimizationRemarkAnalysis>(K, "OpenMPParallelRegionInNonSPMD",
1828                                              RemarkKernel);
1829     }
1830 
1831     Module &M = *F->getParent();
1832     Type *Int8Ty = Type::getInt8Ty(M.getContext());
1833 
1834     auto *ID = new GlobalVariable(
1835         M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
1836         UndefValue::get(Int8Ty), F->getName() + ".ID");
1837 
1838     for (Use *U : ToBeReplacedStateMachineUses)
1839       U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
1840 
1841     ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
1842 
1843     Changed = true;
1844   }
1845 
1846   return Changed;
1847 }
1848 
1849 /// Abstract Attribute for tracking ICV values.
1850 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
1851   using Base = StateWrapper<BooleanState, AbstractAttribute>;
1852   AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1853 
1854   void initialize(Attributor &A) override {
1855     Function *F = getAnchorScope();
1856     if (!F || !A.isFunctionIPOAmendable(*F))
1857       indicatePessimisticFixpoint();
1858   }
1859 
1860   /// Returns true if value is assumed to be tracked.
1861   bool isAssumedTracked() const { return getAssumed(); }
1862 
1863   /// Returns true if value is known to be tracked.
1864   bool isKnownTracked() const { return getAssumed(); }
1865 
1866   /// Create an abstract attribute view for the position \p IRP.
1867   static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
1868 
1869   /// Return the value with which \p I can be replaced for specific \p ICV.
1870   virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
1871                                                 const Instruction *I,
1872                                                 Attributor &A) const {
1873     return None;
1874   }
1875 
1876   /// Return an assumed unique ICV value if a single candidate is found. If
1877   /// there cannot be one, return nullptr. If it is not clear yet, return
1878   /// None.
1879   virtual Optional<Value *>
1880   getUniqueReplacementValue(InternalControlVar ICV) const = 0;
1881 
1882   // Currently only nthreads is being tracked.
1883   // This array will only grow over time.
1884   InternalControlVar TrackableICVs[1] = {ICV_nthreads};
1885 
1886   /// See AbstractAttribute::getName()
1887   const std::string getName() const override { return "AAICVTracker"; }
1888 
1889   /// See AbstractAttribute::getIdAddr()
1890   const char *getIdAddr() const override { return &ID; }
1891 
1892   /// This function should return true if the type of the \p AA is AAICVTracker
1893   static bool classof(const AbstractAttribute *AA) {
1894     return (AA->getIdAddr() == &ID);
1895   }
1896 
1897   static const char ID;
1898 };
1899 
1900 struct AAICVTrackerFunction : public AAICVTracker {
1901   AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
1902       : AAICVTracker(IRP, A) {}
1903 
1904   // FIXME: come up with better string.
1905   const std::string getAsStr() const override { return "ICVTrackerFunction"; }
1906 
1907   // FIXME: come up with some stats.
1908   void trackStatistics() const override {}
1909 
1910   /// We don't manifest anything for this AA.
1911   ChangeStatus manifest(Attributor &A) override {
1912     return ChangeStatus::UNCHANGED;
1913   }
1914 
1915   // Map of ICVs to their values at specific program points.
1916   EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
1917                   InternalControlVar::ICV___last>
1918       ICVReplacementValuesMap;
1919 
1920   ChangeStatus updateImpl(Attributor &A) override {
1921     ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
1922 
1923     Function *F = getAnchorScope();
1924 
1925     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1926 
1927     for (InternalControlVar ICV : TrackableICVs) {
1928       auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1929 
1930       auto &ValuesMap = ICVReplacementValuesMap[ICV];
1931       auto TrackValues = [&](Use &U, Function &) {
1932         CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
1933         if (!CI)
1934           return false;
1935 
1936         // FIXME: handle setters with more than one argument.
1937         /// Track new value.
1938         if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
1939           HasChanged = ChangeStatus::CHANGED;
1940 
1941         return false;
1942       };
1943 
1944       auto CallCheck = [&](Instruction &I) {
1945         Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
1946         if (ReplVal.hasValue() &&
1947             ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
1948           HasChanged = ChangeStatus::CHANGED;
1949 
1950         return true;
1951       };
1952 
1953       // Track all changes of an ICV.
1954       SetterRFI.foreachUse(TrackValues, F);
1955 
1956       A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
1957                                 /* CheckBBLivenessOnly */ true);
1958 
1959       // TODO: Figure out a way to avoid adding an entry in
1960       //       ICVReplacementValuesMap for the function entry.
1961       Instruction *Entry = &F->getEntryBlock().front();
1962       if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
1963         ValuesMap.insert(std::make_pair(Entry, nullptr));
1964     }
1965 
1966     return HasChanged;
1967   }
1968 
1969   /// Helper to check if \p I is a call and get the value for it if it is
1970   /// unique.
1971   Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
1972                                     InternalControlVar &ICV) const {
1973 
1974     const auto *CB = dyn_cast<CallBase>(I);
1975     if (!CB || CB->hasFnAttr("no_openmp") ||
1976         CB->hasFnAttr("no_openmp_routines"))
1977       return None;
1978 
1979     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1980     auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
1981     auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1982     Function *CalledFunction = CB->getCalledFunction();
1983 
1984     // Indirect call, assume ICV changes.
1985     if (CalledFunction == nullptr)
1986       return nullptr;
1987     if (CalledFunction == GetterRFI.Declaration)
1988       return None;
1989     if (CalledFunction == SetterRFI.Declaration) {
1990       if (ICVReplacementValuesMap[ICV].count(I))
1991         return ICVReplacementValuesMap[ICV].lookup(I);
1992 
1993       return nullptr;
1994     }
1995 
1996     // Since we don't know, assume it changes the ICV.
1997     if (CalledFunction->isDeclaration())
1998       return nullptr;
1999 
2000     const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2001         *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
2002 
2003     if (ICVTrackingAA.isAssumedTracked())
2004       return ICVTrackingAA.getUniqueReplacementValue(ICV);
2005 
2006     // If we don't know, assume it changes.
2007     return nullptr;
2008   }
2009 
2010   // We don't compute a unique value for the whole function here, so return None.
2011   Optional<Value *>
2012   getUniqueReplacementValue(InternalControlVar ICV) const override {
2013     return None;
2014   }
2015 
2016   /// Return the value with which \p I can be replaced for specific \p ICV.
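  ///
  /// This is a simple backward walk: starting at \p I we scan the preceding
  /// instructions of the block for a known setter value or a call that might
  /// change the ICV, and otherwise continue at the terminators of the
  /// predecessor blocks. Conflicting values on different paths make the
  /// result unknown (nullptr).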
2017   Optional<Value *> getReplacementValue(InternalControlVar ICV,
2018                                         const Instruction *I,
2019                                         Attributor &A) const override {
2020     const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2021     if (ValuesMap.count(I))
2022       return ValuesMap.lookup(I);
2023 
2024     SmallVector<const Instruction *, 16> Worklist;
2025     SmallPtrSet<const Instruction *, 16> Visited;
2026     Worklist.push_back(I);
2027 
2028     Optional<Value *> ReplVal;
2029 
2030     while (!Worklist.empty()) {
2031       const Instruction *CurrInst = Worklist.pop_back_val();
2032       if (!Visited.insert(CurrInst).second)
2033         continue;
2034 
2035       const BasicBlock *CurrBB = CurrInst->getParent();
2036 
2037       // Go up and look for all potential setters/calls that might change the
2038       // ICV.
2039       while ((CurrInst = CurrInst->getPrevNode())) {
2040         if (ValuesMap.count(CurrInst)) {
2041           Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2042           // Unknown value, track new.
2043           if (!ReplVal.hasValue()) {
2044             ReplVal = NewReplVal;
2045             break;
2046           }
2047 
2048           // If we found a new value, we can't know the ICV value anymore.
2049           if (NewReplVal.hasValue())
2050             if (ReplVal != NewReplVal)
2051               return nullptr;
2052 
2053           break;
2054         }
2055 
2056         Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
2057         if (!NewReplVal.hasValue())
2058           continue;
2059 
2060         // Unknown value, track new.
2061         if (!ReplVal.hasValue()) {
2062           ReplVal = NewReplVal;
2063           break;
2064         }
2065 
2066         // We found a new value, so we can't know the ICV value anymore.
2068         if (ReplVal != NewReplVal)
2069           return nullptr;
2070       }
2071 
2072       // If we are in the same BB and we have a value, we are done.
2073       if (CurrBB == I->getParent() && ReplVal.hasValue())
2074         return ReplVal;
2075 
2076       // Go through all predecessors and add terminators for analysis.
2077       for (const BasicBlock *Pred : predecessors(CurrBB))
2078         if (const Instruction *Terminator = Pred->getTerminator())
2079           Worklist.push_back(Terminator);
2080     }
2081 
2082     return ReplVal;
2083   }
2084 };
2085 
2086 struct AAICVTrackerFunctionReturned : AAICVTracker {
2087   AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2088       : AAICVTracker(IRP, A) {}
2089 
2090   // FIXME: come up with better string.
2091   const std::string getAsStr() const override {
2092     return "ICVTrackerFunctionReturned";
2093   }
2094 
2095   // FIXME: come up with some stats.
2096   void trackStatistics() const override {}
2097 
2098   /// We don't manifest anything for this AA.
2099   ChangeStatus manifest(Attributor &A) override {
2100     return ChangeStatus::UNCHANGED;
2101   }
2102 
2103   // Map of ICVs to their values at specific program points.
2104   EnumeratedArray<Optional<Value *>, InternalControlVar,
2105                   InternalControlVar::ICV___last>
2106       ICVReplacementValuesMap;
2107 
2108   /// Return the value with which \p I can be replaced for specific \p ICV.
2109   Optional<Value *>
2110   getUniqueReplacementValue(InternalControlVar ICV) const override {
2111     return ICVReplacementValuesMap[ICV];
2112   }
2113 
2114   ChangeStatus updateImpl(Attributor &A) override {
2115     ChangeStatus Changed = ChangeStatus::UNCHANGED;
2116     const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2117         *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2118 
2119     if (!ICVTrackingAA.isAssumedTracked())
2120       return indicatePessimisticFixpoint();
2121 
2122     for (InternalControlVar ICV : TrackableICVs) {
2123       Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2124       Optional<Value *> UniqueICVValue;
2125 
2126       auto CheckReturnInst = [&](Instruction &I) {
2127         Optional<Value *> NewReplVal =
2128             ICVTrackingAA.getReplacementValue(ICV, &I, A);
2129 
2130         // If we found a second ICV value there is no unique returned value.
2131         if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
2132           return false;
2133 
2134         UniqueICVValue = NewReplVal;
2135 
2136         return true;
2137       };
2138 
2139       if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
2140                                      /* CheckBBLivenessOnly */ true))
2141         UniqueICVValue = nullptr;
2142 
2143       if (UniqueICVValue == ReplVal)
2144         continue;
2145 
2146       ReplVal = UniqueICVValue;
2147       Changed = ChangeStatus::CHANGED;
2148     }
2149 
2150     return Changed;
2151   }
2152 };
2153 
2154 struct AAICVTrackerCallSite : AAICVTracker {
2155   AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2156       : AAICVTracker(IRP, A) {}
2157 
2158   void initialize(Attributor &A) override {
2159     Function *F = getAnchorScope();
2160     if (!F || !A.isFunctionIPOAmendable(*F))
2161       indicatePessimisticFixpoint();
2162 
2163     // We only initialize this AA for getters, so we need to know which ICV it
2164     // gets.
2165     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2166     for (InternalControlVar ICV : TrackableICVs) {
2167       auto ICVInfo = OMPInfoCache.ICVs[ICV];
2168       auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2169       if (Getter.Declaration == getAssociatedFunction()) {
2170         AssociatedICV = ICVInfo.Kind;
2171         return;
2172       }
2173     }
2174 
2175     /// Unknown ICV.
2176     indicatePessimisticFixpoint();
2177   }
2178 
2179   ChangeStatus manifest(Attributor &A) override {
2180     if (!ReplVal.hasValue() || !ReplVal.getValue())
2181       return ChangeStatus::UNCHANGED;
2182 
2183     A.changeValueAfterManifest(*getCtxI(), **ReplVal);
2184     A.deleteAfterManifest(*getCtxI());
2185 
2186     return ChangeStatus::CHANGED;
2187   }
2188 
2189   // FIXME: come up with better string.
2190   const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
2191 
2192   // FIXME: come up with some stats.
2193   void trackStatistics() const override {}
2194 
2195   InternalControlVar AssociatedICV;
2196   Optional<Value *> ReplVal;
2197 
2198   ChangeStatus updateImpl(Attributor &A) override {
2199     const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2200         *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2201 
2202     // We don't have any information, so we assume it changes the ICV.
2203     if (!ICVTrackingAA.isAssumedTracked())
2204       return indicatePessimisticFixpoint();
2205 
2206     Optional<Value *> NewReplVal =
2207         ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
2208 
2209     if (ReplVal == NewReplVal)
2210       return ChangeStatus::UNCHANGED;
2211 
2212     ReplVal = NewReplVal;
2213     return ChangeStatus::CHANGED;
2214   }
2215 
2216   /// Return the value with which the associated value can be replaced for the
2217   /// specific \p ICV.
2218   Optional<Value *>
2219   getUniqueReplacementValue(InternalControlVar ICV) const override {
2220     return ReplVal;
2221   }
2222 };
2223 
2224 struct AAICVTrackerCallSiteReturned : AAICVTracker {
2225   AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2226       : AAICVTracker(IRP, A) {}
2227 
2228   // FIXME: come up with better string.
2229   const std::string getAsStr() const override {
2230     return "ICVTrackerCallSiteReturned";
2231   }
2232 
2233   // FIXME: come up with some stats.
2234   void trackStatistics() const override {}
2235 
2236   /// We don't manifest anything for this AA.
2237   ChangeStatus manifest(Attributor &A) override {
2238     return ChangeStatus::UNCHANGED;
2239   }
2240 
2241   // Map of ICVs to their values at specific program points.
2242   EnumeratedArray<Optional<Value *>, InternalControlVar,
2243                   InternalControlVar::ICV___last>
2244       ICVReplacementValuesMap;
2245 
2246   /// Return the value with which associated value can be replaced for specific
2247   /// \p ICV.
2248   Optional<Value *>
2249   getUniqueReplacementValue(InternalControlVar ICV) const override {
2250     return ICVReplacementValuesMap[ICV];
2251   }
2252 
2253   ChangeStatus updateImpl(Attributor &A) override {
2254     ChangeStatus Changed = ChangeStatus::UNCHANGED;
2255     const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2256         *this, IRPosition::returned(*getAssociatedFunction()),
2257         DepClassTy::REQUIRED);
2258 
2259     // We don't have any information, so we assume it changes the ICV.
2260     if (!ICVTrackingAA.isAssumedTracked())
2261       return indicatePessimisticFixpoint();
2262 
2263     for (InternalControlVar ICV : TrackableICVs) {
2264       Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2265       Optional<Value *> NewReplVal =
2266           ICVTrackingAA.getUniqueReplacementValue(ICV);
2267 
2268       if (ReplVal == NewReplVal)
2269         continue;
2270 
2271       ReplVal = NewReplVal;
2272       Changed = ChangeStatus::CHANGED;
2273     }
2274     return Changed;
2275   }
2276 };
2277 
2278 struct AAExecutionDomainFunction : public AAExecutionDomain {
2279   AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
2280       : AAExecutionDomain(IRP, A) {}
2281 
2282   const std::string getAsStr() const override {
2283     return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) +
2284            "/" + std::to_string(NumBBs) + " BBs thread 0 only.";
2285   }
2286 
2287   /// See AbstractAttribute::trackStatistics().
2288   void trackStatistics() const override {}
2289 
2290   void initialize(Attributor &A) override {
2291     Function *F = getAnchorScope();
2292     for (const auto &BB : *F)
2293       SingleThreadedBBs.insert(&BB);
2294     NumBBs = SingleThreadedBBs.size();
2295   }
2296 
2297   ChangeStatus manifest(Attributor &A) override {
2298     LLVM_DEBUG({
2299       for (const BasicBlock *BB : SingleThreadedBBs)
2300         dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
2301                << BB->getName() << " is executed by a single thread.\n";
2302     });
2303     return ChangeStatus::UNCHANGED;
2304   }
2305 
2306   ChangeStatus updateImpl(Attributor &A) override;
2307 
2308   /// Check if an instruction is executed by a single thread.
2309   bool isExecutedByInitialThreadOnly(const Instruction &I) const override {
2310     return isExecutedByInitialThreadOnly(*I.getParent());
2311   }
2312 
2313   bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
2314     return isValidState() && SingleThreadedBBs.contains(&BB);
2315   }
2316 
2317   /// Set of basic blocks that are executed by a single thread.
2318   DenseSet<const BasicBlock *> SingleThreadedBBs;
2319 
2320   /// Total number of basic blocks in this function.
2321   long unsigned NumBBs;
2322 };
2323 
2324 ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
2325   Function *F = getAnchorScope();
2326   ReversePostOrderTraversal<Function *> RPOT(F);
2327   auto NumSingleThreadedBBs = SingleThreadedBBs.size();
2328 
2329   bool AllCallSitesKnown;
2330   auto PredForCallSite = [&](AbstractCallSite ACS) {
2331     const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
2332         *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
2333         DepClassTy::REQUIRED);
2334     return ACS.isDirectCall() &&
2335            ExecutionDomainAA.isExecutedByInitialThreadOnly(
2336                *ACS.getInstruction());
2337   };
2338 
2339   if (!A.checkForAllCallSites(PredForCallSite, *this,
2340                               /* RequiresAllCallSites */ true,
2341                               AllCallSitesKnown))
2342     SingleThreadedBBs.erase(&F->getEntryBlock());
2343 
2344   // Check if the edge into the successor block compares the result of a
2345   // thread-id intrinsic to a constant zero.
2346   // TODO: Use AAValueSimplify to simplify and propagate constants.
2347   // TODO: Check more than a single use for thread IDs.
2348   auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
2349     if (!Edge || !Edge->isConditional())
2350       return false;
2351     if (Edge->getSuccessor(0) != SuccessorBB)
2352       return false;
2353 
2354     auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
2355     if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
2356       return false;
2357 
2358     // Temporarily match the pattern generated by clang for teams regions.
2359     // TODO: Remove this once the new runtime is in place.
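    // The matched pattern is (pseudo-IR, names illustrative):
    //   %master = and (%ntid.x - 1), ~(%warpsize - 1)
    //   %cmp    = icmp eq %tid.x, %master
    // i.e., a check for the generic-mode "master" thread of the block.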
2360     ConstantInt *One, *NegOne;
2361     CmpInst::Predicate Pred;
2362     auto &&m_ThreadID = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_tid_x>();
2363     auto &&m_WarpSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_warpsize>();
2364     auto &&m_BlockSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_ntid_x>();
2365     if (match(Cmp, m_Cmp(Pred, m_ThreadID,
2366                          m_And(m_Sub(m_BlockSize, m_ConstantInt(One)),
2367                                m_Xor(m_Sub(m_WarpSize, m_ConstantInt(One)),
2368                                      m_ConstantInt(NegOne))))))
2369       if (One->isOne() && NegOne->isMinusOne() &&
2370           Pred == CmpInst::Predicate::ICMP_EQ)
2371         return true;
2372 
2373     ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
2374     if (!C || !C->isZero())
2375       return false;
2376 
2377     if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2378       if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2379         return true;
2380     if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2381       if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2382         return true;
2383 
2384     return false;
2385   };
2386 
2387   // Merge all the predecessor states into the current basic block. A basic
2388   // block is executed by a single thread if all of its predecessors are.
2389   auto MergePredecessorStates = [&](BasicBlock *BB) {
2390     if (pred_begin(BB) == pred_end(BB))
2391       return SingleThreadedBBs.contains(BB);
2392 
2393     bool IsInitialThread = true;
2394     for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB);
2395          PredBB != PredEndBB; ++PredBB) {
2396       if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
2397                               BB))
2398         IsInitialThread &= SingleThreadedBBs.contains(*PredBB);
2399     }
2400 
2401     return IsInitialThread;
2402   };
2403 
2404   for (auto *BB : RPOT) {
2405     if (!MergePredecessorStates(BB))
2406       SingleThreadedBBs.erase(BB);
2407   }
2408 
2409   return (NumSingleThreadedBBs == SingleThreadedBBs.size())
2410              ? ChangeStatus::UNCHANGED
2411              : ChangeStatus::CHANGED;
2412 }
2413 
2414 /// Try to replace memory allocation calls executed by a single thread with a
2415 /// static buffer of shared memory.
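///
/// Illustrative effect (sketch, arguments abbreviated):
///   %p = call i8* @__kmpc_alloc_shared(i64 8)
///   ...
///   call void @__kmpc_free_shared(i8* %p, ...)
/// becomes a module-level buffer in the shared address space, e.g.
///   @p_shared = internal addrspace(3) global [8 x i8] undef, align 32
/// whose (casted) address replaces %p; both runtime calls are removed.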
2416 struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
2417   using Base = StateWrapper<BooleanState, AbstractAttribute>;
2418   AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2419 
2420   /// Create an abstract attribute view for the position \p IRP.
2421   static AAHeapToShared &createForPosition(const IRPosition &IRP,
2422                                            Attributor &A);
2423 
2424   /// See AbstractAttribute::getName().
2425   const std::string getName() const override { return "AAHeapToShared"; }
2426 
2427   /// See AbstractAttribute::getIdAddr().
2428   const char *getIdAddr() const override { return &ID; }
2429 
2430   /// This function should return true if the type of the \p AA is
2431   /// AAHeapToShared.
2432   static bool classof(const AbstractAttribute *AA) {
2433     return (AA->getIdAddr() == &ID);
2434   }
2435 
2436   /// Unique ID (due to the unique address)
2437   static const char ID;
2438 };
2439 
2440 struct AAHeapToSharedFunction : public AAHeapToShared {
2441   AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
2442       : AAHeapToShared(IRP, A) {}
2443 
2444   const std::string getAsStr() const override {
2445     return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
2446            " malloc calls eligible.";
2447   }
2448 
2449   /// See AbstractAttribute::trackStatistics().
2450   void trackStatistics() const override {}
2451 
2452   void initialize(Attributor &A) override {
2453     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2454     auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2455 
2456     for (User *U : RFI.Declaration->users())
2457       if (CallBase *CB = dyn_cast<CallBase>(U))
2458         MallocCalls.insert(CB);
2459   }
2460 
2461   ChangeStatus manifest(Attributor &A) override {
2462     if (MallocCalls.empty())
2463       return ChangeStatus::UNCHANGED;
2464 
2465     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2466     auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
2467 
2468     Function *F = getAnchorScope();
2469     auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
2470                                             DepClassTy::OPTIONAL);
2471 
2472     ChangeStatus Changed = ChangeStatus::UNCHANGED;
2473     for (CallBase *CB : MallocCalls) {
2474       // Skip replacing this if HeapToStack has already claimed it.
2475       if (HS && HS->isKnownHeapToStack(*CB))
2476         continue;
2477 
2478       // Find the unique free call to remove it.
2479       SmallVector<CallBase *, 4> FreeCalls;
2480       for (auto *U : CB->users()) {
2481         CallBase *C = dyn_cast<CallBase>(U);
2482         if (C && C->getCalledFunction() == FreeCall.Declaration)
2483           FreeCalls.push_back(C);
2484       }
2485       if (FreeCalls.size() != 1)
2486         continue;
2487 
2488       ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
2489 
2490       LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in "
2491                         << CB->getCaller()->getName() << " with "
2492                         << AllocSize->getZExtValue()
2493                         << " bytes of shared memory\n");
2494 
2495       // Create a new shared memory buffer of the same size as the allocation
2496       // and replace all the uses of the original allocation with it.
2497       Module *M = CB->getModule();
2498       Type *Int8Ty = Type::getInt8Ty(M->getContext());
2499       Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
2500       auto *SharedMem = new GlobalVariable(
2501           *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
2502           UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
2503           GlobalValue::NotThreadLocal,
2504           static_cast<unsigned>(AddressSpace::Shared));
2505       auto *NewBuffer =
2506           ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
2507 
2508       auto Remark = [&](OptimizationRemark OR) {
2509         return OR << "Replaced globalized variable with "
2510                   << ore::NV("SharedMemory", AllocSize->getZExtValue())
2511                   << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ")
2512                   << "of shared memory";
2513       };
2514       A.emitRemark<OptimizationRemark>(CB, "OpenMPReplaceGlobalization",
2515                                        Remark);
2516 
2517       SharedMem->setAlignment(MaybeAlign(32));
2518 
2519       A.changeValueAfterManifest(*CB, *NewBuffer);
2520       A.deleteAfterManifest(*CB);
2521       A.deleteAfterManifest(*FreeCalls.front());
2522 
2523       NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
2524       Changed = ChangeStatus::CHANGED;
2525     }
2526 
2527     return Changed;
2528   }
2529 
2530   ChangeStatus updateImpl(Attributor &A) override {
2531     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2532     auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2533     Function *F = getAnchorScope();
2534 
2535     auto NumMallocCalls = MallocCalls.size();
2536 
2537     // Only consider malloc calls executed by a single thread with a constant size.
2538     for (User *U : RFI.Declaration->users()) {
2539       const auto &ED = A.getAAFor<AAExecutionDomain>(
2540           *this, IRPosition::function(*F), DepClassTy::REQUIRED);
2541       if (CallBase *CB = dyn_cast<CallBase>(U))
2542         if (!isa<ConstantInt>(CB->getArgOperand(0)) ||
2543             !ED.isExecutedByInitialThreadOnly(*CB))
2544           MallocCalls.erase(CB);
2545     }
2546 
2547     if (NumMallocCalls != MallocCalls.size())
2548       return ChangeStatus::CHANGED;
2549 
2550     return ChangeStatus::UNCHANGED;
2551   }
2552 
2553   /// Collection of all malloc calls in a function.
2554   SmallPtrSet<CallBase *, 4> MallocCalls;
2555 };
2556 
2557 } // namespace
2558 
2559 const char AAICVTracker::ID = 0;
2560 const char AAExecutionDomain::ID = 0;
2561 const char AAHeapToShared::ID = 0;
2562 
2563 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
2564                                               Attributor &A) {
2565   AAICVTracker *AA = nullptr;
2566   switch (IRP.getPositionKind()) {
2567   case IRPosition::IRP_INVALID:
2568   case IRPosition::IRP_FLOAT:
2569   case IRPosition::IRP_ARGUMENT:
2570   case IRPosition::IRP_CALL_SITE_ARGUMENT:
2571     llvm_unreachable("ICVTracker can only be created for function position!");
2572   case IRPosition::IRP_RETURNED:
2573     AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
2574     break;
2575   case IRPosition::IRP_CALL_SITE_RETURNED:
2576     AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
2577     break;
2578   case IRPosition::IRP_CALL_SITE:
2579     AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
2580     break;
2581   case IRPosition::IRP_FUNCTION:
2582     AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
2583     break;
2584   }
2585 
2586   return *AA;
2587 }
2588 
2589 AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
2590                                                         Attributor &A) {
2591   AAExecutionDomainFunction *AA = nullptr;
2592   switch (IRP.getPositionKind()) {
2593   case IRPosition::IRP_INVALID:
2594   case IRPosition::IRP_FLOAT:
2595   case IRPosition::IRP_ARGUMENT:
2596   case IRPosition::IRP_CALL_SITE_ARGUMENT:
2597   case IRPosition::IRP_RETURNED:
2598   case IRPosition::IRP_CALL_SITE_RETURNED:
2599   case IRPosition::IRP_CALL_SITE:
2600     llvm_unreachable(
2601         "AAExecutionDomain can only be created for function position!");
2602   case IRPosition::IRP_FUNCTION:
2603     AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
2604     break;
2605   }
2606 
2607   return *AA;
2608 }
2609 
2610 AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
2611                                                   Attributor &A) {
2612   AAHeapToSharedFunction *AA = nullptr;
2613   switch (IRP.getPositionKind()) {
2614   case IRPosition::IRP_INVALID:
2615   case IRPosition::IRP_FLOAT:
2616   case IRPosition::IRP_ARGUMENT:
2617   case IRPosition::IRP_CALL_SITE_ARGUMENT:
2618   case IRPosition::IRP_RETURNED:
2619   case IRPosition::IRP_CALL_SITE_RETURNED:
2620   case IRPosition::IRP_CALL_SITE:
2621     llvm_unreachable(
2622         "AAHeapToShared can only be created for function position!");
2623   case IRPosition::IRP_FUNCTION:
2624     AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
2625     break;
2626   }
2627 
2628   return *AA;
2629 }
2630 
2631 PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
2632   if (!containsOpenMP(M))
2633     return PreservedAnalyses::all();
2634   if (DisableOpenMPOptimizations)
2635     return PreservedAnalyses::all();
2636 
2637   FunctionAnalysisManager &FAM =
2638       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
2639   KernelSet Kernels = getDeviceKernels(M);
2640 
2641   auto IsCalled = [&](Function &F) {
2642     if (Kernels.contains(&F))
2643       return true;
2644     for (const User *U : F.users())
2645       if (!isa<BlockAddress>(U))
2646         return true;
2647     return false;
2648   };
2649 
2650   auto EmitRemark = [&](Function &F) {
2651     auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
2652     ORE.emit([&]() {
2653       OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "InternalizationFailure", &F);
2654       return ORA << "Could not internalize function. "
2655                  << "Some optimizations may not be possible.";
2656     });
2657   };
2658 
2659   // Create internal copies of each function if this is a kernel module. This
2660   // allows interprocedural passes to see every call edge.
2661   DenseSet<const Function *> InternalizedFuncs;
2662   if (isOpenMPDevice(M))
2663     for (Function &F : M)
2664       if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F)) {
2665         if (Attributor::internalizeFunction(F, /* Force */ true)) {
2666           InternalizedFuncs.insert(&F);
2667         } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
2668           EmitRemark(F);
2669         }
2670       }
2671 
2672   // Look at every function in the Module unless it was internalized.
2673   SmallVector<Function *, 16> SCC;
2674   for (Function &F : M)
2675     if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
2676       SCC.push_back(&F);
2677 
2678   if (SCC.empty())
2679     return PreservedAnalyses::all();
2680 
2681   AnalysisGetter AG(FAM);
2682 
2683   auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
2684     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
2685   };
2686 
2687   BumpPtrAllocator Allocator;
2688   CallGraphUpdater CGUpdater;
2689 
2690   SetVector<Function *> Functions(SCC.begin(), SCC.end());
2691   OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);
2692 
2693   unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
2694   Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false,
2695                MaxFixpointIterations, OREGetter, DEBUG_TYPE);
2696 
2697   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2698   bool Changed = OMPOpt.run(true);
2699   if (Changed)
2700     return PreservedAnalyses::none();
2701 
2702   return PreservedAnalyses::all();
2703 }
2704 
2705 PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
2706                                           CGSCCAnalysisManager &AM,
2707                                           LazyCallGraph &CG,
2708                                           CGSCCUpdateResult &UR) {
2709   if (!containsOpenMP(*C.begin()->getFunction().getParent()))
2710     return PreservedAnalyses::all();
2711   if (DisableOpenMPOptimizations)
2712     return PreservedAnalyses::all();
2713 
2714   SmallVector<Function *, 16> SCC;
2715   // If there are kernels in the module, we have to run on all SCCs.
2716   for (LazyCallGraph::Node &N : C) {
2717     Function *Fn = &N.getFunction();
2718     SCC.push_back(Fn);
2719   }
2720 
2721   if (SCC.empty())
2722     return PreservedAnalyses::all();
2723 
2724   Module &M = *C.begin()->getFunction().getParent();
2725 
2726   KernelSet Kernels = getDeviceKernels(M);
2727 
2728   FunctionAnalysisManager &FAM =
2729       AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
2730 
2731   AnalysisGetter AG(FAM);
2732 
2733   auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
2734     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
2735   };
2736 
2737   BumpPtrAllocator Allocator;
2738   CallGraphUpdater CGUpdater;
2739   CGUpdater.initialize(CG, C, AM, UR);
2740 
2741   SetVector<Function *> Functions(SCC.begin(), SCC.end());
2742   OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
2743                                 /*CGSCC*/ Functions, Kernels);
2744 
2745   unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
2746   Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
2747                MaxFixpointIterations, OREGetter, DEBUG_TYPE);
2748 
2749   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2750   bool Changed = OMPOpt.run(false);
  if (Changed)
    return PreservedAnalyses::none();

  return PreservedAnalyses::all();
}

namespace {

struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
  CallGraphUpdater CGUpdater;
  static char ID;

  OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) {
    initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  bool runOnSCC(CallGraphSCC &CGSCC) override {
    if (!containsOpenMP(CGSCC.getCallGraph().getModule()))
      return false;
    if (DisableOpenMPOptimizations || skipSCC(CGSCC))
      return false;

    SmallVector<Function *, 16> SCC;
    // If there are kernels in the module, we have to run on all SCC's.
    for (CallGraphNode *CGN : CGSCC) {
      Function *Fn = CGN->getFunction();
      if (!Fn || Fn->isDeclaration())
        continue;
      SCC.push_back(Fn);
    }

    if (SCC.empty())
      return false;

    Module &M = CGSCC.getCallGraph().getModule();
    KernelSet Kernels = getDeviceKernels(M);

    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
    CGUpdater.initialize(CG, CGSCC);

    // Maintain a map of functions so we do not rebuild the
    // OptimizationRemarkEmitter for the same function more than once.
    DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
    auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
      std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
      if (!ORE)
        ORE = std::make_unique<OptimizationRemarkEmitter>(F);
      return *ORE;
    };

    AnalysisGetter AG;
    SetVector<Function *> Functions(SCC.begin(), SCC.end());
    BumpPtrAllocator Allocator;
    OMPInformationCache InfoCache(M, AG, Allocator,
                                  /*CGSCC*/ Functions, Kernels);

    unsigned MaxFixpointIterations = isOpenMPDevice(M) ? 128 : 32;
    Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
                 MaxFixpointIterations, OREGetter, DEBUG_TYPE);

    OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
    return OMPOpt.run(/*IsModulePass=*/false);
  }

  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
};

} // end anonymous namespace

KernelSet llvm::omp::getDeviceKernels(Module &M) {
  // TODO: Create a more cross-platform way of determining device kernels.
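  // Device kernels are announced via "nvvm.annotations" entries of the form
  // !{<kernel function>, !"kernel", i32 1}; only the first two operands are
  // inspected below.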
  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
  KernelSet Kernels;

  if (!MD)
    return Kernels;

  for (auto *Op : MD->operands()) {
    if (Op->getNumOperands() < 2)
      continue;
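    // Operand 1 names the annotation kind; we only care about "kernel"
    // annotations, whose operand 0 is the kernel function itself.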
    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
    if (!KindID || KindID->getString() != "kernel")
      continue;

    Function *KernelFn =
        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
    if (!KernelFn)
      continue;

    ++NumOpenMPTargetRegionKernels;

    Kernels.insert(KernelFn);
  }

  return Kernels;
}

bool llvm::omp::containsOpenMP(Module &M) {
  // The frontend records the presence of OpenMP constructs in the "openmp"
  // module flag; only its presence matters here.
  return M.getModuleFlag("openmp") != nullptr;
}

bool llvm::omp::isOpenMPDevice(Module &M) {
  // Device (target offload) modules additionally carry the "openmp-device"
  // module flag.
  return M.getModuleFlag("openmp-device") != nullptr;
}

char OpenMPOptCGSCCLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
                      "OpenMP specific optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
                    "OpenMP specific optimizations", false, false)

Pass *llvm::createOpenMPOptCGSCCLegacyPass() {
  return new OpenMPOptCGSCCLegacyPass();
}