//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// OpenMP specific optimizations:
//
// - Deduplication of runtime calls, e.g., omp_get_thread_num.
// - Replacing globalized device memory with stack memory.
// - Replacing globalized device memory with shared memory.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/OpenMPOpt.h"

#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

using namespace llvm::PatternMatch;
using namespace llvm;
using namespace omp;

#define DEBUG_TYPE "openmp-opt"

// Command line switches controlling the pass. All default to false.
static cl::opt<bool> DisableOpenMPOptimizations(
    "openmp-opt-disable", cl::ZeroOrMore,
    cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> EnableParallelRegionMerging(
    "openmp-opt-enable-merging", cl::ZeroOrMore,
    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                    cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                        cl::init(false), cl::Hidden);

static cl::opt<bool> HideMemoryTransferLatency(
    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"
             " transfers"),
    cl::Hidden, cl::init(false));

STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
          "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
          "Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
          "Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");
STATISTIC(NumBytesMovedToSharedMemory,
          "Amount of memory pushed to shared memory");

// Prefix for debug output; only defined when assertions are enabled.
#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
#endif

namespace {

enum class AddressSpace : unsigned {
  Generic = 0,
  Global = 1,
946fc51c9fSJoseph Huber Shared = 3, 956fc51c9fSJoseph Huber Constant = 4, 966fc51c9fSJoseph Huber Local = 5, 976fc51c9fSJoseph Huber }; 986fc51c9fSJoseph Huber 996fc51c9fSJoseph Huber struct AAHeapToShared; 1006fc51c9fSJoseph Huber 101b8235d2bSsstefan1 struct AAICVTracker; 102b8235d2bSsstefan1 1037cfd267cSsstefan1 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for 1047cfd267cSsstefan1 /// Attributor runs. 1057cfd267cSsstefan1 struct OMPInformationCache : public InformationCache { 1067cfd267cSsstefan1 OMPInformationCache(Module &M, AnalysisGetter &AG, 107624d34afSJohannes Doerfert BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, 108e8039ad4SJohannes Doerfert SmallPtrSetImpl<Kernel> &Kernels) 109624d34afSJohannes Doerfert : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), 110624d34afSJohannes Doerfert Kernels(Kernels) { 111624d34afSJohannes Doerfert 11261238d26Ssstefan1 OMPBuilder.initialize(); 1139548b74aSJohannes Doerfert initializeRuntimeFunctions(); 1140f426935Ssstefan1 initializeInternalControlVars(); 1159548b74aSJohannes Doerfert } 1169548b74aSJohannes Doerfert 1170f426935Ssstefan1 /// Generic information that describes an internal control variable. 1180f426935Ssstefan1 struct InternalControlVarInfo { 1190f426935Ssstefan1 /// The kind, as described by InternalControlVar enum. 1200f426935Ssstefan1 InternalControlVar Kind; 1210f426935Ssstefan1 1220f426935Ssstefan1 /// The name of the ICV. 1230f426935Ssstefan1 StringRef Name; 1240f426935Ssstefan1 1250f426935Ssstefan1 /// Environment variable associated with this ICV. 1260f426935Ssstefan1 StringRef EnvVarName; 1270f426935Ssstefan1 1280f426935Ssstefan1 /// Initial value kind. 1290f426935Ssstefan1 ICVInitValue InitKind; 1300f426935Ssstefan1 1310f426935Ssstefan1 /// Initial value. 1320f426935Ssstefan1 ConstantInt *InitValue; 1330f426935Ssstefan1 1340f426935Ssstefan1 /// Setter RTL function associated with this ICV. 
1350f426935Ssstefan1 RuntimeFunction Setter; 1360f426935Ssstefan1 1370f426935Ssstefan1 /// Getter RTL function associated with this ICV. 1380f426935Ssstefan1 RuntimeFunction Getter; 1390f426935Ssstefan1 1400f426935Ssstefan1 /// RTL Function corresponding to the override clause of this ICV 1410f426935Ssstefan1 RuntimeFunction Clause; 1420f426935Ssstefan1 }; 1430f426935Ssstefan1 1449548b74aSJohannes Doerfert /// Generic information that describes a runtime function 1459548b74aSJohannes Doerfert struct RuntimeFunctionInfo { 1468855fec3SJohannes Doerfert 1479548b74aSJohannes Doerfert /// The kind, as described by the RuntimeFunction enum. 1489548b74aSJohannes Doerfert RuntimeFunction Kind; 1499548b74aSJohannes Doerfert 1509548b74aSJohannes Doerfert /// The name of the function. 1519548b74aSJohannes Doerfert StringRef Name; 1529548b74aSJohannes Doerfert 1539548b74aSJohannes Doerfert /// Flag to indicate a variadic function. 1549548b74aSJohannes Doerfert bool IsVarArg; 1559548b74aSJohannes Doerfert 1569548b74aSJohannes Doerfert /// The return type of the function. 1579548b74aSJohannes Doerfert Type *ReturnType; 1589548b74aSJohannes Doerfert 1599548b74aSJohannes Doerfert /// The argument types of the function. 1609548b74aSJohannes Doerfert SmallVector<Type *, 8> ArgumentTypes; 1619548b74aSJohannes Doerfert 1629548b74aSJohannes Doerfert /// The declaration if available. 163f09f4b26SJohannes Doerfert Function *Declaration = nullptr; 1649548b74aSJohannes Doerfert 1659548b74aSJohannes Doerfert /// Uses of this runtime function per function containing the use. 1668855fec3SJohannes Doerfert using UseVector = SmallVector<Use *, 16>; 1678855fec3SJohannes Doerfert 168b8235d2bSsstefan1 /// Clear UsesMap for runtime function. 169b8235d2bSsstefan1 void clearUsesMap() { UsesMap.clear(); } 170b8235d2bSsstefan1 17154bd3751SJohannes Doerfert /// Boolean conversion that is true if the runtime function was found. 
17254bd3751SJohannes Doerfert operator bool() const { return Declaration; } 17354bd3751SJohannes Doerfert 1748855fec3SJohannes Doerfert /// Return the vector of uses in function \p F. 1758855fec3SJohannes Doerfert UseVector &getOrCreateUseVector(Function *F) { 176b8235d2bSsstefan1 std::shared_ptr<UseVector> &UV = UsesMap[F]; 1778855fec3SJohannes Doerfert if (!UV) 178b8235d2bSsstefan1 UV = std::make_shared<UseVector>(); 1798855fec3SJohannes Doerfert return *UV; 1808855fec3SJohannes Doerfert } 1818855fec3SJohannes Doerfert 1828855fec3SJohannes Doerfert /// Return the vector of uses in function \p F or `nullptr` if there are 1838855fec3SJohannes Doerfert /// none. 1848855fec3SJohannes Doerfert const UseVector *getUseVector(Function &F) const { 18595e57072SDavid Blaikie auto I = UsesMap.find(&F); 18695e57072SDavid Blaikie if (I != UsesMap.end()) 18795e57072SDavid Blaikie return I->second.get(); 18895e57072SDavid Blaikie return nullptr; 1898855fec3SJohannes Doerfert } 1908855fec3SJohannes Doerfert 1918855fec3SJohannes Doerfert /// Return how many functions contain uses of this runtime function. 1928855fec3SJohannes Doerfert size_t getNumFunctionsWithUses() const { return UsesMap.size(); } 1939548b74aSJohannes Doerfert 1949548b74aSJohannes Doerfert /// Return the number of arguments (or the minimal number for variadic 1959548b74aSJohannes Doerfert /// functions). 1969548b74aSJohannes Doerfert size_t getNumArgs() const { return ArgumentTypes.size(); } 1979548b74aSJohannes Doerfert 1989548b74aSJohannes Doerfert /// Run the callback \p CB on each use and forget the use if the result is 1999548b74aSJohannes Doerfert /// true. The callback will be fed the function in which the use was 2009548b74aSJohannes Doerfert /// encountered as second argument. 
201624d34afSJohannes Doerfert void foreachUse(SmallVectorImpl<Function *> &SCC, 202624d34afSJohannes Doerfert function_ref<bool(Use &, Function &)> CB) { 203624d34afSJohannes Doerfert for (Function *F : SCC) 204624d34afSJohannes Doerfert foreachUse(CB, F); 205e099c7b6Ssstefan1 } 206e099c7b6Ssstefan1 207e099c7b6Ssstefan1 /// Run the callback \p CB on each use within the function \p F and forget 208e099c7b6Ssstefan1 /// the use if the result is true. 209624d34afSJohannes Doerfert void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) { 2108855fec3SJohannes Doerfert SmallVector<unsigned, 8> ToBeDeleted; 2119548b74aSJohannes Doerfert ToBeDeleted.clear(); 212e099c7b6Ssstefan1 2138855fec3SJohannes Doerfert unsigned Idx = 0; 214624d34afSJohannes Doerfert UseVector &UV = getOrCreateUseVector(F); 215e099c7b6Ssstefan1 2168855fec3SJohannes Doerfert for (Use *U : UV) { 217e099c7b6Ssstefan1 if (CB(*U, *F)) 2188855fec3SJohannes Doerfert ToBeDeleted.push_back(Idx); 2198855fec3SJohannes Doerfert ++Idx; 2208855fec3SJohannes Doerfert } 2218855fec3SJohannes Doerfert 2228855fec3SJohannes Doerfert // Remove the to-be-deleted indices in reverse order as prior 223b726c557SJohannes Doerfert // modifications will not modify the smaller indices. 2248855fec3SJohannes Doerfert while (!ToBeDeleted.empty()) { 2258855fec3SJohannes Doerfert unsigned Idx = ToBeDeleted.pop_back_val(); 2268855fec3SJohannes Doerfert UV[Idx] = UV.back(); 2278855fec3SJohannes Doerfert UV.pop_back(); 2289548b74aSJohannes Doerfert } 2299548b74aSJohannes Doerfert } 2308855fec3SJohannes Doerfert 2318855fec3SJohannes Doerfert private: 2328855fec3SJohannes Doerfert /// Map from functions to all uses of this runtime function contained in 2338855fec3SJohannes Doerfert /// them. 
234b8235d2bSsstefan1 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap; 2359548b74aSJohannes Doerfert }; 2369548b74aSJohannes Doerfert 2377cfd267cSsstefan1 /// An OpenMP-IR-Builder instance 2387cfd267cSsstefan1 OpenMPIRBuilder OMPBuilder; 2397cfd267cSsstefan1 2407cfd267cSsstefan1 /// Map from runtime function kind to the runtime function description. 2417cfd267cSsstefan1 EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction, 2427cfd267cSsstefan1 RuntimeFunction::OMPRTL___last> 2437cfd267cSsstefan1 RFIs; 2447cfd267cSsstefan1 2450f426935Ssstefan1 /// Map from ICV kind to the ICV description. 2460f426935Ssstefan1 EnumeratedArray<InternalControlVarInfo, InternalControlVar, 2470f426935Ssstefan1 InternalControlVar::ICV___last> 2480f426935Ssstefan1 ICVs; 2490f426935Ssstefan1 2500f426935Ssstefan1 /// Helper to initialize all internal control variable information for those 2510f426935Ssstefan1 /// defined in OMPKinds.def. 2520f426935Ssstefan1 void initializeInternalControlVars() { 2530f426935Ssstefan1 #define ICV_RT_SET(_Name, RTL) \ 2540f426935Ssstefan1 { \ 2550f426935Ssstefan1 auto &ICV = ICVs[_Name]; \ 2560f426935Ssstefan1 ICV.Setter = RTL; \ 2570f426935Ssstefan1 } 2580f426935Ssstefan1 #define ICV_RT_GET(Name, RTL) \ 2590f426935Ssstefan1 { \ 2600f426935Ssstefan1 auto &ICV = ICVs[Name]; \ 2610f426935Ssstefan1 ICV.Getter = RTL; \ 2620f426935Ssstefan1 } 2630f426935Ssstefan1 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \ 2640f426935Ssstefan1 { \ 2650f426935Ssstefan1 auto &ICV = ICVs[Enum]; \ 2660f426935Ssstefan1 ICV.Name = _Name; \ 2670f426935Ssstefan1 ICV.Kind = Enum; \ 2680f426935Ssstefan1 ICV.InitKind = Init; \ 2690f426935Ssstefan1 ICV.EnvVarName = _EnvVarName; \ 2700f426935Ssstefan1 switch (ICV.InitKind) { \ 271951e43f3Ssstefan1 case ICV_IMPLEMENTATION_DEFINED: \ 2720f426935Ssstefan1 ICV.InitValue = nullptr; \ 2730f426935Ssstefan1 break; \ 274951e43f3Ssstefan1 case ICV_ZERO: \ 2756aab27baSsstefan1 ICV.InitValue = ConstantInt::get( \ 2766aab27baSsstefan1 
Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \ 2770f426935Ssstefan1 break; \ 278951e43f3Ssstefan1 case ICV_FALSE: \ 2796aab27baSsstefan1 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \ 2800f426935Ssstefan1 break; \ 281951e43f3Ssstefan1 case ICV_LAST: \ 2820f426935Ssstefan1 break; \ 2830f426935Ssstefan1 } \ 2840f426935Ssstefan1 } 2850f426935Ssstefan1 #include "llvm/Frontend/OpenMP/OMPKinds.def" 2860f426935Ssstefan1 } 2870f426935Ssstefan1 2887cfd267cSsstefan1 /// Returns true if the function declaration \p F matches the runtime 2897cfd267cSsstefan1 /// function types, that is, return type \p RTFRetType, and argument types 2907cfd267cSsstefan1 /// \p RTFArgTypes. 2917cfd267cSsstefan1 static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, 2927cfd267cSsstefan1 SmallVector<Type *, 8> &RTFArgTypes) { 2937cfd267cSsstefan1 // TODO: We should output information to the user (under debug output 2947cfd267cSsstefan1 // and via remarks). 2957cfd267cSsstefan1 2967cfd267cSsstefan1 if (!F) 2977cfd267cSsstefan1 return false; 2987cfd267cSsstefan1 if (F->getReturnType() != RTFRetType) 2997cfd267cSsstefan1 return false; 3007cfd267cSsstefan1 if (F->arg_size() != RTFArgTypes.size()) 3017cfd267cSsstefan1 return false; 3027cfd267cSsstefan1 3037cfd267cSsstefan1 auto RTFTyIt = RTFArgTypes.begin(); 3047cfd267cSsstefan1 for (Argument &Arg : F->args()) { 3057cfd267cSsstefan1 if (Arg.getType() != *RTFTyIt) 3067cfd267cSsstefan1 return false; 3077cfd267cSsstefan1 3087cfd267cSsstefan1 ++RTFTyIt; 3097cfd267cSsstefan1 } 3107cfd267cSsstefan1 3117cfd267cSsstefan1 return true; 3127cfd267cSsstefan1 } 3137cfd267cSsstefan1 314b726c557SJohannes Doerfert // Helper to collect all uses of the declaration in the UsesMap. 
315b8235d2bSsstefan1 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) { 3167cfd267cSsstefan1 unsigned NumUses = 0; 3177cfd267cSsstefan1 if (!RFI.Declaration) 3187cfd267cSsstefan1 return NumUses; 3197cfd267cSsstefan1 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration); 3207cfd267cSsstefan1 321b8235d2bSsstefan1 if (CollectStats) { 3227cfd267cSsstefan1 NumOpenMPRuntimeFunctionsIdentified += 1; 3237cfd267cSsstefan1 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses(); 324b8235d2bSsstefan1 } 3257cfd267cSsstefan1 3267cfd267cSsstefan1 // TODO: We directly convert uses into proper calls and unknown uses. 3277cfd267cSsstefan1 for (Use &U : RFI.Declaration->uses()) { 3287cfd267cSsstefan1 if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { 3297cfd267cSsstefan1 if (ModuleSlice.count(UserI->getFunction())) { 3307cfd267cSsstefan1 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); 3317cfd267cSsstefan1 ++NumUses; 3327cfd267cSsstefan1 } 3337cfd267cSsstefan1 } else { 3347cfd267cSsstefan1 RFI.getOrCreateUseVector(nullptr).push_back(&U); 3357cfd267cSsstefan1 ++NumUses; 3367cfd267cSsstefan1 } 3377cfd267cSsstefan1 } 3387cfd267cSsstefan1 return NumUses; 339b8235d2bSsstefan1 } 3407cfd267cSsstefan1 34197517055SGiorgis Georgakoudis // Helper function to recollect uses of a runtime function. 34297517055SGiorgis Georgakoudis void recollectUsesForFunction(RuntimeFunction RTF) { 34397517055SGiorgis Georgakoudis auto &RFI = RFIs[RTF]; 344b8235d2bSsstefan1 RFI.clearUsesMap(); 345b8235d2bSsstefan1 collectUses(RFI, /*CollectStats*/ false); 346b8235d2bSsstefan1 } 34797517055SGiorgis Georgakoudis 34897517055SGiorgis Georgakoudis // Helper function to recollect uses of all runtime functions. 
34997517055SGiorgis Georgakoudis void recollectUses() { 35097517055SGiorgis Georgakoudis for (int Idx = 0; Idx < RFIs.size(); ++Idx) 35197517055SGiorgis Georgakoudis recollectUsesForFunction(static_cast<RuntimeFunction>(Idx)); 352b8235d2bSsstefan1 } 353b8235d2bSsstefan1 354b8235d2bSsstefan1 /// Helper to initialize all runtime function information for those defined 355b8235d2bSsstefan1 /// in OpenMPKinds.def. 356b8235d2bSsstefan1 void initializeRuntimeFunctions() { 3577cfd267cSsstefan1 Module &M = *((*ModuleSlice.begin())->getParent()); 3587cfd267cSsstefan1 3596aab27baSsstefan1 // Helper macros for handling __VA_ARGS__ in OMP_RTL 3606aab27baSsstefan1 #define OMP_TYPE(VarName, ...) \ 3616aab27baSsstefan1 Type *VarName = OMPBuilder.VarName; \ 3626aab27baSsstefan1 (void)VarName; 3636aab27baSsstefan1 3646aab27baSsstefan1 #define OMP_ARRAY_TYPE(VarName, ...) \ 3656aab27baSsstefan1 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \ 3666aab27baSsstefan1 (void)VarName##Ty; \ 3676aab27baSsstefan1 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \ 3686aab27baSsstefan1 (void)VarName##PtrTy; 3696aab27baSsstefan1 3706aab27baSsstefan1 #define OMP_FUNCTION_TYPE(VarName, ...) \ 3716aab27baSsstefan1 FunctionType *VarName = OMPBuilder.VarName; \ 3726aab27baSsstefan1 (void)VarName; \ 3736aab27baSsstefan1 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 3746aab27baSsstefan1 (void)VarName##Ptr; 3756aab27baSsstefan1 3766aab27baSsstefan1 #define OMP_STRUCT_TYPE(VarName, ...) \ 3776aab27baSsstefan1 StructType *VarName = OMPBuilder.VarName; \ 3786aab27baSsstefan1 (void)VarName; \ 3796aab27baSsstefan1 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 3806aab27baSsstefan1 (void)VarName##Ptr; 3816aab27baSsstefan1 3827cfd267cSsstefan1 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) 
\ 3837cfd267cSsstefan1 { \ 3847cfd267cSsstefan1 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \ 3857cfd267cSsstefan1 Function *F = M.getFunction(_Name); \ 3866aab27baSsstefan1 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \ 3877cfd267cSsstefan1 auto &RFI = RFIs[_Enum]; \ 3887cfd267cSsstefan1 RFI.Kind = _Enum; \ 3897cfd267cSsstefan1 RFI.Name = _Name; \ 3907cfd267cSsstefan1 RFI.IsVarArg = _IsVarArg; \ 3916aab27baSsstefan1 RFI.ReturnType = OMPBuilder._ReturnType; \ 3927cfd267cSsstefan1 RFI.ArgumentTypes = std::move(ArgsTypes); \ 3937cfd267cSsstefan1 RFI.Declaration = F; \ 394b8235d2bSsstefan1 unsigned NumUses = collectUses(RFI); \ 3957cfd267cSsstefan1 (void)NumUses; \ 3967cfd267cSsstefan1 LLVM_DEBUG({ \ 3977cfd267cSsstefan1 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \ 3987cfd267cSsstefan1 << " found\n"; \ 3997cfd267cSsstefan1 if (RFI.Declaration) \ 4007cfd267cSsstefan1 dbgs() << TAG << "-> got " << NumUses << " uses in " \ 4017cfd267cSsstefan1 << RFI.getNumFunctionsWithUses() \ 4027cfd267cSsstefan1 << " different functions.\n"; \ 4037cfd267cSsstefan1 }); \ 4047cfd267cSsstefan1 } \ 4057cfd267cSsstefan1 } 4067cfd267cSsstefan1 #include "llvm/Frontend/OpenMP/OMPKinds.def" 4077cfd267cSsstefan1 4087cfd267cSsstefan1 // TODO: We should attach the attributes defined in OMPKinds.def. 4097cfd267cSsstefan1 } 410e8039ad4SJohannes Doerfert 411e8039ad4SJohannes Doerfert /// Collection of known kernels (\see Kernel) in the module. 412e8039ad4SJohannes Doerfert SmallPtrSetImpl<Kernel> &Kernels; 4137cfd267cSsstefan1 }; 4147cfd267cSsstefan1 4158931add6SHamilton Tobon Mosquera /// Used to map the values physically (in the IR) stored in an offload 4168931add6SHamilton Tobon Mosquera /// array, to a vector in memory. 4178931add6SHamilton Tobon Mosquera struct OffloadArray { 4188931add6SHamilton Tobon Mosquera /// Physical array (in the IR). 
4198931add6SHamilton Tobon Mosquera AllocaInst *Array = nullptr; 4208931add6SHamilton Tobon Mosquera /// Mapped values. 4218931add6SHamilton Tobon Mosquera SmallVector<Value *, 8> StoredValues; 4228931add6SHamilton Tobon Mosquera /// Last stores made in the offload array. 4238931add6SHamilton Tobon Mosquera SmallVector<StoreInst *, 8> LastAccesses; 4248931add6SHamilton Tobon Mosquera 4258931add6SHamilton Tobon Mosquera OffloadArray() = default; 4268931add6SHamilton Tobon Mosquera 4278931add6SHamilton Tobon Mosquera /// Initializes the OffloadArray with the values stored in \p Array before 4288931add6SHamilton Tobon Mosquera /// instruction \p Before is reached. Returns false if the initialization 4298931add6SHamilton Tobon Mosquera /// fails. 4308931add6SHamilton Tobon Mosquera /// This MUST be used immediately after the construction of the object. 4318931add6SHamilton Tobon Mosquera bool initialize(AllocaInst &Array, Instruction &Before) { 4328931add6SHamilton Tobon Mosquera if (!Array.getAllocatedType()->isArrayTy()) 4338931add6SHamilton Tobon Mosquera return false; 4348931add6SHamilton Tobon Mosquera 4358931add6SHamilton Tobon Mosquera if (!getValues(Array, Before)) 4368931add6SHamilton Tobon Mosquera return false; 4378931add6SHamilton Tobon Mosquera 4388931add6SHamilton Tobon Mosquera this->Array = &Array; 4398931add6SHamilton Tobon Mosquera return true; 4408931add6SHamilton Tobon Mosquera } 4418931add6SHamilton Tobon Mosquera 442da8bec47SJoseph Huber static const unsigned DeviceIDArgNum = 1; 443da8bec47SJoseph Huber static const unsigned BasePtrsArgNum = 3; 444da8bec47SJoseph Huber static const unsigned PtrsArgNum = 4; 445da8bec47SJoseph Huber static const unsigned SizesArgNum = 5; 4461d3d9b9cSHamilton Tobon Mosquera 4478931add6SHamilton Tobon Mosquera private: 4488931add6SHamilton Tobon Mosquera /// Traverses the BasicBlock where \p Array is, collecting the stores made to 4498931add6SHamilton Tobon Mosquera /// \p Array, leaving StoredValues with the values 
stored before the 4508931add6SHamilton Tobon Mosquera /// instruction \p Before is reached. 4518931add6SHamilton Tobon Mosquera bool getValues(AllocaInst &Array, Instruction &Before) { 4528931add6SHamilton Tobon Mosquera // Initialize container. 453d08d490aSJohannes Doerfert const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements(); 4548931add6SHamilton Tobon Mosquera StoredValues.assign(NumValues, nullptr); 4558931add6SHamilton Tobon Mosquera LastAccesses.assign(NumValues, nullptr); 4568931add6SHamilton Tobon Mosquera 4578931add6SHamilton Tobon Mosquera // TODO: This assumes the instruction \p Before is in the same 4588931add6SHamilton Tobon Mosquera // BasicBlock as Array. Make it general, for any control flow graph. 4598931add6SHamilton Tobon Mosquera BasicBlock *BB = Array.getParent(); 4608931add6SHamilton Tobon Mosquera if (BB != Before.getParent()) 4618931add6SHamilton Tobon Mosquera return false; 4628931add6SHamilton Tobon Mosquera 4638931add6SHamilton Tobon Mosquera const DataLayout &DL = Array.getModule()->getDataLayout(); 4648931add6SHamilton Tobon Mosquera const unsigned int PointerSize = DL.getPointerSize(); 4658931add6SHamilton Tobon Mosquera 4668931add6SHamilton Tobon Mosquera for (Instruction &I : *BB) { 4678931add6SHamilton Tobon Mosquera if (&I == &Before) 4688931add6SHamilton Tobon Mosquera break; 4698931add6SHamilton Tobon Mosquera 4708931add6SHamilton Tobon Mosquera if (!isa<StoreInst>(&I)) 4718931add6SHamilton Tobon Mosquera continue; 4728931add6SHamilton Tobon Mosquera 4738931add6SHamilton Tobon Mosquera auto *S = cast<StoreInst>(&I); 4748931add6SHamilton Tobon Mosquera int64_t Offset = -1; 475d08d490aSJohannes Doerfert auto *Dst = 476d08d490aSJohannes Doerfert GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL); 4778931add6SHamilton Tobon Mosquera if (Dst == &Array) { 4788931add6SHamilton Tobon Mosquera int64_t Idx = Offset / PointerSize; 4798931add6SHamilton Tobon Mosquera StoredValues[Idx] = 
getUnderlyingObject(S->getValueOperand()); 4808931add6SHamilton Tobon Mosquera LastAccesses[Idx] = S; 4818931add6SHamilton Tobon Mosquera } 4828931add6SHamilton Tobon Mosquera } 4838931add6SHamilton Tobon Mosquera 4848931add6SHamilton Tobon Mosquera return isFilled(); 4858931add6SHamilton Tobon Mosquera } 4868931add6SHamilton Tobon Mosquera 4878931add6SHamilton Tobon Mosquera /// Returns true if all values in StoredValues and 4888931add6SHamilton Tobon Mosquera /// LastAccesses are not nullptrs. 4898931add6SHamilton Tobon Mosquera bool isFilled() { 4908931add6SHamilton Tobon Mosquera const unsigned NumValues = StoredValues.size(); 4918931add6SHamilton Tobon Mosquera for (unsigned I = 0; I < NumValues; ++I) { 4928931add6SHamilton Tobon Mosquera if (!StoredValues[I] || !LastAccesses[I]) 4938931add6SHamilton Tobon Mosquera return false; 4948931add6SHamilton Tobon Mosquera } 4958931add6SHamilton Tobon Mosquera 4968931add6SHamilton Tobon Mosquera return true; 4978931add6SHamilton Tobon Mosquera } 4988931add6SHamilton Tobon Mosquera }; 4998931add6SHamilton Tobon Mosquera 5007cfd267cSsstefan1 struct OpenMPOpt { 5017cfd267cSsstefan1 5027cfd267cSsstefan1 using OptimizationRemarkGetter = 5037cfd267cSsstefan1 function_ref<OptimizationRemarkEmitter &(Function *)>; 5047cfd267cSsstefan1 5057cfd267cSsstefan1 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater, 5067cfd267cSsstefan1 OptimizationRemarkGetter OREGetter, 507b8235d2bSsstefan1 OMPInformationCache &OMPInfoCache, Attributor &A) 50877b79d79SMehdi Amini : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater), 509b8235d2bSsstefan1 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {} 5107cfd267cSsstefan1 511a2281419SJoseph Huber /// Check if any remarks are enabled for openmp-opt 512a2281419SJoseph Huber bool remarksEnabled() { 513a2281419SJoseph Huber auto &Ctx = M.getContext(); 514a2281419SJoseph Huber return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE); 515a2281419SJoseph Huber 
} 516a2281419SJoseph Huber 5179548b74aSJohannes Doerfert /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. 518b2ad63d3SJoseph Huber bool run(bool IsModulePass) { 51954bd3751SJohannes Doerfert if (SCC.empty()) 52054bd3751SJohannes Doerfert return false; 52154bd3751SJohannes Doerfert 5229548b74aSJohannes Doerfert bool Changed = false; 5239548b74aSJohannes Doerfert 5249548b74aSJohannes Doerfert LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() 52577b79d79SMehdi Amini << " functions in a slice with " 52677b79d79SMehdi Amini << OMPInfoCache.ModuleSlice.size() << " functions\n"); 5279548b74aSJohannes Doerfert 528b2ad63d3SJoseph Huber if (IsModulePass) { 52918283125SJoseph Huber Changed |= runAttributor(); 53018283125SJoseph Huber 5316fc51c9fSJoseph Huber // Recollect uses, in case Attributor deleted any. 5326fc51c9fSJoseph Huber OMPInfoCache.recollectUses(); 5336fc51c9fSJoseph Huber 534b2ad63d3SJoseph Huber if (remarksEnabled()) 535b2ad63d3SJoseph Huber analysisGlobalization(); 536b2ad63d3SJoseph Huber } else { 537e8039ad4SJohannes Doerfert if (PrintICVValues) 538e8039ad4SJohannes Doerfert printICVs(); 539e8039ad4SJohannes Doerfert if (PrintOpenMPKernels) 540e8039ad4SJohannes Doerfert printKernels(); 541e8039ad4SJohannes Doerfert 5425b0581aeSJohannes Doerfert Changed |= rewriteDeviceCodeStateMachine(); 5435b0581aeSJohannes Doerfert 544e8039ad4SJohannes Doerfert Changed |= runAttributor(); 545e8039ad4SJohannes Doerfert 546e8039ad4SJohannes Doerfert // Recollect uses, in case Attributor deleted any. 
547e8039ad4SJohannes Doerfert OMPInfoCache.recollectUses(); 548e8039ad4SJohannes Doerfert 549e8039ad4SJohannes Doerfert Changed |= deleteParallelRegions(); 550496f8e5bSHamilton Tobon Mosquera if (HideMemoryTransferLatency) 551496f8e5bSHamilton Tobon Mosquera Changed |= hideMemTransfersLatency(); 5523a6bfcf2SGiorgis Georgakoudis Changed |= deduplicateRuntimeCalls(); 5533a6bfcf2SGiorgis Georgakoudis if (EnableParallelRegionMerging) { 5543a6bfcf2SGiorgis Georgakoudis if (mergeParallelRegions()) { 5553a6bfcf2SGiorgis Georgakoudis deduplicateRuntimeCalls(); 5563a6bfcf2SGiorgis Georgakoudis Changed = true; 5573a6bfcf2SGiorgis Georgakoudis } 5583a6bfcf2SGiorgis Georgakoudis } 559b2ad63d3SJoseph Huber } 560e8039ad4SJohannes Doerfert 561e8039ad4SJohannes Doerfert return Changed; 562e8039ad4SJohannes Doerfert } 563e8039ad4SJohannes Doerfert 5640f426935Ssstefan1 /// Print initial ICV values for testing. 5650f426935Ssstefan1 /// FIXME: This should be done from the Attributor once it is added. 566e8039ad4SJohannes Doerfert void printICVs() const { 567cb9cfa0dSsstefan1 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel, 568cb9cfa0dSsstefan1 ICV_proc_bind}; 5690f426935Ssstefan1 5700f426935Ssstefan1 for (Function *F : OMPInfoCache.ModuleSlice) { 5710f426935Ssstefan1 for (auto ICV : ICVs) { 5720f426935Ssstefan1 auto ICVInfo = OMPInfoCache.ICVs[ICV]; 5732db182ffSJoseph Huber auto Remark = [&](OptimizationRemarkAnalysis ORA) { 5742db182ffSJoseph Huber return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name) 5750f426935Ssstefan1 << " Value: " 5760f426935Ssstefan1 << (ICVInfo.InitValue 57761cdaf66SSimon Pilgrim ? 
toString(ICVInfo.InitValue->getValue(), 10, true) 5780f426935Ssstefan1 : "IMPLEMENTATION_DEFINED"); 5790f426935Ssstefan1 }; 5800f426935Ssstefan1 5812db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark); 5820f426935Ssstefan1 } 5830f426935Ssstefan1 } 5840f426935Ssstefan1 } 5850f426935Ssstefan1 586e8039ad4SJohannes Doerfert /// Print OpenMP GPU kernels for testing. 587e8039ad4SJohannes Doerfert void printKernels() const { 588e8039ad4SJohannes Doerfert for (Function *F : SCC) { 589e8039ad4SJohannes Doerfert if (!OMPInfoCache.Kernels.count(F)) 590e8039ad4SJohannes Doerfert continue; 591b8235d2bSsstefan1 5922db182ffSJoseph Huber auto Remark = [&](OptimizationRemarkAnalysis ORA) { 5932db182ffSJoseph Huber return ORA << "OpenMP GPU kernel " 594e8039ad4SJohannes Doerfert << ore::NV("OpenMPGPUKernel", F->getName()) << "\n"; 595e8039ad4SJohannes Doerfert }; 596b8235d2bSsstefan1 5972db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark); 598e8039ad4SJohannes Doerfert } 5999548b74aSJohannes Doerfert } 6009548b74aSJohannes Doerfert 6017cfd267cSsstefan1 /// Return the call if \p U is a callee use in a regular call. If \p RFI is 6027cfd267cSsstefan1 /// given it has to be the callee or a nullptr is returned. 6037cfd267cSsstefan1 static CallInst *getCallIfRegularCall( 6047cfd267cSsstefan1 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 6057cfd267cSsstefan1 CallInst *CI = dyn_cast<CallInst>(U.getUser()); 6067cfd267cSsstefan1 if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && 6077cfd267cSsstefan1 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 6087cfd267cSsstefan1 return CI; 6097cfd267cSsstefan1 return nullptr; 6107cfd267cSsstefan1 } 6117cfd267cSsstefan1 6127cfd267cSsstefan1 /// Return the call if \p V is a regular call. If \p RFI is given it has to be 6137cfd267cSsstefan1 /// the callee or a nullptr is returned. 
6147cfd267cSsstefan1 static CallInst *getCallIfRegularCall( 6157cfd267cSsstefan1 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 6167cfd267cSsstefan1 CallInst *CI = dyn_cast<CallInst>(&V); 6177cfd267cSsstefan1 if (CI && !CI->hasOperandBundles() && 6187cfd267cSsstefan1 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 6197cfd267cSsstefan1 return CI; 6207cfd267cSsstefan1 return nullptr; 6217cfd267cSsstefan1 } 6227cfd267cSsstefan1 6239548b74aSJohannes Doerfert private: 6243a6bfcf2SGiorgis Georgakoudis /// Merge parallel regions when it is safe. 6253a6bfcf2SGiorgis Georgakoudis bool mergeParallelRegions() { 6263a6bfcf2SGiorgis Georgakoudis const unsigned CallbackCalleeOperand = 2; 6273a6bfcf2SGiorgis Georgakoudis const unsigned CallbackFirstArgOperand = 3; 6283a6bfcf2SGiorgis Georgakoudis using InsertPointTy = OpenMPIRBuilder::InsertPointTy; 6293a6bfcf2SGiorgis Georgakoudis 6303a6bfcf2SGiorgis Georgakoudis // Check if there are any __kmpc_fork_call calls to merge. 6313a6bfcf2SGiorgis Georgakoudis OMPInformationCache::RuntimeFunctionInfo &RFI = 6323a6bfcf2SGiorgis Georgakoudis OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 6333a6bfcf2SGiorgis Georgakoudis 6343a6bfcf2SGiorgis Georgakoudis if (!RFI.Declaration) 6353a6bfcf2SGiorgis Georgakoudis return false; 6363a6bfcf2SGiorgis Georgakoudis 63797517055SGiorgis Georgakoudis // Unmergable calls that prevent merging a parallel region. 
63897517055SGiorgis Georgakoudis OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = { 63997517055SGiorgis Georgakoudis OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind], 64097517055SGiorgis Georgakoudis OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads], 64197517055SGiorgis Georgakoudis }; 6423a6bfcf2SGiorgis Georgakoudis 6433a6bfcf2SGiorgis Georgakoudis bool Changed = false; 6443a6bfcf2SGiorgis Georgakoudis LoopInfo *LI = nullptr; 6453a6bfcf2SGiorgis Georgakoudis DominatorTree *DT = nullptr; 6463a6bfcf2SGiorgis Georgakoudis 6473a6bfcf2SGiorgis Georgakoudis SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap; 6483a6bfcf2SGiorgis Georgakoudis 6493a6bfcf2SGiorgis Georgakoudis BasicBlock *StartBB = nullptr, *EndBB = nullptr; 6503a6bfcf2SGiorgis Georgakoudis auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, 6513a6bfcf2SGiorgis Georgakoudis BasicBlock &ContinuationIP) { 6523a6bfcf2SGiorgis Georgakoudis BasicBlock *CGStartBB = CodeGenIP.getBlock(); 6533a6bfcf2SGiorgis Georgakoudis BasicBlock *CGEndBB = 6543a6bfcf2SGiorgis Georgakoudis SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); 6553a6bfcf2SGiorgis Georgakoudis assert(StartBB != nullptr && "StartBB should not be null"); 6563a6bfcf2SGiorgis Georgakoudis CGStartBB->getTerminator()->setSuccessor(0, StartBB); 6573a6bfcf2SGiorgis Georgakoudis assert(EndBB != nullptr && "EndBB should not be null"); 6583a6bfcf2SGiorgis Georgakoudis EndBB->getTerminator()->setSuccessor(0, CGEndBB); 6593a6bfcf2SGiorgis Georgakoudis }; 6603a6bfcf2SGiorgis Georgakoudis 661240dd924SAlex Zinenko auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &, 662240dd924SAlex Zinenko Value &Inner, Value *&ReplacementValue) -> InsertPointTy { 663240dd924SAlex Zinenko ReplacementValue = &Inner; 6643a6bfcf2SGiorgis Georgakoudis return CodeGenIP; 6653a6bfcf2SGiorgis Georgakoudis }; 6663a6bfcf2SGiorgis Georgakoudis 6673a6bfcf2SGiorgis Georgakoudis auto FiniCB = [&](InsertPointTy 
CodeGenIP) {}; 6683a6bfcf2SGiorgis Georgakoudis 66997517055SGiorgis Georgakoudis /// Create a sequential execution region within a merged parallel region, 67097517055SGiorgis Georgakoudis /// encapsulated in a master construct with a barrier for synchronization. 67197517055SGiorgis Georgakoudis auto CreateSequentialRegion = [&](Function *OuterFn, 67297517055SGiorgis Georgakoudis BasicBlock *OuterPredBB, 67397517055SGiorgis Georgakoudis Instruction *SeqStartI, 67497517055SGiorgis Georgakoudis Instruction *SeqEndI) { 67597517055SGiorgis Georgakoudis // Isolate the instructions of the sequential region to a separate 67697517055SGiorgis Georgakoudis // block. 67797517055SGiorgis Georgakoudis BasicBlock *ParentBB = SeqStartI->getParent(); 67897517055SGiorgis Georgakoudis BasicBlock *SeqEndBB = 67997517055SGiorgis Georgakoudis SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI); 68097517055SGiorgis Georgakoudis BasicBlock *SeqAfterBB = 68197517055SGiorgis Georgakoudis SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI); 68297517055SGiorgis Georgakoudis BasicBlock *SeqStartBB = 68397517055SGiorgis Georgakoudis SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged"); 68497517055SGiorgis Georgakoudis 68597517055SGiorgis Georgakoudis assert(ParentBB->getUniqueSuccessor() == SeqStartBB && 68697517055SGiorgis Georgakoudis "Expected a different CFG"); 68797517055SGiorgis Georgakoudis const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); 68897517055SGiorgis Georgakoudis ParentBB->getTerminator()->eraseFromParent(); 68997517055SGiorgis Georgakoudis 69097517055SGiorgis Georgakoudis auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, 69197517055SGiorgis Georgakoudis BasicBlock &ContinuationIP) { 69297517055SGiorgis Georgakoudis BasicBlock *CGStartBB = CodeGenIP.getBlock(); 69397517055SGiorgis Georgakoudis BasicBlock *CGEndBB = 69497517055SGiorgis Georgakoudis SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); 69597517055SGiorgis 
Georgakoudis assert(SeqStartBB != nullptr && "SeqStartBB should not be null"); 69697517055SGiorgis Georgakoudis CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB); 69797517055SGiorgis Georgakoudis assert(SeqEndBB != nullptr && "SeqEndBB should not be null"); 69897517055SGiorgis Georgakoudis SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB); 69997517055SGiorgis Georgakoudis }; 70097517055SGiorgis Georgakoudis auto FiniCB = [&](InsertPointTy CodeGenIP) {}; 70197517055SGiorgis Georgakoudis 70297517055SGiorgis Georgakoudis // Find outputs from the sequential region to outside users and 70397517055SGiorgis Georgakoudis // broadcast their values to them. 70497517055SGiorgis Georgakoudis for (Instruction &I : *SeqStartBB) { 70597517055SGiorgis Georgakoudis SmallPtrSet<Instruction *, 4> OutsideUsers; 70697517055SGiorgis Georgakoudis for (User *Usr : I.users()) { 70797517055SGiorgis Georgakoudis Instruction &UsrI = *cast<Instruction>(Usr); 70897517055SGiorgis Georgakoudis // Ignore outputs to LT intrinsics, code extraction for the merged 70997517055SGiorgis Georgakoudis // parallel region will fix them. 71097517055SGiorgis Georgakoudis if (UsrI.isLifetimeStartOrEnd()) 71197517055SGiorgis Georgakoudis continue; 71297517055SGiorgis Georgakoudis 71397517055SGiorgis Georgakoudis if (UsrI.getParent() != SeqStartBB) 71497517055SGiorgis Georgakoudis OutsideUsers.insert(&UsrI); 71597517055SGiorgis Georgakoudis } 71697517055SGiorgis Georgakoudis 71797517055SGiorgis Georgakoudis if (OutsideUsers.empty()) 71897517055SGiorgis Georgakoudis continue; 71997517055SGiorgis Georgakoudis 72097517055SGiorgis Georgakoudis // Emit an alloca in the outer region to store the broadcasted 72197517055SGiorgis Georgakoudis // value. 
72297517055SGiorgis Georgakoudis const DataLayout &DL = M.getDataLayout(); 72397517055SGiorgis Georgakoudis AllocaInst *AllocaI = new AllocaInst( 72497517055SGiorgis Georgakoudis I.getType(), DL.getAllocaAddrSpace(), nullptr, 72597517055SGiorgis Georgakoudis I.getName() + ".seq.output.alloc", &OuterFn->front().front()); 72697517055SGiorgis Georgakoudis 72797517055SGiorgis Georgakoudis // Emit a store instruction in the sequential BB to update the 72897517055SGiorgis Georgakoudis // value. 72997517055SGiorgis Georgakoudis new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()); 73097517055SGiorgis Georgakoudis 73197517055SGiorgis Georgakoudis // Emit a load instruction and replace the use of the output value 73297517055SGiorgis Georgakoudis // with it. 73397517055SGiorgis Georgakoudis for (Instruction *UsrI : OutsideUsers) { 7345b70c12fSJohannes Doerfert LoadInst *LoadI = new LoadInst( 7355b70c12fSJohannes Doerfert I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI); 73697517055SGiorgis Georgakoudis UsrI->replaceUsesOfWith(&I, LoadI); 73797517055SGiorgis Georgakoudis } 73897517055SGiorgis Georgakoudis } 73997517055SGiorgis Georgakoudis 74097517055SGiorgis Georgakoudis OpenMPIRBuilder::LocationDescription Loc( 74197517055SGiorgis Georgakoudis InsertPointTy(ParentBB, ParentBB->end()), DL); 74297517055SGiorgis Georgakoudis InsertPointTy SeqAfterIP = 74397517055SGiorgis Georgakoudis OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB); 74497517055SGiorgis Georgakoudis 74597517055SGiorgis Georgakoudis OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel); 74697517055SGiorgis Georgakoudis 74797517055SGiorgis Georgakoudis BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock()); 74897517055SGiorgis Georgakoudis 74997517055SGiorgis Georgakoudis LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn 75097517055SGiorgis Georgakoudis << "\n"); 75197517055SGiorgis Georgakoudis }; 75297517055SGiorgis Georgakoudis 7533a6bfcf2SGiorgis 
Georgakoudis // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all 7543a6bfcf2SGiorgis Georgakoudis // contained in BB and only separated by instructions that can be 7553a6bfcf2SGiorgis Georgakoudis // redundantly executed in parallel. The block BB is split before the first 7563a6bfcf2SGiorgis Georgakoudis // call (in MergableCIs) and after the last so the entire region we merge 7573a6bfcf2SGiorgis Georgakoudis // into a single parallel region is contained in a single basic block 7583a6bfcf2SGiorgis Georgakoudis // without any other instructions. We use the OpenMPIRBuilder to outline 7593a6bfcf2SGiorgis Georgakoudis // that block and call the resulting function via __kmpc_fork_call. 7603a6bfcf2SGiorgis Georgakoudis auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) { 7613a6bfcf2SGiorgis Georgakoudis // TODO: Change the interface to allow single CIs expanded, e.g, to 7623a6bfcf2SGiorgis Georgakoudis // include an outer loop. 7633a6bfcf2SGiorgis Georgakoudis assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); 7643a6bfcf2SGiorgis Georgakoudis 7653a6bfcf2SGiorgis Georgakoudis auto Remark = [&](OptimizationRemark OR) { 7663a6bfcf2SGiorgis Georgakoudis OR << "Parallel region at " 7673a6bfcf2SGiorgis Georgakoudis << ore::NV("OpenMPParallelMergeFront", 7683a6bfcf2SGiorgis Georgakoudis MergableCIs.front()->getDebugLoc()) 7693a6bfcf2SGiorgis Georgakoudis << " merged with parallel regions at "; 77023b0ab2aSKazu Hirata for (auto *CI : llvm::drop_begin(MergableCIs)) { 7713a6bfcf2SGiorgis Georgakoudis OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()); 7723a6bfcf2SGiorgis Georgakoudis if (CI != MergableCIs.back()) 7733a6bfcf2SGiorgis Georgakoudis OR << ", "; 7743a6bfcf2SGiorgis Georgakoudis } 7753a6bfcf2SGiorgis Georgakoudis return OR; 7763a6bfcf2SGiorgis Georgakoudis }; 7773a6bfcf2SGiorgis Georgakoudis 7783a6bfcf2SGiorgis Georgakoudis emitRemark<OptimizationRemark>(MergableCIs.front(), 7793a6bfcf2SGiorgis 
Georgakoudis "OpenMPParallelRegionMerging", Remark); 7803a6bfcf2SGiorgis Georgakoudis 7813a6bfcf2SGiorgis Georgakoudis Function *OriginalFn = BB->getParent(); 7823a6bfcf2SGiorgis Georgakoudis LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size() 7833a6bfcf2SGiorgis Georgakoudis << " parallel regions in " << OriginalFn->getName() 7843a6bfcf2SGiorgis Georgakoudis << "\n"); 7853a6bfcf2SGiorgis Georgakoudis 7863a6bfcf2SGiorgis Georgakoudis // Isolate the calls to merge in a separate block. 7873a6bfcf2SGiorgis Georgakoudis EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI); 7883a6bfcf2SGiorgis Georgakoudis BasicBlock *AfterBB = 7893a6bfcf2SGiorgis Georgakoudis SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI); 7903a6bfcf2SGiorgis Georgakoudis StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, 7913a6bfcf2SGiorgis Georgakoudis "omp.par.merged"); 7923a6bfcf2SGiorgis Georgakoudis 7933a6bfcf2SGiorgis Georgakoudis assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG"); 7943a6bfcf2SGiorgis Georgakoudis const DebugLoc DL = BB->getTerminator()->getDebugLoc(); 7953a6bfcf2SGiorgis Georgakoudis BB->getTerminator()->eraseFromParent(); 7963a6bfcf2SGiorgis Georgakoudis 79797517055SGiorgis Georgakoudis // Create sequential regions for sequential instructions that are 79897517055SGiorgis Georgakoudis // in-between mergable parallel regions. 79997517055SGiorgis Georgakoudis for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1; 80097517055SGiorgis Georgakoudis It != End; ++It) { 80197517055SGiorgis Georgakoudis Instruction *ForkCI = *It; 80297517055SGiorgis Georgakoudis Instruction *NextForkCI = *(It + 1); 80397517055SGiorgis Georgakoudis 80497517055SGiorgis Georgakoudis // Continue if there are not in-between instructions. 
80597517055SGiorgis Georgakoudis if (ForkCI->getNextNode() == NextForkCI) 80697517055SGiorgis Georgakoudis continue; 80797517055SGiorgis Georgakoudis 80897517055SGiorgis Georgakoudis CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(), 80997517055SGiorgis Georgakoudis NextForkCI->getPrevNode()); 81097517055SGiorgis Georgakoudis } 81197517055SGiorgis Georgakoudis 8123a6bfcf2SGiorgis Georgakoudis OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()), 8133a6bfcf2SGiorgis Georgakoudis DL); 8143a6bfcf2SGiorgis Georgakoudis IRBuilder<>::InsertPoint AllocaIP( 8153a6bfcf2SGiorgis Georgakoudis &OriginalFn->getEntryBlock(), 8163a6bfcf2SGiorgis Georgakoudis OriginalFn->getEntryBlock().getFirstInsertionPt()); 8173a6bfcf2SGiorgis Georgakoudis // Create the merged parallel region with default proc binding, to 8183a6bfcf2SGiorgis Georgakoudis // avoid overriding binding settings, and without explicit cancellation. 819e5dba2d7SMichael Kruse InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel( 8203a6bfcf2SGiorgis Georgakoudis Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, 8213a6bfcf2SGiorgis Georgakoudis OMP_PROC_BIND_default, /* IsCancellable */ false); 8223a6bfcf2SGiorgis Georgakoudis BranchInst::Create(AfterBB, AfterIP.getBlock()); 8233a6bfcf2SGiorgis Georgakoudis 8243a6bfcf2SGiorgis Georgakoudis // Perform the actual outlining. 825b1191206SMichael Kruse OMPInfoCache.OMPBuilder.finalize(OriginalFn, 826b1191206SMichael Kruse /* AllowExtractorSinking */ true); 8273a6bfcf2SGiorgis Georgakoudis 8283a6bfcf2SGiorgis Georgakoudis Function *OutlinedFn = MergableCIs.front()->getCaller(); 8293a6bfcf2SGiorgis Georgakoudis 8303a6bfcf2SGiorgis Georgakoudis // Replace the __kmpc_fork_call calls with direct calls to the outlined 8313a6bfcf2SGiorgis Georgakoudis // callbacks. 
8323a6bfcf2SGiorgis Georgakoudis SmallVector<Value *, 8> Args; 8333a6bfcf2SGiorgis Georgakoudis for (auto *CI : MergableCIs) { 8343a6bfcf2SGiorgis Georgakoudis Value *Callee = 8353a6bfcf2SGiorgis Georgakoudis CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); 8363a6bfcf2SGiorgis Georgakoudis FunctionType *FT = 8373a6bfcf2SGiorgis Georgakoudis cast<FunctionType>(Callee->getType()->getPointerElementType()); 8383a6bfcf2SGiorgis Georgakoudis Args.clear(); 8393a6bfcf2SGiorgis Georgakoudis Args.push_back(OutlinedFn->getArg(0)); 8403a6bfcf2SGiorgis Georgakoudis Args.push_back(OutlinedFn->getArg(1)); 8413a6bfcf2SGiorgis Georgakoudis for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 8423a6bfcf2SGiorgis Georgakoudis U < E; ++U) 8433a6bfcf2SGiorgis Georgakoudis Args.push_back(CI->getArgOperand(U)); 8443a6bfcf2SGiorgis Georgakoudis 8453a6bfcf2SGiorgis Georgakoudis CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); 8463a6bfcf2SGiorgis Georgakoudis if (CI->getDebugLoc()) 8473a6bfcf2SGiorgis Georgakoudis NewCI->setDebugLoc(CI->getDebugLoc()); 8483a6bfcf2SGiorgis Georgakoudis 8493a6bfcf2SGiorgis Georgakoudis // Forward parameter attributes from the callback to the callee. 8503a6bfcf2SGiorgis Georgakoudis for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 8513a6bfcf2SGiorgis Georgakoudis U < E; ++U) 8523a6bfcf2SGiorgis Georgakoudis for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) 8533a6bfcf2SGiorgis Georgakoudis NewCI->addParamAttr( 8543a6bfcf2SGiorgis Georgakoudis U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); 8553a6bfcf2SGiorgis Georgakoudis 8563a6bfcf2SGiorgis Georgakoudis // Emit an explicit barrier to replace the implicit fork-join barrier. 8573a6bfcf2SGiorgis Georgakoudis if (CI != MergableCIs.back()) { 8583a6bfcf2SGiorgis Georgakoudis // TODO: Remove barrier if the merged parallel region includes the 8593a6bfcf2SGiorgis Georgakoudis // 'nowait' clause. 
860e5dba2d7SMichael Kruse OMPInfoCache.OMPBuilder.createBarrier( 8613a6bfcf2SGiorgis Georgakoudis InsertPointTy(NewCI->getParent(), 8623a6bfcf2SGiorgis Georgakoudis NewCI->getNextNode()->getIterator()), 8633a6bfcf2SGiorgis Georgakoudis OMPD_parallel); 8643a6bfcf2SGiorgis Georgakoudis } 8653a6bfcf2SGiorgis Georgakoudis 8663a6bfcf2SGiorgis Georgakoudis auto Remark = [&](OptimizationRemark OR) { 8673a6bfcf2SGiorgis Georgakoudis return OR << "Parallel region at " 8683a6bfcf2SGiorgis Georgakoudis << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()) 8693a6bfcf2SGiorgis Georgakoudis << " merged with " 8703a6bfcf2SGiorgis Georgakoudis << ore::NV("OpenMPParallelMergeFront", 8713a6bfcf2SGiorgis Georgakoudis MergableCIs.front()->getDebugLoc()); 8723a6bfcf2SGiorgis Georgakoudis }; 8733a6bfcf2SGiorgis Georgakoudis if (CI != MergableCIs.front()) 8743a6bfcf2SGiorgis Georgakoudis emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging", 8753a6bfcf2SGiorgis Georgakoudis Remark); 8763a6bfcf2SGiorgis Georgakoudis 8773a6bfcf2SGiorgis Georgakoudis CI->eraseFromParent(); 8783a6bfcf2SGiorgis Georgakoudis } 8793a6bfcf2SGiorgis Georgakoudis 8803a6bfcf2SGiorgis Georgakoudis assert(OutlinedFn != OriginalFn && "Outlining failed"); 8817fea561eSArthur Eubanks CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn); 8823a6bfcf2SGiorgis Georgakoudis CGUpdater.reanalyzeFunction(*OriginalFn); 8833a6bfcf2SGiorgis Georgakoudis 8843a6bfcf2SGiorgis Georgakoudis NumOpenMPParallelRegionsMerged += MergableCIs.size(); 8853a6bfcf2SGiorgis Georgakoudis 8863a6bfcf2SGiorgis Georgakoudis return true; 8873a6bfcf2SGiorgis Georgakoudis }; 8883a6bfcf2SGiorgis Georgakoudis 8893a6bfcf2SGiorgis Georgakoudis // Helper function that identifes sequences of 8903a6bfcf2SGiorgis Georgakoudis // __kmpc_fork_call uses in a basic block. 
8913a6bfcf2SGiorgis Georgakoudis auto DetectPRsCB = [&](Use &U, Function &F) { 8923a6bfcf2SGiorgis Georgakoudis CallInst *CI = getCallIfRegularCall(U, &RFI); 8933a6bfcf2SGiorgis Georgakoudis BB2PRMap[CI->getParent()].insert(CI); 8943a6bfcf2SGiorgis Georgakoudis 8953a6bfcf2SGiorgis Georgakoudis return false; 8963a6bfcf2SGiorgis Georgakoudis }; 8973a6bfcf2SGiorgis Georgakoudis 8983a6bfcf2SGiorgis Georgakoudis BB2PRMap.clear(); 8993a6bfcf2SGiorgis Georgakoudis RFI.foreachUse(SCC, DetectPRsCB); 9003a6bfcf2SGiorgis Georgakoudis SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector; 9013a6bfcf2SGiorgis Georgakoudis // Find mergable parallel regions within a basic block that are 9023a6bfcf2SGiorgis Georgakoudis // safe to merge, that is any in-between instructions can safely 9033a6bfcf2SGiorgis Georgakoudis // execute in parallel after merging. 9043a6bfcf2SGiorgis Georgakoudis // TODO: support merging across basic-blocks. 9053a6bfcf2SGiorgis Georgakoudis for (auto &It : BB2PRMap) { 9063a6bfcf2SGiorgis Georgakoudis auto &CIs = It.getSecond(); 9073a6bfcf2SGiorgis Georgakoudis if (CIs.size() < 2) 9083a6bfcf2SGiorgis Georgakoudis continue; 9093a6bfcf2SGiorgis Georgakoudis 9103a6bfcf2SGiorgis Georgakoudis BasicBlock *BB = It.getFirst(); 9113a6bfcf2SGiorgis Georgakoudis SmallVector<CallInst *, 4> MergableCIs; 9123a6bfcf2SGiorgis Georgakoudis 91397517055SGiorgis Georgakoudis /// Returns true if the instruction is mergable, false otherwise. 91497517055SGiorgis Georgakoudis /// A terminator instruction is unmergable by definition since merging 91597517055SGiorgis Georgakoudis /// works within a BB. Instructions before the mergable region are 91697517055SGiorgis Georgakoudis /// mergable if they are not calls to OpenMP runtime functions that may 91797517055SGiorgis Georgakoudis /// set different execution parameters for subsequent parallel regions. 
91897517055SGiorgis Georgakoudis /// Instructions in-between parallel regions are mergable if they are not 91997517055SGiorgis Georgakoudis /// calls to any non-intrinsic function since that may call a non-mergable 92097517055SGiorgis Georgakoudis /// OpenMP runtime function. 92197517055SGiorgis Georgakoudis auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) { 92297517055SGiorgis Georgakoudis // We do not merge across BBs, hence return false (unmergable) if the 92397517055SGiorgis Georgakoudis // instruction is a terminator. 92497517055SGiorgis Georgakoudis if (I.isTerminator()) 92597517055SGiorgis Georgakoudis return false; 92697517055SGiorgis Georgakoudis 92797517055SGiorgis Georgakoudis if (!isa<CallInst>(&I)) 92897517055SGiorgis Georgakoudis return true; 92997517055SGiorgis Georgakoudis 93097517055SGiorgis Georgakoudis CallInst *CI = cast<CallInst>(&I); 93197517055SGiorgis Georgakoudis if (IsBeforeMergableRegion) { 93297517055SGiorgis Georgakoudis Function *CalledFunction = CI->getCalledFunction(); 93397517055SGiorgis Georgakoudis if (!CalledFunction) 93497517055SGiorgis Georgakoudis return false; 93597517055SGiorgis Georgakoudis // Return false (unmergable) if the call before the parallel 93697517055SGiorgis Georgakoudis // region calls an explicit affinity (proc_bind) or number of 93797517055SGiorgis Georgakoudis // threads (num_threads) compiler-generated function. Those settings 93897517055SGiorgis Georgakoudis // may be incompatible with following parallel regions. 93997517055SGiorgis Georgakoudis // TODO: ICV tracking to detect compatibility. 
94097517055SGiorgis Georgakoudis for (const auto &RFI : UnmergableCallsInfo) { 94197517055SGiorgis Georgakoudis if (CalledFunction == RFI.Declaration) 94297517055SGiorgis Georgakoudis return false; 94397517055SGiorgis Georgakoudis } 94497517055SGiorgis Georgakoudis } else { 94597517055SGiorgis Georgakoudis // Return false (unmergable) if there is a call instruction 94697517055SGiorgis Georgakoudis // in-between parallel regions when it is not an intrinsic. It 94797517055SGiorgis Georgakoudis // may call an unmergable OpenMP runtime function in its callpath. 94897517055SGiorgis Georgakoudis // TODO: Keep track of possible OpenMP calls in the callpath. 94997517055SGiorgis Georgakoudis if (!isa<IntrinsicInst>(CI)) 95097517055SGiorgis Georgakoudis return false; 95197517055SGiorgis Georgakoudis } 95297517055SGiorgis Georgakoudis 95397517055SGiorgis Georgakoudis return true; 95497517055SGiorgis Georgakoudis }; 9553a6bfcf2SGiorgis Georgakoudis // Find maximal number of parallel region CIs that are safe to merge. 95697517055SGiorgis Georgakoudis for (auto It = BB->begin(), End = BB->end(); It != End;) { 95797517055SGiorgis Georgakoudis Instruction &I = *It; 95897517055SGiorgis Georgakoudis ++It; 95997517055SGiorgis Georgakoudis 9603a6bfcf2SGiorgis Georgakoudis if (CIs.count(&I)) { 9613a6bfcf2SGiorgis Georgakoudis MergableCIs.push_back(cast<CallInst>(&I)); 9623a6bfcf2SGiorgis Georgakoudis continue; 9633a6bfcf2SGiorgis Georgakoudis } 9643a6bfcf2SGiorgis Georgakoudis 96597517055SGiorgis Georgakoudis // Continue expanding if the instruction is mergable. 96697517055SGiorgis Georgakoudis if (IsMergable(I, MergableCIs.empty())) 9673a6bfcf2SGiorgis Georgakoudis continue; 9683a6bfcf2SGiorgis Georgakoudis 96997517055SGiorgis Georgakoudis // Forward the instruction iterator to skip the next parallel region 97097517055SGiorgis Georgakoudis // since there is an unmergable instruction which can affect it. 
97197517055SGiorgis Georgakoudis for (; It != End; ++It) { 97297517055SGiorgis Georgakoudis Instruction &SkipI = *It; 97397517055SGiorgis Georgakoudis if (CIs.count(&SkipI)) { 97497517055SGiorgis Georgakoudis LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI 97597517055SGiorgis Georgakoudis << " due to " << I << "\n"); 97697517055SGiorgis Georgakoudis ++It; 97797517055SGiorgis Georgakoudis break; 97897517055SGiorgis Georgakoudis } 97997517055SGiorgis Georgakoudis } 98097517055SGiorgis Georgakoudis 98197517055SGiorgis Georgakoudis // Store mergable regions found. 9823a6bfcf2SGiorgis Georgakoudis if (MergableCIs.size() > 1) { 9833a6bfcf2SGiorgis Georgakoudis MergableCIsVector.push_back(MergableCIs); 9843a6bfcf2SGiorgis Georgakoudis LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size() 9853a6bfcf2SGiorgis Georgakoudis << " parallel regions in block " << BB->getName() 9863a6bfcf2SGiorgis Georgakoudis << " of function " << BB->getParent()->getName() 9873a6bfcf2SGiorgis Georgakoudis << "\n";); 9883a6bfcf2SGiorgis Georgakoudis } 9893a6bfcf2SGiorgis Georgakoudis 9903a6bfcf2SGiorgis Georgakoudis MergableCIs.clear(); 9913a6bfcf2SGiorgis Georgakoudis } 9923a6bfcf2SGiorgis Georgakoudis 9933a6bfcf2SGiorgis Georgakoudis if (!MergableCIsVector.empty()) { 9943a6bfcf2SGiorgis Georgakoudis Changed = true; 9953a6bfcf2SGiorgis Georgakoudis 9963a6bfcf2SGiorgis Georgakoudis for (auto &MergableCIs : MergableCIsVector) 9973a6bfcf2SGiorgis Georgakoudis Merge(MergableCIs, BB); 998b2ad63d3SJoseph Huber MergableCIsVector.clear(); 9993a6bfcf2SGiorgis Georgakoudis } 10003a6bfcf2SGiorgis Georgakoudis } 10013a6bfcf2SGiorgis Georgakoudis 10023a6bfcf2SGiorgis Georgakoudis if (Changed) { 100397517055SGiorgis Georgakoudis /// Re-collect use for fork calls, emitted barrier calls, and 100497517055SGiorgis Georgakoudis /// any emitted master/end_master calls. 
100597517055SGiorgis Georgakoudis OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call); 100697517055SGiorgis Georgakoudis OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier); 100797517055SGiorgis Georgakoudis OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master); 100897517055SGiorgis Georgakoudis OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master); 10093a6bfcf2SGiorgis Georgakoudis } 10103a6bfcf2SGiorgis Georgakoudis 10113a6bfcf2SGiorgis Georgakoudis return Changed; 10123a6bfcf2SGiorgis Georgakoudis } 10133a6bfcf2SGiorgis Georgakoudis 10149d38f98dSJohannes Doerfert /// Try to delete parallel regions if possible. 1015e565db49SJohannes Doerfert bool deleteParallelRegions() { 1016e565db49SJohannes Doerfert const unsigned CallbackCalleeOperand = 2; 1017e565db49SJohannes Doerfert 10187cfd267cSsstefan1 OMPInformationCache::RuntimeFunctionInfo &RFI = 10197cfd267cSsstefan1 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 10207cfd267cSsstefan1 1021e565db49SJohannes Doerfert if (!RFI.Declaration) 1022e565db49SJohannes Doerfert return false; 1023e565db49SJohannes Doerfert 1024e565db49SJohannes Doerfert bool Changed = false; 1025e565db49SJohannes Doerfert auto DeleteCallCB = [&](Use &U, Function &) { 1026e565db49SJohannes Doerfert CallInst *CI = getCallIfRegularCall(U); 1027e565db49SJohannes Doerfert if (!CI) 1028e565db49SJohannes Doerfert return false; 1029e565db49SJohannes Doerfert auto *Fn = dyn_cast<Function>( 1030e565db49SJohannes Doerfert CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts()); 1031e565db49SJohannes Doerfert if (!Fn) 1032e565db49SJohannes Doerfert return false; 1033e565db49SJohannes Doerfert if (!Fn->onlyReadsMemory()) 1034e565db49SJohannes Doerfert return false; 1035e565db49SJohannes Doerfert if (!Fn->hasFnAttribute(Attribute::WillReturn)) 1036e565db49SJohannes Doerfert return false; 1037e565db49SJohannes Doerfert 1038e565db49SJohannes Doerfert LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in " 
1039e565db49SJohannes Doerfert << CI->getCaller()->getName() << "\n"); 10404d4ea9acSHuber, Joseph 10414d4ea9acSHuber, Joseph auto Remark = [&](OptimizationRemark OR) { 10424d4ea9acSHuber, Joseph return OR << "Parallel region in " 10434d4ea9acSHuber, Joseph << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName()) 10444d4ea9acSHuber, Joseph << " deleted"; 10454d4ea9acSHuber, Joseph }; 10464d4ea9acSHuber, Joseph emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion", 10474d4ea9acSHuber, Joseph Remark); 10484d4ea9acSHuber, Joseph 1049e565db49SJohannes Doerfert CGUpdater.removeCallSite(*CI); 1050e565db49SJohannes Doerfert CI->eraseFromParent(); 1051e565db49SJohannes Doerfert Changed = true; 105255eb714aSRoman Lebedev ++NumOpenMPParallelRegionsDeleted; 1053e565db49SJohannes Doerfert return true; 1054e565db49SJohannes Doerfert }; 1055e565db49SJohannes Doerfert 1056624d34afSJohannes Doerfert RFI.foreachUse(SCC, DeleteCallCB); 1057e565db49SJohannes Doerfert 1058e565db49SJohannes Doerfert return Changed; 1059e565db49SJohannes Doerfert } 1060e565db49SJohannes Doerfert 1061b726c557SJohannes Doerfert /// Try to eliminate runtime calls by reusing existing ones. 
10629548b74aSJohannes Doerfert bool deduplicateRuntimeCalls() { 10639548b74aSJohannes Doerfert bool Changed = false; 10649548b74aSJohannes Doerfert 1065e28936f6SJohannes Doerfert RuntimeFunction DeduplicableRuntimeCallIDs[] = { 1066e28936f6SJohannes Doerfert OMPRTL_omp_get_num_threads, 1067e28936f6SJohannes Doerfert OMPRTL_omp_in_parallel, 1068e28936f6SJohannes Doerfert OMPRTL_omp_get_cancellation, 1069e28936f6SJohannes Doerfert OMPRTL_omp_get_thread_limit, 1070e28936f6SJohannes Doerfert OMPRTL_omp_get_supported_active_levels, 1071e28936f6SJohannes Doerfert OMPRTL_omp_get_level, 1072e28936f6SJohannes Doerfert OMPRTL_omp_get_ancestor_thread_num, 1073e28936f6SJohannes Doerfert OMPRTL_omp_get_team_size, 1074e28936f6SJohannes Doerfert OMPRTL_omp_get_active_level, 1075e28936f6SJohannes Doerfert OMPRTL_omp_in_final, 1076e28936f6SJohannes Doerfert OMPRTL_omp_get_proc_bind, 1077e28936f6SJohannes Doerfert OMPRTL_omp_get_num_places, 1078e28936f6SJohannes Doerfert OMPRTL_omp_get_num_procs, 1079e28936f6SJohannes Doerfert OMPRTL_omp_get_place_num, 1080e28936f6SJohannes Doerfert OMPRTL_omp_get_partition_num_places, 1081e28936f6SJohannes Doerfert OMPRTL_omp_get_partition_place_nums}; 1082e28936f6SJohannes Doerfert 1083bc93c2d7SMarek Kurdej // Global-tid is handled separately. 
10849548b74aSJohannes Doerfert SmallSetVector<Value *, 16> GTIdArgs; 10859548b74aSJohannes Doerfert collectGlobalThreadIdArguments(GTIdArgs); 10869548b74aSJohannes Doerfert LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size() 10879548b74aSJohannes Doerfert << " global thread ID arguments\n"); 10889548b74aSJohannes Doerfert 10899548b74aSJohannes Doerfert for (Function *F : SCC) { 1090e28936f6SJohannes Doerfert for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) 10914e29d256Sserge-sans-paille Changed |= deduplicateRuntimeCalls( 10924e29d256Sserge-sans-paille *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); 1093e28936f6SJohannes Doerfert 1094e28936f6SJohannes Doerfert // __kmpc_global_thread_num is special as we can replace it with an 1095e28936f6SJohannes Doerfert // argument in enough cases to make it worth trying. 10969548b74aSJohannes Doerfert Value *GTIdArg = nullptr; 10979548b74aSJohannes Doerfert for (Argument &Arg : F->args()) 10989548b74aSJohannes Doerfert if (GTIdArgs.count(&Arg)) { 10999548b74aSJohannes Doerfert GTIdArg = &Arg; 11009548b74aSJohannes Doerfert break; 11019548b74aSJohannes Doerfert } 11029548b74aSJohannes Doerfert Changed |= deduplicateRuntimeCalls( 11037cfd267cSsstefan1 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg); 11049548b74aSJohannes Doerfert } 11059548b74aSJohannes Doerfert 11069548b74aSJohannes Doerfert return Changed; 11079548b74aSJohannes Doerfert } 11089548b74aSJohannes Doerfert 1109496f8e5bSHamilton Tobon Mosquera /// Tries to hide the latency of runtime calls that involve host to 1110496f8e5bSHamilton Tobon Mosquera /// device memory transfers by splitting them into their "issue" and "wait" 1111496f8e5bSHamilton Tobon Mosquera /// versions. The "issue" is moved upwards as much as possible. The "wait" is 1112496f8e5bSHamilton Tobon Mosquera /// moved downards as much as possible. The "issue" issues the memory transfer 1113496f8e5bSHamilton Tobon Mosquera /// asynchronously, returning a handle. 
The "wait" waits in the returned 1114496f8e5bSHamilton Tobon Mosquera /// handle for the memory transfer to finish. 1115496f8e5bSHamilton Tobon Mosquera bool hideMemTransfersLatency() { 1116496f8e5bSHamilton Tobon Mosquera auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper]; 1117496f8e5bSHamilton Tobon Mosquera bool Changed = false; 1118496f8e5bSHamilton Tobon Mosquera auto SplitMemTransfers = [&](Use &U, Function &Decl) { 1119496f8e5bSHamilton Tobon Mosquera auto *RTCall = getCallIfRegularCall(U, &RFI); 1120496f8e5bSHamilton Tobon Mosquera if (!RTCall) 1121496f8e5bSHamilton Tobon Mosquera return false; 1122496f8e5bSHamilton Tobon Mosquera 11238931add6SHamilton Tobon Mosquera OffloadArray OffloadArrays[3]; 11248931add6SHamilton Tobon Mosquera if (!getValuesInOffloadArrays(*RTCall, OffloadArrays)) 11258931add6SHamilton Tobon Mosquera return false; 11268931add6SHamilton Tobon Mosquera 11278931add6SHamilton Tobon Mosquera LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays)); 11288931add6SHamilton Tobon Mosquera 1129bd2fa181SHamilton Tobon Mosquera // TODO: Check if can be moved upwards. 
1130bd2fa181SHamilton Tobon Mosquera bool WasSplit = false; 1131bd2fa181SHamilton Tobon Mosquera Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall); 1132bd2fa181SHamilton Tobon Mosquera if (WaitMovementPoint) 1133bd2fa181SHamilton Tobon Mosquera WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint); 1134bd2fa181SHamilton Tobon Mosquera 1135496f8e5bSHamilton Tobon Mosquera Changed |= WasSplit; 1136496f8e5bSHamilton Tobon Mosquera return WasSplit; 1137496f8e5bSHamilton Tobon Mosquera }; 1138496f8e5bSHamilton Tobon Mosquera RFI.foreachUse(SCC, SplitMemTransfers); 1139496f8e5bSHamilton Tobon Mosquera 1140496f8e5bSHamilton Tobon Mosquera return Changed; 1141496f8e5bSHamilton Tobon Mosquera } 1142496f8e5bSHamilton Tobon Mosquera 1143a2281419SJoseph Huber void analysisGlobalization() { 11446fc51c9fSJoseph Huber auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; 114582453e75SJoseph Huber 114682453e75SJoseph Huber auto CheckGlobalization = [&](Use &U, Function &Decl) { 1147a2281419SJoseph Huber if (CallInst *CI = getCallIfRegularCall(U, &RFI)) { 114844feacc7SJoseph Huber auto Remark = [&](OptimizationRemarkMissed ORM) { 114944feacc7SJoseph Huber return ORM 1150a2281419SJoseph Huber << "Found thread data sharing on the GPU. " 1151a2281419SJoseph Huber << "Expect degraded performance due to data globalization."; 1152a2281419SJoseph Huber }; 115344feacc7SJoseph Huber emitRemark<OptimizationRemarkMissed>(CI, "OpenMPGlobalization", Remark); 1154a2281419SJoseph Huber } 1155a2281419SJoseph Huber 1156a2281419SJoseph Huber return false; 1157a2281419SJoseph Huber }; 1158a2281419SJoseph Huber 115982453e75SJoseph Huber RFI.foreachUse(SCC, CheckGlobalization); 116082453e75SJoseph Huber } 1161a2281419SJoseph Huber 11628931add6SHamilton Tobon Mosquera /// Maps the values stored in the offload arrays passed as arguments to 11638931add6SHamilton Tobon Mosquera /// \p RuntimeCall into the offload arrays in \p OAs. 
11648931add6SHamilton Tobon Mosquera bool getValuesInOffloadArrays(CallInst &RuntimeCall, 11658931add6SHamilton Tobon Mosquera MutableArrayRef<OffloadArray> OAs) { 11668931add6SHamilton Tobon Mosquera assert(OAs.size() == 3 && "Need space for three offload arrays!"); 11678931add6SHamilton Tobon Mosquera 11688931add6SHamilton Tobon Mosquera // A runtime call that involves memory offloading looks something like: 11698931add6SHamilton Tobon Mosquera // call void @__tgt_target_data_begin_mapper(arg0, arg1, 11708931add6SHamilton Tobon Mosquera // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes, 11718931add6SHamilton Tobon Mosquera // ...) 11728931add6SHamilton Tobon Mosquera // So, the idea is to access the allocas that allocate space for these 11738931add6SHamilton Tobon Mosquera // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes. 11748931add6SHamilton Tobon Mosquera // Therefore: 11758931add6SHamilton Tobon Mosquera // i8** %offload_baseptrs. 11761d3d9b9cSHamilton Tobon Mosquera Value *BasePtrsArg = 11771d3d9b9cSHamilton Tobon Mosquera RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum); 11788931add6SHamilton Tobon Mosquera // i8** %offload_ptrs. 11791d3d9b9cSHamilton Tobon Mosquera Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum); 11808931add6SHamilton Tobon Mosquera // i8** %offload_sizes. 11811d3d9b9cSHamilton Tobon Mosquera Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum); 11828931add6SHamilton Tobon Mosquera 11838931add6SHamilton Tobon Mosquera // Get values stored in **offload_baseptrs. 
11848931add6SHamilton Tobon Mosquera auto *V = getUnderlyingObject(BasePtrsArg); 11858931add6SHamilton Tobon Mosquera if (!isa<AllocaInst>(V)) 11868931add6SHamilton Tobon Mosquera return false; 11878931add6SHamilton Tobon Mosquera auto *BasePtrsArray = cast<AllocaInst>(V); 11888931add6SHamilton Tobon Mosquera if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall)) 11898931add6SHamilton Tobon Mosquera return false; 11908931add6SHamilton Tobon Mosquera 11918931add6SHamilton Tobon Mosquera // Get values stored in **offload_baseptrs. 11928931add6SHamilton Tobon Mosquera V = getUnderlyingObject(PtrsArg); 11938931add6SHamilton Tobon Mosquera if (!isa<AllocaInst>(V)) 11948931add6SHamilton Tobon Mosquera return false; 11958931add6SHamilton Tobon Mosquera auto *PtrsArray = cast<AllocaInst>(V); 11968931add6SHamilton Tobon Mosquera if (!OAs[1].initialize(*PtrsArray, RuntimeCall)) 11978931add6SHamilton Tobon Mosquera return false; 11988931add6SHamilton Tobon Mosquera 11998931add6SHamilton Tobon Mosquera // Get values stored in **offload_sizes. 12008931add6SHamilton Tobon Mosquera V = getUnderlyingObject(SizesArg); 12018931add6SHamilton Tobon Mosquera // If it's a [constant] global array don't analyze it. 12028931add6SHamilton Tobon Mosquera if (isa<GlobalValue>(V)) 12038931add6SHamilton Tobon Mosquera return isa<Constant>(V); 12048931add6SHamilton Tobon Mosquera if (!isa<AllocaInst>(V)) 12058931add6SHamilton Tobon Mosquera return false; 12068931add6SHamilton Tobon Mosquera 12078931add6SHamilton Tobon Mosquera auto *SizesArray = cast<AllocaInst>(V); 12088931add6SHamilton Tobon Mosquera if (!OAs[2].initialize(*SizesArray, RuntimeCall)) 12098931add6SHamilton Tobon Mosquera return false; 12108931add6SHamilton Tobon Mosquera 12118931add6SHamilton Tobon Mosquera return true; 12128931add6SHamilton Tobon Mosquera } 12138931add6SHamilton Tobon Mosquera 12148931add6SHamilton Tobon Mosquera /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG. 
12158931add6SHamilton Tobon Mosquera /// For now this is a way to test that the function getValuesInOffloadArrays 12168931add6SHamilton Tobon Mosquera /// is working properly. 12178931add6SHamilton Tobon Mosquera /// TODO: Move this to a unittest when unittests are available for OpenMPOpt. 12188931add6SHamilton Tobon Mosquera void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) { 12198931add6SHamilton Tobon Mosquera assert(OAs.size() == 3 && "There are three offload arrays to debug!"); 12208931add6SHamilton Tobon Mosquera 12218931add6SHamilton Tobon Mosquera LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n"); 12228931add6SHamilton Tobon Mosquera std::string ValuesStr; 12238931add6SHamilton Tobon Mosquera raw_string_ostream Printer(ValuesStr); 12248931add6SHamilton Tobon Mosquera std::string Separator = " --- "; 12258931add6SHamilton Tobon Mosquera 12268931add6SHamilton Tobon Mosquera for (auto *BP : OAs[0].StoredValues) { 12278931add6SHamilton Tobon Mosquera BP->print(Printer); 12288931add6SHamilton Tobon Mosquera Printer << Separator; 12298931add6SHamilton Tobon Mosquera } 12308931add6SHamilton Tobon Mosquera LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n"); 12318931add6SHamilton Tobon Mosquera ValuesStr.clear(); 12328931add6SHamilton Tobon Mosquera 12338931add6SHamilton Tobon Mosquera for (auto *P : OAs[1].StoredValues) { 12348931add6SHamilton Tobon Mosquera P->print(Printer); 12358931add6SHamilton Tobon Mosquera Printer << Separator; 12368931add6SHamilton Tobon Mosquera } 12378931add6SHamilton Tobon Mosquera LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n"); 12388931add6SHamilton Tobon Mosquera ValuesStr.clear(); 12398931add6SHamilton Tobon Mosquera 12408931add6SHamilton Tobon Mosquera for (auto *S : OAs[2].StoredValues) { 12418931add6SHamilton Tobon Mosquera S->print(Printer); 12428931add6SHamilton Tobon Mosquera Printer << Separator; 12438931add6SHamilton Tobon Mosquera } 12448931add6SHamilton 
Tobon Mosquera LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n"); 12458931add6SHamilton Tobon Mosquera } 12468931add6SHamilton Tobon Mosquera 1247bd2fa181SHamilton Tobon Mosquera /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be 1248bd2fa181SHamilton Tobon Mosquera /// moved. Returns nullptr if the movement is not possible, or not worth it. 1249bd2fa181SHamilton Tobon Mosquera Instruction *canBeMovedDownwards(CallInst &RuntimeCall) { 1250bd2fa181SHamilton Tobon Mosquera // FIXME: This traverses only the BasicBlock where RuntimeCall is. 1251bd2fa181SHamilton Tobon Mosquera // Make it traverse the CFG. 1252bd2fa181SHamilton Tobon Mosquera 1253bd2fa181SHamilton Tobon Mosquera Instruction *CurrentI = &RuntimeCall; 1254bd2fa181SHamilton Tobon Mosquera bool IsWorthIt = false; 1255bd2fa181SHamilton Tobon Mosquera while ((CurrentI = CurrentI->getNextNode())) { 1256bd2fa181SHamilton Tobon Mosquera 1257bd2fa181SHamilton Tobon Mosquera // TODO: Once we detect the regions to be offloaded we should use the 1258bd2fa181SHamilton Tobon Mosquera // alias analysis manager to check if CurrentI may modify one of 1259bd2fa181SHamilton Tobon Mosquera // the offloaded regions. 1260bd2fa181SHamilton Tobon Mosquera if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) { 1261bd2fa181SHamilton Tobon Mosquera if (IsWorthIt) 1262bd2fa181SHamilton Tobon Mosquera return CurrentI; 1263bd2fa181SHamilton Tobon Mosquera 1264bd2fa181SHamilton Tobon Mosquera return nullptr; 1265bd2fa181SHamilton Tobon Mosquera } 1266bd2fa181SHamilton Tobon Mosquera 1267bd2fa181SHamilton Tobon Mosquera // FIXME: For now if we move it over anything without side effect 1268bd2fa181SHamilton Tobon Mosquera // is worth it. 1269bd2fa181SHamilton Tobon Mosquera IsWorthIt = true; 1270bd2fa181SHamilton Tobon Mosquera } 1271bd2fa181SHamilton Tobon Mosquera 1272bd2fa181SHamilton Tobon Mosquera // Return end of BasicBlock. 
1273bd2fa181SHamilton Tobon Mosquera return RuntimeCall.getParent()->getTerminator(); 1274bd2fa181SHamilton Tobon Mosquera } 1275bd2fa181SHamilton Tobon Mosquera 1276496f8e5bSHamilton Tobon Mosquera /// Splits \p RuntimeCall into its "issue" and "wait" counterparts. 1277bd2fa181SHamilton Tobon Mosquera bool splitTargetDataBeginRTC(CallInst &RuntimeCall, 1278bd2fa181SHamilton Tobon Mosquera Instruction &WaitMovementPoint) { 1279bd31abc1SHamilton Tobon Mosquera // Create stack allocated handle (__tgt_async_info) at the beginning of the 1280bd31abc1SHamilton Tobon Mosquera // function. Used for storing information of the async transfer, allowing to 1281bd31abc1SHamilton Tobon Mosquera // wait on it later. 1282496f8e5bSHamilton Tobon Mosquera auto &IRBuilder = OMPInfoCache.OMPBuilder; 1283bd31abc1SHamilton Tobon Mosquera auto *F = RuntimeCall.getCaller(); 1284bd31abc1SHamilton Tobon Mosquera Instruction *FirstInst = &(F->getEntryBlock().front()); 1285bd31abc1SHamilton Tobon Mosquera AllocaInst *Handle = new AllocaInst( 1286bd31abc1SHamilton Tobon Mosquera IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); 1287bd31abc1SHamilton Tobon Mosquera 1288496f8e5bSHamilton Tobon Mosquera // Add "issue" runtime call declaration: 1289496f8e5bSHamilton Tobon Mosquera // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, 1290496f8e5bSHamilton Tobon Mosquera // i8**, i8**, i64*, i64*) 1291496f8e5bSHamilton Tobon Mosquera FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction( 1292496f8e5bSHamilton Tobon Mosquera M, OMPRTL___tgt_target_data_begin_mapper_issue); 1293496f8e5bSHamilton Tobon Mosquera 1294496f8e5bSHamilton Tobon Mosquera // Change RuntimeCall call site for its asynchronous version. 
129597e55cfeSJoseph Huber SmallVector<Value *, 16> Args; 1296bd2fa181SHamilton Tobon Mosquera for (auto &Arg : RuntimeCall.args()) 1297496f8e5bSHamilton Tobon Mosquera Args.push_back(Arg.get()); 1298bd31abc1SHamilton Tobon Mosquera Args.push_back(Handle); 1299496f8e5bSHamilton Tobon Mosquera 1300496f8e5bSHamilton Tobon Mosquera CallInst *IssueCallsite = 1301bd31abc1SHamilton Tobon Mosquera CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); 1302bd2fa181SHamilton Tobon Mosquera RuntimeCall.eraseFromParent(); 1303496f8e5bSHamilton Tobon Mosquera 1304496f8e5bSHamilton Tobon Mosquera // Add "wait" runtime call declaration: 1305496f8e5bSHamilton Tobon Mosquera // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info) 1306496f8e5bSHamilton Tobon Mosquera FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction( 1307496f8e5bSHamilton Tobon Mosquera M, OMPRTL___tgt_target_data_begin_mapper_wait); 1308496f8e5bSHamilton Tobon Mosquera 1309496f8e5bSHamilton Tobon Mosquera Value *WaitParams[2] = { 1310da8bec47SJoseph Huber IssueCallsite->getArgOperand( 1311da8bec47SJoseph Huber OffloadArray::DeviceIDArgNum), // device_id. 1312bd31abc1SHamilton Tobon Mosquera Handle // handle to wait on. 1313496f8e5bSHamilton Tobon Mosquera }; 1314bd2fa181SHamilton Tobon Mosquera CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); 1315496f8e5bSHamilton Tobon Mosquera 1316496f8e5bSHamilton Tobon Mosquera return true; 1317496f8e5bSHamilton Tobon Mosquera } 1318496f8e5bSHamilton Tobon Mosquera 1319dc3b5b00SJohannes Doerfert static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, 1320dc3b5b00SJohannes Doerfert bool GlobalOnly, bool &SingleChoice) { 1321dc3b5b00SJohannes Doerfert if (CurrentIdent == NextIdent) 1322dc3b5b00SJohannes Doerfert return CurrentIdent; 1323dc3b5b00SJohannes Doerfert 1324396b7253SJohannes Doerfert // TODO: Figure out how to actually combine multiple debug locations. 
For 1325dc3b5b00SJohannes Doerfert // now we just keep an existing one if there is a single choice. 1326dc3b5b00SJohannes Doerfert if (!GlobalOnly || isa<GlobalValue>(NextIdent)) { 1327dc3b5b00SJohannes Doerfert SingleChoice = !CurrentIdent; 1328dc3b5b00SJohannes Doerfert return NextIdent; 1329dc3b5b00SJohannes Doerfert } 1330396b7253SJohannes Doerfert return nullptr; 1331396b7253SJohannes Doerfert } 1332396b7253SJohannes Doerfert 1333396b7253SJohannes Doerfert /// Return an `struct ident_t*` value that represents the ones used in the 1334396b7253SJohannes Doerfert /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not 1335396b7253SJohannes Doerfert /// return a local `struct ident_t*`. For now, if we cannot find a suitable 1336396b7253SJohannes Doerfert /// return value we create one from scratch. We also do not yet combine 1337396b7253SJohannes Doerfert /// information, e.g., the source locations, see combinedIdentStruct. 13387cfd267cSsstefan1 Value * 13397cfd267cSsstefan1 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI, 13407cfd267cSsstefan1 Function &F, bool GlobalOnly) { 1341dc3b5b00SJohannes Doerfert bool SingleChoice = true; 1342396b7253SJohannes Doerfert Value *Ident = nullptr; 1343396b7253SJohannes Doerfert auto CombineIdentStruct = [&](Use &U, Function &Caller) { 1344396b7253SJohannes Doerfert CallInst *CI = getCallIfRegularCall(U, &RFI); 1345396b7253SJohannes Doerfert if (!CI || &F != &Caller) 1346396b7253SJohannes Doerfert return false; 1347396b7253SJohannes Doerfert Ident = combinedIdentStruct(Ident, CI->getArgOperand(0), 1348dc3b5b00SJohannes Doerfert /* GlobalOnly */ true, SingleChoice); 1349396b7253SJohannes Doerfert return false; 1350396b7253SJohannes Doerfert }; 1351624d34afSJohannes Doerfert RFI.foreachUse(SCC, CombineIdentStruct); 1352396b7253SJohannes Doerfert 1353dc3b5b00SJohannes Doerfert if (!Ident || !SingleChoice) { 1354396b7253SJohannes Doerfert // The IRBuilder uses the insertion block to 
get to the module, this is 1355396b7253SJohannes Doerfert // unfortunate but we work around it for now. 13567cfd267cSsstefan1 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock()) 13577cfd267cSsstefan1 OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy( 1358396b7253SJohannes Doerfert &F.getEntryBlock(), F.getEntryBlock().begin())); 1359396b7253SJohannes Doerfert // Create a fallback location if non was found. 1360396b7253SJohannes Doerfert // TODO: Use the debug locations of the calls instead. 13617cfd267cSsstefan1 Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(); 13627cfd267cSsstefan1 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc); 1363396b7253SJohannes Doerfert } 1364396b7253SJohannes Doerfert return Ident; 1365396b7253SJohannes Doerfert } 1366396b7253SJohannes Doerfert 1367b726c557SJohannes Doerfert /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or 13689548b74aSJohannes Doerfert /// \p ReplVal if given. 13697cfd267cSsstefan1 bool deduplicateRuntimeCalls(Function &F, 13707cfd267cSsstefan1 OMPInformationCache::RuntimeFunctionInfo &RFI, 13719548b74aSJohannes Doerfert Value *ReplVal = nullptr) { 13728855fec3SJohannes Doerfert auto *UV = RFI.getUseVector(F); 13738855fec3SJohannes Doerfert if (!UV || UV->size() + (ReplVal != nullptr) < 2) 1374b1fbf438SRoman Lebedev return false; 1375b1fbf438SRoman Lebedev 13767cfd267cSsstefan1 LLVM_DEBUG( 13777cfd267cSsstefan1 dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name 13787cfd267cSsstefan1 << (ReplVal ? " with an existing value\n" : "\n") << "\n"); 13797cfd267cSsstefan1 1380ab3da5ddSMichael Liao assert((!ReplVal || (isa<Argument>(ReplVal) && 1381ab3da5ddSMichael Liao cast<Argument>(ReplVal)->getParent() == &F)) && 13829548b74aSJohannes Doerfert "Unexpected replacement value!"); 1383396b7253SJohannes Doerfert 1384396b7253SJohannes Doerfert // TODO: Use dominance to find a good position instead. 
13856aab27baSsstefan1 auto CanBeMoved = [this](CallBase &CB) { 1386396b7253SJohannes Doerfert unsigned NumArgs = CB.getNumArgOperands(); 1387396b7253SJohannes Doerfert if (NumArgs == 0) 1388396b7253SJohannes Doerfert return true; 13896aab27baSsstefan1 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr) 1390396b7253SJohannes Doerfert return false; 1391396b7253SJohannes Doerfert for (unsigned u = 1; u < NumArgs; ++u) 1392396b7253SJohannes Doerfert if (isa<Instruction>(CB.getArgOperand(u))) 1393396b7253SJohannes Doerfert return false; 1394396b7253SJohannes Doerfert return true; 1395396b7253SJohannes Doerfert }; 1396396b7253SJohannes Doerfert 13979548b74aSJohannes Doerfert if (!ReplVal) { 13988855fec3SJohannes Doerfert for (Use *U : *UV) 13999548b74aSJohannes Doerfert if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) { 1400396b7253SJohannes Doerfert if (!CanBeMoved(*CI)) 1401396b7253SJohannes Doerfert continue; 14024d4ea9acSHuber, Joseph 14034d4ea9acSHuber, Joseph auto Remark = [&](OptimizationRemark OR) { 14044d4ea9acSHuber, Joseph return OR << "OpenMP runtime call " 14052db182ffSJoseph Huber << ore::NV("OpenMPOptRuntime", RFI.Name) 14062db182ffSJoseph Huber << " moved to beginning of OpenMP region"; 14074d4ea9acSHuber, Joseph }; 14082db182ffSJoseph Huber emitRemark<OptimizationRemark>(&F, "OpenMPRuntimeCodeMotion", Remark); 14094d4ea9acSHuber, Joseph 14109548b74aSJohannes Doerfert CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt()); 14119548b74aSJohannes Doerfert ReplVal = CI; 14129548b74aSJohannes Doerfert break; 14139548b74aSJohannes Doerfert } 14149548b74aSJohannes Doerfert if (!ReplVal) 14159548b74aSJohannes Doerfert return false; 14169548b74aSJohannes Doerfert } 14179548b74aSJohannes Doerfert 1418396b7253SJohannes Doerfert // If we use a call as a replacement value we need to make sure the ident is 1419396b7253SJohannes Doerfert // valid at the new location. 
For now we just pick a global one, either 1420396b7253SJohannes Doerfert // existing and used by one of the calls, or created from scratch. 1421396b7253SJohannes Doerfert if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) { 1422396b7253SJohannes Doerfert if (CI->getNumArgOperands() > 0 && 14236aab27baSsstefan1 CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) { 1424396b7253SJohannes Doerfert Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F, 1425396b7253SJohannes Doerfert /* GlobalOnly */ true); 1426396b7253SJohannes Doerfert CI->setArgOperand(0, Ident); 1427396b7253SJohannes Doerfert } 1428396b7253SJohannes Doerfert } 1429396b7253SJohannes Doerfert 14309548b74aSJohannes Doerfert bool Changed = false; 14319548b74aSJohannes Doerfert auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) { 14329548b74aSJohannes Doerfert CallInst *CI = getCallIfRegularCall(U, &RFI); 14339548b74aSJohannes Doerfert if (!CI || CI == ReplVal || &F != &Caller) 14349548b74aSJohannes Doerfert return false; 14359548b74aSJohannes Doerfert assert(CI->getCaller() == &F && "Unexpected call!"); 14364d4ea9acSHuber, Joseph 14374d4ea9acSHuber, Joseph auto Remark = [&](OptimizationRemark OR) { 14384d4ea9acSHuber, Joseph return OR << "OpenMP runtime call " 14394d4ea9acSHuber, Joseph << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated"; 14404d4ea9acSHuber, Joseph }; 14412db182ffSJoseph Huber emitRemark<OptimizationRemark>(&F, "OpenMPRuntimeDeduplicated", Remark); 14424d4ea9acSHuber, Joseph 14439548b74aSJohannes Doerfert CGUpdater.removeCallSite(*CI); 14449548b74aSJohannes Doerfert CI->replaceAllUsesWith(ReplVal); 14459548b74aSJohannes Doerfert CI->eraseFromParent(); 14469548b74aSJohannes Doerfert ++NumOpenMPRuntimeCallsDeduplicated; 14479548b74aSJohannes Doerfert Changed = true; 14489548b74aSJohannes Doerfert return true; 14499548b74aSJohannes Doerfert }; 1450624d34afSJohannes Doerfert RFI.foreachUse(SCC, ReplaceAndDeleteCB); 14519548b74aSJohannes Doerfert 
14529548b74aSJohannes Doerfert return Changed; 14539548b74aSJohannes Doerfert } 14549548b74aSJohannes Doerfert 14559548b74aSJohannes Doerfert /// Collect arguments that represent the global thread id in \p GTIdArgs. 14569548b74aSJohannes Doerfert void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> >IdArgs) { 14579548b74aSJohannes Doerfert // TODO: Below we basically perform a fixpoint iteration with a pessimistic 14589548b74aSJohannes Doerfert // initialization. We could define an AbstractAttribute instead and 14599548b74aSJohannes Doerfert // run the Attributor here once it can be run as an SCC pass. 14609548b74aSJohannes Doerfert 14619548b74aSJohannes Doerfert // Helper to check the argument \p ArgNo at all call sites of \p F for 14629548b74aSJohannes Doerfert // a GTId. 14639548b74aSJohannes Doerfert auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) { 14649548b74aSJohannes Doerfert if (!F.hasLocalLinkage()) 14659548b74aSJohannes Doerfert return false; 14669548b74aSJohannes Doerfert for (Use &U : F.uses()) { 14679548b74aSJohannes Doerfert if (CallInst *CI = getCallIfRegularCall(U)) { 14689548b74aSJohannes Doerfert Value *ArgOp = CI->getArgOperand(ArgNo); 14699548b74aSJohannes Doerfert if (CI == &RefCI || GTIdArgs.count(ArgOp) || 14707cfd267cSsstefan1 getCallIfRegularCall( 14717cfd267cSsstefan1 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num])) 14729548b74aSJohannes Doerfert continue; 14739548b74aSJohannes Doerfert } 14749548b74aSJohannes Doerfert return false; 14759548b74aSJohannes Doerfert } 14769548b74aSJohannes Doerfert return true; 14779548b74aSJohannes Doerfert }; 14789548b74aSJohannes Doerfert 14799548b74aSJohannes Doerfert // Helper to identify uses of a GTId as GTId arguments. 
14809548b74aSJohannes Doerfert auto AddUserArgs = [&](Value >Id) { 14819548b74aSJohannes Doerfert for (Use &U : GTId.uses()) 14829548b74aSJohannes Doerfert if (CallInst *CI = dyn_cast<CallInst>(U.getUser())) 14839548b74aSJohannes Doerfert if (CI->isArgOperand(&U)) 14849548b74aSJohannes Doerfert if (Function *Callee = CI->getCalledFunction()) 14859548b74aSJohannes Doerfert if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI)) 14869548b74aSJohannes Doerfert GTIdArgs.insert(Callee->getArg(U.getOperandNo())); 14879548b74aSJohannes Doerfert }; 14889548b74aSJohannes Doerfert 14899548b74aSJohannes Doerfert // The argument users of __kmpc_global_thread_num calls are GTIds. 14907cfd267cSsstefan1 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI = 14917cfd267cSsstefan1 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]; 14927cfd267cSsstefan1 1493624d34afSJohannes Doerfert GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) { 14948855fec3SJohannes Doerfert if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI)) 14959548b74aSJohannes Doerfert AddUserArgs(*CI); 14968855fec3SJohannes Doerfert return false; 14978855fec3SJohannes Doerfert }); 14989548b74aSJohannes Doerfert 14999548b74aSJohannes Doerfert // Transitively search for more arguments by looking at the users of the 15009548b74aSJohannes Doerfert // ones we know already. During the search the GTIdArgs vector is extended 15019548b74aSJohannes Doerfert // so we cannot cache the size nor can we use a range based for. 15029548b74aSJohannes Doerfert for (unsigned u = 0; u < GTIdArgs.size(); ++u) 15039548b74aSJohannes Doerfert AddUserArgs(*GTIdArgs[u]); 15049548b74aSJohannes Doerfert } 15059548b74aSJohannes Doerfert 15065b0581aeSJohannes Doerfert /// Kernel (=GPU) optimizations and utility functions 15075b0581aeSJohannes Doerfert /// 15085b0581aeSJohannes Doerfert ///{{ 15095b0581aeSJohannes Doerfert 15105b0581aeSJohannes Doerfert /// Check if \p F is a kernel, hence entry point for target offloading. 
15115b0581aeSJohannes Doerfert bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); } 15125b0581aeSJohannes Doerfert 15135b0581aeSJohannes Doerfert /// Cache to remember the unique kernel for a function. 15145b0581aeSJohannes Doerfert DenseMap<Function *, Optional<Kernel>> UniqueKernelMap; 15155b0581aeSJohannes Doerfert 15165b0581aeSJohannes Doerfert /// Find the unique kernel that will execute \p F, if any. 15175b0581aeSJohannes Doerfert Kernel getUniqueKernelFor(Function &F); 15185b0581aeSJohannes Doerfert 15195b0581aeSJohannes Doerfert /// Find the unique kernel that will execute \p I, if any. 15205b0581aeSJohannes Doerfert Kernel getUniqueKernelFor(Instruction &I) { 15215b0581aeSJohannes Doerfert return getUniqueKernelFor(*I.getFunction()); 15225b0581aeSJohannes Doerfert } 15235b0581aeSJohannes Doerfert 15245b0581aeSJohannes Doerfert /// Rewrite the device (=GPU) code state machine create in non-SPMD mode in 15255b0581aeSJohannes Doerfert /// the cases we can avoid taking the address of a function. 15265b0581aeSJohannes Doerfert bool rewriteDeviceCodeStateMachine(); 15275b0581aeSJohannes Doerfert 15285b0581aeSJohannes Doerfert /// 15295b0581aeSJohannes Doerfert ///}} 15305b0581aeSJohannes Doerfert 15314d4ea9acSHuber, Joseph /// Emit a remark generically 15324d4ea9acSHuber, Joseph /// 15334d4ea9acSHuber, Joseph /// This template function can be used to generically emit a remark. 
The 15344d4ea9acSHuber, Joseph /// RemarkKind should be one of the following: 15354d4ea9acSHuber, Joseph /// - OptimizationRemark to indicate a successful optimization attempt 15364d4ea9acSHuber, Joseph /// - OptimizationRemarkMissed to report a failed optimization attempt 15374d4ea9acSHuber, Joseph /// - OptimizationRemarkAnalysis to provide additional information about an 15384d4ea9acSHuber, Joseph /// optimization attempt 15394d4ea9acSHuber, Joseph /// 15404d4ea9acSHuber, Joseph /// The remark is built using a callback function provided by the caller that 15414d4ea9acSHuber, Joseph /// takes a RemarkKind as input and returns a RemarkKind. 15422db182ffSJoseph Huber template <typename RemarkKind, typename RemarkCallBack> 15432db182ffSJoseph Huber void emitRemark(Instruction *I, StringRef RemarkName, 1544e8039ad4SJohannes Doerfert RemarkCallBack &&RemarkCB) const { 15452db182ffSJoseph Huber Function *F = I->getParent()->getParent(); 15464d4ea9acSHuber, Joseph auto &ORE = OREGetter(F); 15474d4ea9acSHuber, Joseph 15482db182ffSJoseph Huber ORE.emit([&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); }); 15494d4ea9acSHuber, Joseph } 15504d4ea9acSHuber, Joseph 15512db182ffSJoseph Huber /// Emit a remark on a function. 15522db182ffSJoseph Huber template <typename RemarkKind, typename RemarkCallBack> 15532db182ffSJoseph Huber void emitRemark(Function *F, StringRef RemarkName, 15542db182ffSJoseph Huber RemarkCallBack &&RemarkCB) const { 15550f426935Ssstefan1 auto &ORE = OREGetter(F); 15560f426935Ssstefan1 15572db182ffSJoseph Huber ORE.emit([&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); }); 15580f426935Ssstefan1 } 15590f426935Ssstefan1 1560b726c557SJohannes Doerfert /// The underlying module. 15619548b74aSJohannes Doerfert Module &M; 15629548b74aSJohannes Doerfert 15639548b74aSJohannes Doerfert /// The SCC we are operating on. 
1564ee17263aSJohannes Doerfert SmallVectorImpl<Function *> &SCC; 15659548b74aSJohannes Doerfert 15669548b74aSJohannes Doerfert /// Callback to update the call graph, the first argument is a removed call, 15679548b74aSJohannes Doerfert /// the second an optional replacement call. 15689548b74aSJohannes Doerfert CallGraphUpdater &CGUpdater; 15699548b74aSJohannes Doerfert 15704d4ea9acSHuber, Joseph /// Callback to get an OptimizationRemarkEmitter from a Function * 15714d4ea9acSHuber, Joseph OptimizationRemarkGetter OREGetter; 15724d4ea9acSHuber, Joseph 15737cfd267cSsstefan1 /// OpenMP-specific information cache. Also Used for Attributor runs. 15747cfd267cSsstefan1 OMPInformationCache &OMPInfoCache; 1575b8235d2bSsstefan1 1576b8235d2bSsstefan1 /// Attributor instance. 1577b8235d2bSsstefan1 Attributor &A; 1578b8235d2bSsstefan1 1579b8235d2bSsstefan1 /// Helper function to run Attributor on SCC. 1580b8235d2bSsstefan1 bool runAttributor() { 1581b8235d2bSsstefan1 if (SCC.empty()) 1582b8235d2bSsstefan1 return false; 1583b8235d2bSsstefan1 1584b8235d2bSsstefan1 registerAAs(); 1585b8235d2bSsstefan1 1586b8235d2bSsstefan1 ChangeStatus Changed = A.run(); 1587b8235d2bSsstefan1 1588b8235d2bSsstefan1 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size() 1589b8235d2bSsstefan1 << " functions, result: " << Changed << ".\n"); 1590b8235d2bSsstefan1 1591b8235d2bSsstefan1 return Changed == ChangeStatus::CHANGED; 1592b8235d2bSsstefan1 } 1593b8235d2bSsstefan1 1594b8235d2bSsstefan1 /// Populate the Attributor with abstract attribute opportunities in the 1595b8235d2bSsstefan1 /// function. 1596b8235d2bSsstefan1 void registerAAs() { 15975dfd7cc4Ssstefan1 if (SCC.empty()) 15985dfd7cc4Ssstefan1 return; 1599b8235d2bSsstefan1 16005dfd7cc4Ssstefan1 // Create CallSite AA for all Getters. 
16015dfd7cc4Ssstefan1 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) { 16025dfd7cc4Ssstefan1 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)]; 16035dfd7cc4Ssstefan1 16045dfd7cc4Ssstefan1 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter]; 16055dfd7cc4Ssstefan1 16065dfd7cc4Ssstefan1 auto CreateAA = [&](Use &U, Function &Caller) { 16075dfd7cc4Ssstefan1 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI); 16085dfd7cc4Ssstefan1 if (!CI) 16095dfd7cc4Ssstefan1 return false; 16105dfd7cc4Ssstefan1 16115dfd7cc4Ssstefan1 auto &CB = cast<CallBase>(*CI); 16125dfd7cc4Ssstefan1 16135dfd7cc4Ssstefan1 IRPosition CBPos = IRPosition::callsite_function(CB); 16145dfd7cc4Ssstefan1 A.getOrCreateAAFor<AAICVTracker>(CBPos); 16155dfd7cc4Ssstefan1 return false; 16165dfd7cc4Ssstefan1 }; 16175dfd7cc4Ssstefan1 16185dfd7cc4Ssstefan1 GetterRFI.foreachUse(SCC, CreateAA); 1619b8235d2bSsstefan1 } 16206fc51c9fSJoseph Huber auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; 16216fc51c9fSJoseph Huber auto CreateAA = [&](Use &U, Function &F) { 16226fc51c9fSJoseph Huber A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F)); 16236fc51c9fSJoseph Huber return false; 16246fc51c9fSJoseph Huber }; 16256fc51c9fSJoseph Huber GlobalizationRFI.foreachUse(SCC, CreateAA); 162618283125SJoseph Huber 16277d69da71SJoseph Huber // Create an ExecutionDomain AA for every function and a HeapToStack AA for 16287d69da71SJoseph Huber // every function if there is a device kernel. 
162903d7e61cSJoseph Huber for (auto *F : SCC) { 163003d7e61cSJoseph Huber if (!F->isDeclaration()) 163103d7e61cSJoseph Huber A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F)); 16325ccb7424SJoseph Huber if (isOpenMPDevice(M)) 16337d69da71SJoseph Huber A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F)); 163418283125SJoseph Huber } 1635b8235d2bSsstefan1 } 1636b8235d2bSsstefan1 }; 1637b8235d2bSsstefan1 16385b0581aeSJohannes Doerfert Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { 16395b0581aeSJohannes Doerfert if (!OMPInfoCache.ModuleSlice.count(&F)) 16405b0581aeSJohannes Doerfert return nullptr; 16415b0581aeSJohannes Doerfert 16425b0581aeSJohannes Doerfert // Use a scope to keep the lifetime of the CachedKernel short. 16435b0581aeSJohannes Doerfert { 16445b0581aeSJohannes Doerfert Optional<Kernel> &CachedKernel = UniqueKernelMap[&F]; 16455b0581aeSJohannes Doerfert if (CachedKernel) 16465b0581aeSJohannes Doerfert return *CachedKernel; 16475b0581aeSJohannes Doerfert 16485b0581aeSJohannes Doerfert // TODO: We should use an AA to create an (optimistic and callback 16495b0581aeSJohannes Doerfert // call-aware) call graph. For now we stick to simple patterns that 16505b0581aeSJohannes Doerfert // are less powerful, basically the worst fixpoint. 
16515b0581aeSJohannes Doerfert if (isKernel(F)) { 16525b0581aeSJohannes Doerfert CachedKernel = Kernel(&F); 16535b0581aeSJohannes Doerfert return *CachedKernel; 16545b0581aeSJohannes Doerfert } 16555b0581aeSJohannes Doerfert 16565b0581aeSJohannes Doerfert CachedKernel = nullptr; 1657994bb6ebSJohannes Doerfert if (!F.hasLocalLinkage()) { 1658994bb6ebSJohannes Doerfert 1659994bb6ebSJohannes Doerfert // See https://openmp.llvm.org/remarks/OptimizationRemarks.html 16602db182ffSJoseph Huber auto Remark = [&](OptimizationRemarkAnalysis ORA) { 16612db182ffSJoseph Huber return ORA 16622db182ffSJoseph Huber << "[OMP100] Potentially unknown OpenMP target region caller"; 1663994bb6ebSJohannes Doerfert }; 16642db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark); 1665994bb6ebSJohannes Doerfert 16665b0581aeSJohannes Doerfert return nullptr; 16675b0581aeSJohannes Doerfert } 1668994bb6ebSJohannes Doerfert } 16695b0581aeSJohannes Doerfert 16705b0581aeSJohannes Doerfert auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel { 16715b0581aeSJohannes Doerfert if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) { 16725b0581aeSJohannes Doerfert // Allow use in equality comparisons. 16735b0581aeSJohannes Doerfert if (Cmp->isEquality()) 16745b0581aeSJohannes Doerfert return getUniqueKernelFor(*Cmp); 16755b0581aeSJohannes Doerfert return nullptr; 16765b0581aeSJohannes Doerfert } 16775b0581aeSJohannes Doerfert if (auto *CB = dyn_cast<CallBase>(U.getUser())) { 16785b0581aeSJohannes Doerfert // Allow direct calls. 16795b0581aeSJohannes Doerfert if (CB->isCallee(&U)) 16805b0581aeSJohannes Doerfert return getUniqueKernelFor(*CB); 1681a2dbfb6bSGiorgis Georgakoudis 1682a2dbfb6bSGiorgis Georgakoudis OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = 1683a2dbfb6bSGiorgis Georgakoudis OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; 1684a2dbfb6bSGiorgis Georgakoudis // Allow the use in __kmpc_parallel_51 calls. 
1685a2dbfb6bSGiorgis Georgakoudis if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI)) 16865b0581aeSJohannes Doerfert return getUniqueKernelFor(*CB); 16875b0581aeSJohannes Doerfert return nullptr; 16885b0581aeSJohannes Doerfert } 16895b0581aeSJohannes Doerfert // Disallow every other use. 16905b0581aeSJohannes Doerfert return nullptr; 16915b0581aeSJohannes Doerfert }; 16925b0581aeSJohannes Doerfert 16935b0581aeSJohannes Doerfert // TODO: In the future we want to track more than just a unique kernel. 16945b0581aeSJohannes Doerfert SmallPtrSet<Kernel, 2> PotentialKernels; 16958d8ce85bSsstefan1 OMPInformationCache::foreachUse(F, [&](const Use &U) { 16965b0581aeSJohannes Doerfert PotentialKernels.insert(GetUniqueKernelForUse(U)); 16975b0581aeSJohannes Doerfert }); 16985b0581aeSJohannes Doerfert 16995b0581aeSJohannes Doerfert Kernel K = nullptr; 17005b0581aeSJohannes Doerfert if (PotentialKernels.size() == 1) 17015b0581aeSJohannes Doerfert K = *PotentialKernels.begin(); 17025b0581aeSJohannes Doerfert 17035b0581aeSJohannes Doerfert // Cache the result. 17045b0581aeSJohannes Doerfert UniqueKernelMap[&F] = K; 17055b0581aeSJohannes Doerfert 17065b0581aeSJohannes Doerfert return K; 17075b0581aeSJohannes Doerfert } 17085b0581aeSJohannes Doerfert 17095b0581aeSJohannes Doerfert bool OpenMPOpt::rewriteDeviceCodeStateMachine() { 1710a2dbfb6bSGiorgis Georgakoudis OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = 1711a2dbfb6bSGiorgis Georgakoudis OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; 17125b0581aeSJohannes Doerfert 17135b0581aeSJohannes Doerfert bool Changed = false; 1714a2dbfb6bSGiorgis Georgakoudis if (!KernelParallelRFI) 17155b0581aeSJohannes Doerfert return Changed; 17165b0581aeSJohannes Doerfert 17175b0581aeSJohannes Doerfert for (Function *F : SCC) { 17185b0581aeSJohannes Doerfert 1719a2dbfb6bSGiorgis Georgakoudis // Check if the function is a use in a __kmpc_parallel_51 call at 17205b0581aeSJohannes Doerfert // all. 
17215b0581aeSJohannes Doerfert bool UnknownUse = false; 1722a2dbfb6bSGiorgis Georgakoudis bool KernelParallelUse = false; 17235b0581aeSJohannes Doerfert unsigned NumDirectCalls = 0; 17245b0581aeSJohannes Doerfert 17255b0581aeSJohannes Doerfert SmallVector<Use *, 2> ToBeReplacedStateMachineUses; 17268d8ce85bSsstefan1 OMPInformationCache::foreachUse(*F, [&](Use &U) { 17275b0581aeSJohannes Doerfert if (auto *CB = dyn_cast<CallBase>(U.getUser())) 17285b0581aeSJohannes Doerfert if (CB->isCallee(&U)) { 17295b0581aeSJohannes Doerfert ++NumDirectCalls; 17305b0581aeSJohannes Doerfert return; 17315b0581aeSJohannes Doerfert } 17325b0581aeSJohannes Doerfert 173381db6144SMichael Liao if (isa<ICmpInst>(U.getUser())) { 17345b0581aeSJohannes Doerfert ToBeReplacedStateMachineUses.push_back(&U); 17355b0581aeSJohannes Doerfert return; 17365b0581aeSJohannes Doerfert } 1737a2dbfb6bSGiorgis Georgakoudis 1738a2dbfb6bSGiorgis Georgakoudis // Find wrapper functions that represent parallel kernels. 1739a2dbfb6bSGiorgis Georgakoudis CallInst *CI = 1740a2dbfb6bSGiorgis Georgakoudis OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI); 1741a2dbfb6bSGiorgis Georgakoudis const unsigned int WrapperFunctionArgNo = 6; 1742a2dbfb6bSGiorgis Georgakoudis if (!KernelParallelUse && CI && 1743a2dbfb6bSGiorgis Georgakoudis CI->getArgOperandNo(&U) == WrapperFunctionArgNo) { 1744a2dbfb6bSGiorgis Georgakoudis KernelParallelUse = true; 17455b0581aeSJohannes Doerfert ToBeReplacedStateMachineUses.push_back(&U); 17465b0581aeSJohannes Doerfert return; 17475b0581aeSJohannes Doerfert } 17485b0581aeSJohannes Doerfert UnknownUse = true; 17495b0581aeSJohannes Doerfert }); 17505b0581aeSJohannes Doerfert 1751a2dbfb6bSGiorgis Georgakoudis // Do not emit a remark if we haven't seen a __kmpc_parallel_51 1752fec1f210SJohannes Doerfert // use. 
1753a2dbfb6bSGiorgis Georgakoudis if (!KernelParallelUse) 17545b0581aeSJohannes Doerfert continue; 17555b0581aeSJohannes Doerfert 1756fec1f210SJohannes Doerfert { 17572db182ffSJoseph Huber auto Remark = [&](OptimizationRemarkAnalysis ORA) { 17582db182ffSJoseph Huber return ORA << "Found a parallel region that is called in a target " 1759fec1f210SJohannes Doerfert "region but not part of a combined target construct nor " 1760a2dbfb6bSGiorgis Georgakoudis "nested inside a target construct without intermediate " 1761fec1f210SJohannes Doerfert "code. This can lead to excessive register usage for " 1762fec1f210SJohannes Doerfert "unrelated target regions in the same translation unit " 1763fec1f210SJohannes Doerfert "due to spurious call edges assumed by ptxas."; 1764fec1f210SJohannes Doerfert }; 17652db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPParallelRegionInNonSPMD", 17662db182ffSJoseph Huber Remark); 1767fec1f210SJohannes Doerfert } 1768fec1f210SJohannes Doerfert 1769fec1f210SJohannes Doerfert // If this ever hits, we should investigate. 1770fec1f210SJohannes Doerfert // TODO: Checking the number of uses is not a necessary restriction and 1771fec1f210SJohannes Doerfert // should be lifted. 1772fec1f210SJohannes Doerfert if (UnknownUse || NumDirectCalls != 1 || 1773fec1f210SJohannes Doerfert ToBeReplacedStateMachineUses.size() != 2) { 1774fec1f210SJohannes Doerfert { 17752db182ffSJoseph Huber auto Remark = [&](OptimizationRemarkAnalysis ORA) { 17762db182ffSJoseph Huber return ORA << "Parallel region is used in " 1777fec1f210SJohannes Doerfert << (UnknownUse ? 
"unknown" : "unexpected") 1778fec1f210SJohannes Doerfert << " ways; will not attempt to rewrite the state machine."; 1779fec1f210SJohannes Doerfert }; 17802db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>( 17812db182ffSJoseph Huber F, "OpenMPParallelRegionInNonSPMD", Remark); 1782fec1f210SJohannes Doerfert } 17835b0581aeSJohannes Doerfert continue; 1784fec1f210SJohannes Doerfert } 17855b0581aeSJohannes Doerfert 1786a2dbfb6bSGiorgis Georgakoudis // Even if we have __kmpc_parallel_51 calls, we (for now) give 17875b0581aeSJohannes Doerfert // up if the function is not called from a unique kernel. 17885b0581aeSJohannes Doerfert Kernel K = getUniqueKernelFor(*F); 1789fec1f210SJohannes Doerfert if (!K) { 1790fec1f210SJohannes Doerfert { 17912db182ffSJoseph Huber auto Remark = [&](OptimizationRemarkAnalysis ORA) { 17922db182ffSJoseph Huber return ORA << "Parallel region is not known to be called from a " 1793fec1f210SJohannes Doerfert "unique single target region, maybe the surrounding " 1794fec1f210SJohannes Doerfert "function has external linkage?; will not attempt to " 1795fec1f210SJohannes Doerfert "rewrite the state machine use."; 1796fec1f210SJohannes Doerfert }; 17972db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>( 17982db182ffSJoseph Huber F, "OpenMPParallelRegionInMultipleKernesl", Remark); 1799fec1f210SJohannes Doerfert } 18005b0581aeSJohannes Doerfert continue; 1801fec1f210SJohannes Doerfert } 18025b0581aeSJohannes Doerfert 18035b0581aeSJohannes Doerfert // We now know F is a parallel body function called only from the kernel K. 18045b0581aeSJohannes Doerfert // We also identified the state machine uses in which we replace the 18055b0581aeSJohannes Doerfert // function pointer by a new global symbol for identification purposes. This 18065b0581aeSJohannes Doerfert // ensures only direct calls to the function are left. 
18075b0581aeSJohannes Doerfert 1808fec1f210SJohannes Doerfert { 18092db182ffSJoseph Huber auto RemarkParalleRegion = [&](OptimizationRemarkAnalysis ORA) { 18102db182ffSJoseph Huber return ORA << "Specialize parallel region that is only reached from a " 1811fec1f210SJohannes Doerfert "single target region to avoid spurious call edges and " 1812fec1f210SJohannes Doerfert "excessive register usage in other target regions. " 1813fec1f210SJohannes Doerfert "(parallel region ID: " 1814fec1f210SJohannes Doerfert << ore::NV("OpenMPParallelRegion", F->getName()) 1815fec1f210SJohannes Doerfert << ", kernel ID: " 1816fec1f210SJohannes Doerfert << ore::NV("OpenMPTargetRegion", K->getName()) << ")"; 1817fec1f210SJohannes Doerfert }; 18182db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPParallelRegionInNonSPMD", 1819fec1f210SJohannes Doerfert RemarkParalleRegion); 18202db182ffSJoseph Huber auto RemarkKernel = [&](OptimizationRemarkAnalysis ORA) { 18212db182ffSJoseph Huber return ORA << "Target region containing the parallel region that is " 1822fec1f210SJohannes Doerfert "specialized. 
(parallel region ID: " 1823fec1f210SJohannes Doerfert << ore::NV("OpenMPParallelRegion", F->getName()) 1824fec1f210SJohannes Doerfert << ", kernel ID: " 1825fec1f210SJohannes Doerfert << ore::NV("OpenMPTargetRegion", K->getName()) << ")"; 1826fec1f210SJohannes Doerfert }; 18272db182ffSJoseph Huber emitRemark<OptimizationRemarkAnalysis>(K, "OpenMPParallelRegionInNonSPMD", 18282db182ffSJoseph Huber RemarkKernel); 1829fec1f210SJohannes Doerfert } 1830fec1f210SJohannes Doerfert 18315b0581aeSJohannes Doerfert Module &M = *F->getParent(); 18325b0581aeSJohannes Doerfert Type *Int8Ty = Type::getInt8Ty(M.getContext()); 18335b0581aeSJohannes Doerfert 18345b0581aeSJohannes Doerfert auto *ID = new GlobalVariable( 18355b0581aeSJohannes Doerfert M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage, 18365b0581aeSJohannes Doerfert UndefValue::get(Int8Ty), F->getName() + ".ID"); 18375b0581aeSJohannes Doerfert 18385b0581aeSJohannes Doerfert for (Use *U : ToBeReplacedStateMachineUses) 18395b0581aeSJohannes Doerfert U->set(ConstantExpr::getBitCast(ID, U->get()->getType())); 18405b0581aeSJohannes Doerfert 18415b0581aeSJohannes Doerfert ++NumOpenMPParallelRegionsReplacedInGPUStateMachine; 18425b0581aeSJohannes Doerfert 18435b0581aeSJohannes Doerfert Changed = true; 18445b0581aeSJohannes Doerfert } 18455b0581aeSJohannes Doerfert 18465b0581aeSJohannes Doerfert return Changed; 18475b0581aeSJohannes Doerfert } 18485b0581aeSJohannes Doerfert 1849b8235d2bSsstefan1 /// Abstract Attribute for tracking ICV values. 
1850b8235d2bSsstefan1 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> { 1851b8235d2bSsstefan1 using Base = StateWrapper<BooleanState, AbstractAttribute>; 1852b8235d2bSsstefan1 AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {} 1853b8235d2bSsstefan1 18545dfd7cc4Ssstefan1 void initialize(Attributor &A) override { 18555dfd7cc4Ssstefan1 Function *F = getAnchorScope(); 18565dfd7cc4Ssstefan1 if (!F || !A.isFunctionIPOAmendable(*F)) 18575dfd7cc4Ssstefan1 indicatePessimisticFixpoint(); 18585dfd7cc4Ssstefan1 } 18595dfd7cc4Ssstefan1 1860b8235d2bSsstefan1 /// Returns true if value is assumed to be tracked. 1861b8235d2bSsstefan1 bool isAssumedTracked() const { return getAssumed(); } 1862b8235d2bSsstefan1 1863b8235d2bSsstefan1 /// Returns true if value is known to be tracked. 1864b8235d2bSsstefan1 bool isKnownTracked() const { return getAssumed(); } 1865b8235d2bSsstefan1 1866b8235d2bSsstefan1 /// Create an abstract attribute biew for the position \p IRP. 1867b8235d2bSsstefan1 static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A); 1868b8235d2bSsstefan1 1869b8235d2bSsstefan1 /// Return the value with which \p I can be replaced for specific \p ICV. 18705dfd7cc4Ssstefan1 virtual Optional<Value *> getReplacementValue(InternalControlVar ICV, 18715dfd7cc4Ssstefan1 const Instruction *I, 18725dfd7cc4Ssstefan1 Attributor &A) const { 18735dfd7cc4Ssstefan1 return None; 18745dfd7cc4Ssstefan1 } 18755dfd7cc4Ssstefan1 18765dfd7cc4Ssstefan1 /// Return an assumed unique ICV value if a single candidate is found. If 18775dfd7cc4Ssstefan1 /// there cannot be one, return a nullptr. If it is not clear yet, return the 18785dfd7cc4Ssstefan1 /// Optional::NoneType. 18795dfd7cc4Ssstefan1 virtual Optional<Value *> 18805dfd7cc4Ssstefan1 getUniqueReplacementValue(InternalControlVar ICV) const = 0; 18815dfd7cc4Ssstefan1 18825dfd7cc4Ssstefan1 // Currently only nthreads is being tracked. 
18835dfd7cc4Ssstefan1 // this array will only grow with time. 18845dfd7cc4Ssstefan1 InternalControlVar TrackableICVs[1] = {ICV_nthreads}; 1885b8235d2bSsstefan1 1886b8235d2bSsstefan1 /// See AbstractAttribute::getName() 1887b8235d2bSsstefan1 const std::string getName() const override { return "AAICVTracker"; } 1888b8235d2bSsstefan1 1889233af895SLuofan Chen /// See AbstractAttribute::getIdAddr() 1890233af895SLuofan Chen const char *getIdAddr() const override { return &ID; } 1891233af895SLuofan Chen 1892233af895SLuofan Chen /// This function should return true if the type of the \p AA is AAICVTracker 1893233af895SLuofan Chen static bool classof(const AbstractAttribute *AA) { 1894233af895SLuofan Chen return (AA->getIdAddr() == &ID); 1895233af895SLuofan Chen } 1896233af895SLuofan Chen 1897b8235d2bSsstefan1 static const char ID; 1898b8235d2bSsstefan1 }; 1899b8235d2bSsstefan1 1900b8235d2bSsstefan1 struct AAICVTrackerFunction : public AAICVTracker { 1901b8235d2bSsstefan1 AAICVTrackerFunction(const IRPosition &IRP, Attributor &A) 1902b8235d2bSsstefan1 : AAICVTracker(IRP, A) {} 1903b8235d2bSsstefan1 1904b8235d2bSsstefan1 // FIXME: come up with better string. 19055dfd7cc4Ssstefan1 const std::string getAsStr() const override { return "ICVTrackerFunction"; } 1906b8235d2bSsstefan1 1907b8235d2bSsstefan1 // FIXME: come up with some stats. 1908b8235d2bSsstefan1 void trackStatistics() const override {} 1909b8235d2bSsstefan1 19105dfd7cc4Ssstefan1 /// We don't manifest anything for this AA. 1911b8235d2bSsstefan1 ChangeStatus manifest(Attributor &A) override { 19125dfd7cc4Ssstefan1 return ChangeStatus::UNCHANGED; 1913b8235d2bSsstefan1 } 1914b8235d2bSsstefan1 1915b8235d2bSsstefan1 // Map of ICV to their values at specific program point. 
19165dfd7cc4Ssstefan1 EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar, 1917b8235d2bSsstefan1 InternalControlVar::ICV___last> 19185dfd7cc4Ssstefan1 ICVReplacementValuesMap; 1919b8235d2bSsstefan1 1920b8235d2bSsstefan1 ChangeStatus updateImpl(Attributor &A) override { 1921b8235d2bSsstefan1 ChangeStatus HasChanged = ChangeStatus::UNCHANGED; 1922b8235d2bSsstefan1 1923b8235d2bSsstefan1 Function *F = getAnchorScope(); 1924b8235d2bSsstefan1 1925b8235d2bSsstefan1 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); 1926b8235d2bSsstefan1 1927b8235d2bSsstefan1 for (InternalControlVar ICV : TrackableICVs) { 1928b8235d2bSsstefan1 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter]; 1929b8235d2bSsstefan1 19305dfd7cc4Ssstefan1 auto &ValuesMap = ICVReplacementValuesMap[ICV]; 1931b8235d2bSsstefan1 auto TrackValues = [&](Use &U, Function &) { 1932b8235d2bSsstefan1 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U); 1933b8235d2bSsstefan1 if (!CI) 1934b8235d2bSsstefan1 return false; 1935b8235d2bSsstefan1 1936b8235d2bSsstefan1 // FIXME: handle setters with more that 1 arguments. 1937b8235d2bSsstefan1 /// Track new value. 19385dfd7cc4Ssstefan1 if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second) 1939b8235d2bSsstefan1 HasChanged = ChangeStatus::CHANGED; 1940b8235d2bSsstefan1 1941b8235d2bSsstefan1 return false; 1942b8235d2bSsstefan1 }; 1943b8235d2bSsstefan1 19445dfd7cc4Ssstefan1 auto CallCheck = [&](Instruction &I) { 19455dfd7cc4Ssstefan1 Optional<Value *> ReplVal = getValueForCall(A, &I, ICV); 19465dfd7cc4Ssstefan1 if (ReplVal.hasValue() && 19475dfd7cc4Ssstefan1 ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) 19485dfd7cc4Ssstefan1 HasChanged = ChangeStatus::CHANGED; 19495dfd7cc4Ssstefan1 19505dfd7cc4Ssstefan1 return true; 19515dfd7cc4Ssstefan1 }; 19525dfd7cc4Ssstefan1 19535dfd7cc4Ssstefan1 // Track all changes of an ICV. 
1954b8235d2bSsstefan1 SetterRFI.foreachUse(TrackValues, F); 19555dfd7cc4Ssstefan1 19565dfd7cc4Ssstefan1 A.checkForAllInstructions(CallCheck, *this, {Instruction::Call}, 19575dfd7cc4Ssstefan1 /* CheckBBLivenessOnly */ true); 19585dfd7cc4Ssstefan1 19595dfd7cc4Ssstefan1 /// TODO: Figure out a way to avoid adding entry in 19605dfd7cc4Ssstefan1 /// ICVReplacementValuesMap 19615dfd7cc4Ssstefan1 Instruction *Entry = &F->getEntryBlock().front(); 19625dfd7cc4Ssstefan1 if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry)) 19635dfd7cc4Ssstefan1 ValuesMap.insert(std::make_pair(Entry, nullptr)); 1964b8235d2bSsstefan1 } 1965b8235d2bSsstefan1 1966b8235d2bSsstefan1 return HasChanged; 1967b8235d2bSsstefan1 } 1968b8235d2bSsstefan1 19695dfd7cc4Ssstefan1 /// Hepler to check if \p I is a call and get the value for it if it is 19705dfd7cc4Ssstefan1 /// unique. 19715dfd7cc4Ssstefan1 Optional<Value *> getValueForCall(Attributor &A, const Instruction *I, 19725dfd7cc4Ssstefan1 InternalControlVar &ICV) const { 1973b8235d2bSsstefan1 19745dfd7cc4Ssstefan1 const auto *CB = dyn_cast<CallBase>(I); 1975dcaec812SJohannes Doerfert if (!CB || CB->hasFnAttr("no_openmp") || 1976dcaec812SJohannes Doerfert CB->hasFnAttr("no_openmp_routines")) 19775dfd7cc4Ssstefan1 return None; 19785dfd7cc4Ssstefan1 1979b8235d2bSsstefan1 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); 1980b8235d2bSsstefan1 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter]; 19815dfd7cc4Ssstefan1 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter]; 19825dfd7cc4Ssstefan1 Function *CalledFunction = CB->getCalledFunction(); 1983b8235d2bSsstefan1 19844eef14f9SWei Wang // Indirect call, assume ICV changes. 
19854eef14f9SWei Wang if (CalledFunction == nullptr) 19864eef14f9SWei Wang return nullptr; 19875dfd7cc4Ssstefan1 if (CalledFunction == GetterRFI.Declaration) 19885dfd7cc4Ssstefan1 return None; 19895dfd7cc4Ssstefan1 if (CalledFunction == SetterRFI.Declaration) { 19905dfd7cc4Ssstefan1 if (ICVReplacementValuesMap[ICV].count(I)) 19915dfd7cc4Ssstefan1 return ICVReplacementValuesMap[ICV].lookup(I); 19925dfd7cc4Ssstefan1 19935dfd7cc4Ssstefan1 return nullptr; 19945dfd7cc4Ssstefan1 } 19955dfd7cc4Ssstefan1 19965dfd7cc4Ssstefan1 // Since we don't know, assume it changes the ICV. 19975dfd7cc4Ssstefan1 if (CalledFunction->isDeclaration()) 19985dfd7cc4Ssstefan1 return nullptr; 19995dfd7cc4Ssstefan1 20005b70c12fSJohannes Doerfert const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( 20015b70c12fSJohannes Doerfert *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED); 20025dfd7cc4Ssstefan1 20035dfd7cc4Ssstefan1 if (ICVTrackingAA.isAssumedTracked()) 20045dfd7cc4Ssstefan1 return ICVTrackingAA.getUniqueReplacementValue(ICV); 20055dfd7cc4Ssstefan1 20065dfd7cc4Ssstefan1 // If we don't know, assume it changes. 20075dfd7cc4Ssstefan1 return nullptr; 20085dfd7cc4Ssstefan1 } 20095dfd7cc4Ssstefan1 20105dfd7cc4Ssstefan1 // We don't check unique value for a function, so return None. 20115dfd7cc4Ssstefan1 Optional<Value *> 20125dfd7cc4Ssstefan1 getUniqueReplacementValue(InternalControlVar ICV) const override { 20135dfd7cc4Ssstefan1 return None; 20145dfd7cc4Ssstefan1 } 20155dfd7cc4Ssstefan1 20165dfd7cc4Ssstefan1 /// Return the value with which \p I can be replaced for specific \p ICV. 
20175dfd7cc4Ssstefan1 Optional<Value *> getReplacementValue(InternalControlVar ICV, 20185dfd7cc4Ssstefan1 const Instruction *I, 20195dfd7cc4Ssstefan1 Attributor &A) const override { 20205dfd7cc4Ssstefan1 const auto &ValuesMap = ICVReplacementValuesMap[ICV]; 20215dfd7cc4Ssstefan1 if (ValuesMap.count(I)) 20225dfd7cc4Ssstefan1 return ValuesMap.lookup(I); 20235dfd7cc4Ssstefan1 20245dfd7cc4Ssstefan1 SmallVector<const Instruction *, 16> Worklist; 20255dfd7cc4Ssstefan1 SmallPtrSet<const Instruction *, 16> Visited; 20265dfd7cc4Ssstefan1 Worklist.push_back(I); 20275dfd7cc4Ssstefan1 20285dfd7cc4Ssstefan1 Optional<Value *> ReplVal; 20295dfd7cc4Ssstefan1 20305dfd7cc4Ssstefan1 while (!Worklist.empty()) { 20315dfd7cc4Ssstefan1 const Instruction *CurrInst = Worklist.pop_back_val(); 20325dfd7cc4Ssstefan1 if (!Visited.insert(CurrInst).second) 2033b8235d2bSsstefan1 continue; 2034b8235d2bSsstefan1 20355dfd7cc4Ssstefan1 const BasicBlock *CurrBB = CurrInst->getParent(); 20365dfd7cc4Ssstefan1 20375dfd7cc4Ssstefan1 // Go up and look for all potential setters/calls that might change the 20385dfd7cc4Ssstefan1 // ICV. 20395dfd7cc4Ssstefan1 while ((CurrInst = CurrInst->getPrevNode())) { 20405dfd7cc4Ssstefan1 if (ValuesMap.count(CurrInst)) { 20415dfd7cc4Ssstefan1 Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst); 20425dfd7cc4Ssstefan1 // Unknown value, track new. 20435dfd7cc4Ssstefan1 if (!ReplVal.hasValue()) { 20445dfd7cc4Ssstefan1 ReplVal = NewReplVal; 20455dfd7cc4Ssstefan1 break; 20465dfd7cc4Ssstefan1 } 20475dfd7cc4Ssstefan1 20485dfd7cc4Ssstefan1 // If we found a new value, we can't know the icv value anymore. 
20495dfd7cc4Ssstefan1 if (NewReplVal.hasValue()) 20505dfd7cc4Ssstefan1 if (ReplVal != NewReplVal) 2051b8235d2bSsstefan1 return nullptr; 2052b8235d2bSsstefan1 20535dfd7cc4Ssstefan1 break; 2054b8235d2bSsstefan1 } 2055b8235d2bSsstefan1 20565dfd7cc4Ssstefan1 Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV); 20575dfd7cc4Ssstefan1 if (!NewReplVal.hasValue()) 20585dfd7cc4Ssstefan1 continue; 20595dfd7cc4Ssstefan1 20605dfd7cc4Ssstefan1 // Unknown value, track new. 20615dfd7cc4Ssstefan1 if (!ReplVal.hasValue()) { 20625dfd7cc4Ssstefan1 ReplVal = NewReplVal; 20635dfd7cc4Ssstefan1 break; 2064b8235d2bSsstefan1 } 2065b8235d2bSsstefan1 20665dfd7cc4Ssstefan1 // if (NewReplVal.hasValue()) 20675dfd7cc4Ssstefan1 // We found a new value, we can't know the icv value anymore. 20685dfd7cc4Ssstefan1 if (ReplVal != NewReplVal) 2069b8235d2bSsstefan1 return nullptr; 2070b8235d2bSsstefan1 } 20715dfd7cc4Ssstefan1 20725dfd7cc4Ssstefan1 // If we are in the same BB and we have a value, we are done. 20735dfd7cc4Ssstefan1 if (CurrBB == I->getParent() && ReplVal.hasValue()) 20745dfd7cc4Ssstefan1 return ReplVal; 20755dfd7cc4Ssstefan1 20765dfd7cc4Ssstefan1 // Go through all predecessors and add terminators for analysis. 20775dfd7cc4Ssstefan1 for (const BasicBlock *Pred : predecessors(CurrBB)) 20785dfd7cc4Ssstefan1 if (const Instruction *Terminator = Pred->getTerminator()) 20795dfd7cc4Ssstefan1 Worklist.push_back(Terminator); 20805dfd7cc4Ssstefan1 } 20815dfd7cc4Ssstefan1 20825dfd7cc4Ssstefan1 return ReplVal; 20835dfd7cc4Ssstefan1 } 20845dfd7cc4Ssstefan1 }; 20855dfd7cc4Ssstefan1 20865dfd7cc4Ssstefan1 struct AAICVTrackerFunctionReturned : AAICVTracker { 20875dfd7cc4Ssstefan1 AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A) 20885dfd7cc4Ssstefan1 : AAICVTracker(IRP, A) {} 20895dfd7cc4Ssstefan1 20905dfd7cc4Ssstefan1 // FIXME: come up with better string. 
20915dfd7cc4Ssstefan1 const std::string getAsStr() const override { 20925dfd7cc4Ssstefan1 return "ICVTrackerFunctionReturned"; 20935dfd7cc4Ssstefan1 } 20945dfd7cc4Ssstefan1 20955dfd7cc4Ssstefan1 // FIXME: come up with some stats. 20965dfd7cc4Ssstefan1 void trackStatistics() const override {} 20975dfd7cc4Ssstefan1 20985dfd7cc4Ssstefan1 /// We don't manifest anything for this AA. 20995dfd7cc4Ssstefan1 ChangeStatus manifest(Attributor &A) override { 21005dfd7cc4Ssstefan1 return ChangeStatus::UNCHANGED; 21015dfd7cc4Ssstefan1 } 21025dfd7cc4Ssstefan1 21035dfd7cc4Ssstefan1 // Map of ICV to their values at specific program point. 21045dfd7cc4Ssstefan1 EnumeratedArray<Optional<Value *>, InternalControlVar, 21055dfd7cc4Ssstefan1 InternalControlVar::ICV___last> 21065dfd7cc4Ssstefan1 ICVReplacementValuesMap; 21075dfd7cc4Ssstefan1 21085dfd7cc4Ssstefan1 /// Return the value with which \p I can be replaced for specific \p ICV. 21095dfd7cc4Ssstefan1 Optional<Value *> 21105dfd7cc4Ssstefan1 getUniqueReplacementValue(InternalControlVar ICV) const override { 21115dfd7cc4Ssstefan1 return ICVReplacementValuesMap[ICV]; 21125dfd7cc4Ssstefan1 } 21135dfd7cc4Ssstefan1 21145dfd7cc4Ssstefan1 ChangeStatus updateImpl(Attributor &A) override { 21155dfd7cc4Ssstefan1 ChangeStatus Changed = ChangeStatus::UNCHANGED; 21165dfd7cc4Ssstefan1 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( 21175b70c12fSJohannes Doerfert *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); 21185dfd7cc4Ssstefan1 21195dfd7cc4Ssstefan1 if (!ICVTrackingAA.isAssumedTracked()) 21205dfd7cc4Ssstefan1 return indicatePessimisticFixpoint(); 21215dfd7cc4Ssstefan1 21225dfd7cc4Ssstefan1 for (InternalControlVar ICV : TrackableICVs) { 21235dfd7cc4Ssstefan1 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; 21245dfd7cc4Ssstefan1 Optional<Value *> UniqueICVValue; 21255dfd7cc4Ssstefan1 21265dfd7cc4Ssstefan1 auto CheckReturnInst = [&](Instruction &I) { 21275dfd7cc4Ssstefan1 Optional<Value *> NewReplVal = 
21285dfd7cc4Ssstefan1 ICVTrackingAA.getReplacementValue(ICV, &I, A); 21295dfd7cc4Ssstefan1 21305dfd7cc4Ssstefan1 // If we found a second ICV value there is no unique returned value. 21315dfd7cc4Ssstefan1 if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal) 21325dfd7cc4Ssstefan1 return false; 21335dfd7cc4Ssstefan1 21345dfd7cc4Ssstefan1 UniqueICVValue = NewReplVal; 21355dfd7cc4Ssstefan1 21365dfd7cc4Ssstefan1 return true; 21375dfd7cc4Ssstefan1 }; 21385dfd7cc4Ssstefan1 21395dfd7cc4Ssstefan1 if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}, 21405dfd7cc4Ssstefan1 /* CheckBBLivenessOnly */ true)) 21415dfd7cc4Ssstefan1 UniqueICVValue = nullptr; 21425dfd7cc4Ssstefan1 21435dfd7cc4Ssstefan1 if (UniqueICVValue == ReplVal) 21445dfd7cc4Ssstefan1 continue; 21455dfd7cc4Ssstefan1 21465dfd7cc4Ssstefan1 ReplVal = UniqueICVValue; 21475dfd7cc4Ssstefan1 Changed = ChangeStatus::CHANGED; 21485dfd7cc4Ssstefan1 } 21495dfd7cc4Ssstefan1 21505dfd7cc4Ssstefan1 return Changed; 21515dfd7cc4Ssstefan1 } 21525dfd7cc4Ssstefan1 }; 21535dfd7cc4Ssstefan1 21545dfd7cc4Ssstefan1 struct AAICVTrackerCallSite : AAICVTracker { 21555dfd7cc4Ssstefan1 AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A) 21565dfd7cc4Ssstefan1 : AAICVTracker(IRP, A) {} 21575dfd7cc4Ssstefan1 21585dfd7cc4Ssstefan1 void initialize(Attributor &A) override { 21595dfd7cc4Ssstefan1 Function *F = getAnchorScope(); 21605dfd7cc4Ssstefan1 if (!F || !A.isFunctionIPOAmendable(*F)) 21615dfd7cc4Ssstefan1 indicatePessimisticFixpoint(); 21625dfd7cc4Ssstefan1 21635dfd7cc4Ssstefan1 // We only initialize this AA for getters, so we need to know which ICV it 21645dfd7cc4Ssstefan1 // gets. 
21655dfd7cc4Ssstefan1 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); 21665dfd7cc4Ssstefan1 for (InternalControlVar ICV : TrackableICVs) { 21675dfd7cc4Ssstefan1 auto ICVInfo = OMPInfoCache.ICVs[ICV]; 21685dfd7cc4Ssstefan1 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter]; 21695dfd7cc4Ssstefan1 if (Getter.Declaration == getAssociatedFunction()) { 21705dfd7cc4Ssstefan1 AssociatedICV = ICVInfo.Kind; 21715dfd7cc4Ssstefan1 return; 21725dfd7cc4Ssstefan1 } 21735dfd7cc4Ssstefan1 } 21745dfd7cc4Ssstefan1 21755dfd7cc4Ssstefan1 /// Unknown ICV. 21765dfd7cc4Ssstefan1 indicatePessimisticFixpoint(); 21775dfd7cc4Ssstefan1 } 21785dfd7cc4Ssstefan1 21795dfd7cc4Ssstefan1 ChangeStatus manifest(Attributor &A) override { 21805dfd7cc4Ssstefan1 if (!ReplVal.hasValue() || !ReplVal.getValue()) 21815dfd7cc4Ssstefan1 return ChangeStatus::UNCHANGED; 21825dfd7cc4Ssstefan1 21835dfd7cc4Ssstefan1 A.changeValueAfterManifest(*getCtxI(), **ReplVal); 21845dfd7cc4Ssstefan1 A.deleteAfterManifest(*getCtxI()); 21855dfd7cc4Ssstefan1 21865dfd7cc4Ssstefan1 return ChangeStatus::CHANGED; 21875dfd7cc4Ssstefan1 } 21885dfd7cc4Ssstefan1 21895dfd7cc4Ssstefan1 // FIXME: come up with better string. 21905dfd7cc4Ssstefan1 const std::string getAsStr() const override { return "ICVTrackerCallSite"; } 21915dfd7cc4Ssstefan1 21925dfd7cc4Ssstefan1 // FIXME: come up with some stats. 21935dfd7cc4Ssstefan1 void trackStatistics() const override {} 21945dfd7cc4Ssstefan1 21955dfd7cc4Ssstefan1 InternalControlVar AssociatedICV; 21965dfd7cc4Ssstefan1 Optional<Value *> ReplVal; 21975dfd7cc4Ssstefan1 21985dfd7cc4Ssstefan1 ChangeStatus updateImpl(Attributor &A) override { 21995dfd7cc4Ssstefan1 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( 22005b70c12fSJohannes Doerfert *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); 22015dfd7cc4Ssstefan1 22025dfd7cc4Ssstefan1 // We don't have any information, so we assume it changes the ICV. 
22035dfd7cc4Ssstefan1 if (!ICVTrackingAA.isAssumedTracked()) 22045dfd7cc4Ssstefan1 return indicatePessimisticFixpoint(); 22055dfd7cc4Ssstefan1 22065dfd7cc4Ssstefan1 Optional<Value *> NewReplVal = 22075dfd7cc4Ssstefan1 ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A); 22085dfd7cc4Ssstefan1 22095dfd7cc4Ssstefan1 if (ReplVal == NewReplVal) 22105dfd7cc4Ssstefan1 return ChangeStatus::UNCHANGED; 22115dfd7cc4Ssstefan1 22125dfd7cc4Ssstefan1 ReplVal = NewReplVal; 22135dfd7cc4Ssstefan1 return ChangeStatus::CHANGED; 22145dfd7cc4Ssstefan1 } 22155dfd7cc4Ssstefan1 22165dfd7cc4Ssstefan1 // Return the value with which associated value can be replaced for specific 22175dfd7cc4Ssstefan1 // \p ICV. 22185dfd7cc4Ssstefan1 Optional<Value *> 22195dfd7cc4Ssstefan1 getUniqueReplacementValue(InternalControlVar ICV) const override { 22205dfd7cc4Ssstefan1 return ReplVal; 22215dfd7cc4Ssstefan1 } 22225dfd7cc4Ssstefan1 }; 22235dfd7cc4Ssstefan1 22245dfd7cc4Ssstefan1 struct AAICVTrackerCallSiteReturned : AAICVTracker { 22255dfd7cc4Ssstefan1 AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A) 22265dfd7cc4Ssstefan1 : AAICVTracker(IRP, A) {} 22275dfd7cc4Ssstefan1 22285dfd7cc4Ssstefan1 // FIXME: come up with better string. 22295dfd7cc4Ssstefan1 const std::string getAsStr() const override { 22305dfd7cc4Ssstefan1 return "ICVTrackerCallSiteReturned"; 22315dfd7cc4Ssstefan1 } 22325dfd7cc4Ssstefan1 22335dfd7cc4Ssstefan1 // FIXME: come up with some stats. 22345dfd7cc4Ssstefan1 void trackStatistics() const override {} 22355dfd7cc4Ssstefan1 22365dfd7cc4Ssstefan1 /// We don't manifest anything for this AA. 22375dfd7cc4Ssstefan1 ChangeStatus manifest(Attributor &A) override { 22385dfd7cc4Ssstefan1 return ChangeStatus::UNCHANGED; 22395dfd7cc4Ssstefan1 } 22405dfd7cc4Ssstefan1 22415dfd7cc4Ssstefan1 // Map of ICV to their values at specific program point. 
22425dfd7cc4Ssstefan1 EnumeratedArray<Optional<Value *>, InternalControlVar, 22435dfd7cc4Ssstefan1 InternalControlVar::ICV___last> 22445dfd7cc4Ssstefan1 ICVReplacementValuesMap; 22455dfd7cc4Ssstefan1 22465dfd7cc4Ssstefan1 /// Return the value with which associated value can be replaced for specific 22475dfd7cc4Ssstefan1 /// \p ICV. 22485dfd7cc4Ssstefan1 Optional<Value *> 22495dfd7cc4Ssstefan1 getUniqueReplacementValue(InternalControlVar ICV) const override { 22505dfd7cc4Ssstefan1 return ICVReplacementValuesMap[ICV]; 22515dfd7cc4Ssstefan1 } 22525dfd7cc4Ssstefan1 22535dfd7cc4Ssstefan1 ChangeStatus updateImpl(Attributor &A) override { 22545dfd7cc4Ssstefan1 ChangeStatus Changed = ChangeStatus::UNCHANGED; 22555dfd7cc4Ssstefan1 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( 22565b70c12fSJohannes Doerfert *this, IRPosition::returned(*getAssociatedFunction()), 22575b70c12fSJohannes Doerfert DepClassTy::REQUIRED); 22585dfd7cc4Ssstefan1 22595dfd7cc4Ssstefan1 // We don't have any information, so we assume it changes the ICV. 
22605dfd7cc4Ssstefan1 if (!ICVTrackingAA.isAssumedTracked()) 22615dfd7cc4Ssstefan1 return indicatePessimisticFixpoint(); 22625dfd7cc4Ssstefan1 22635dfd7cc4Ssstefan1 for (InternalControlVar ICV : TrackableICVs) { 22645dfd7cc4Ssstefan1 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; 22655dfd7cc4Ssstefan1 Optional<Value *> NewReplVal = 22665dfd7cc4Ssstefan1 ICVTrackingAA.getUniqueReplacementValue(ICV); 22675dfd7cc4Ssstefan1 22685dfd7cc4Ssstefan1 if (ReplVal == NewReplVal) 22695dfd7cc4Ssstefan1 continue; 22705dfd7cc4Ssstefan1 22715dfd7cc4Ssstefan1 ReplVal = NewReplVal; 22725dfd7cc4Ssstefan1 Changed = ChangeStatus::CHANGED; 22735dfd7cc4Ssstefan1 } 22745dfd7cc4Ssstefan1 return Changed; 22755dfd7cc4Ssstefan1 } 22769548b74aSJohannes Doerfert }; 227718283125SJoseph Huber 227818283125SJoseph Huber struct AAExecutionDomainFunction : public AAExecutionDomain { 227918283125SJoseph Huber AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) 228018283125SJoseph Huber : AAExecutionDomain(IRP, A) {} 228118283125SJoseph Huber 228218283125SJoseph Huber const std::string getAsStr() const override { 228318283125SJoseph Huber return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) + 228418283125SJoseph Huber "/" + std::to_string(NumBBs) + " BBs thread 0 only."; 228518283125SJoseph Huber } 228618283125SJoseph Huber 228718283125SJoseph Huber /// See AbstractAttribute::trackStatistics(). 
228818283125SJoseph Huber void trackStatistics() const override {} 228918283125SJoseph Huber 229018283125SJoseph Huber void initialize(Attributor &A) override { 229118283125SJoseph Huber Function *F = getAnchorScope(); 229218283125SJoseph Huber for (const auto &BB : *F) 229318283125SJoseph Huber SingleThreadedBBs.insert(&BB); 229418283125SJoseph Huber NumBBs = SingleThreadedBBs.size(); 229518283125SJoseph Huber } 229618283125SJoseph Huber 229718283125SJoseph Huber ChangeStatus manifest(Attributor &A) override { 229818283125SJoseph Huber LLVM_DEBUG({ 229918283125SJoseph Huber for (const BasicBlock *BB : SingleThreadedBBs) 230018283125SJoseph Huber dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " " 230118283125SJoseph Huber << BB->getName() << " is executed by a single thread.\n"; 230218283125SJoseph Huber }); 230318283125SJoseph Huber return ChangeStatus::UNCHANGED; 230418283125SJoseph Huber } 230518283125SJoseph Huber 230618283125SJoseph Huber ChangeStatus updateImpl(Attributor &A) override; 230718283125SJoseph Huber 230818283125SJoseph Huber /// Check if an instruction is executed by a single thread. 23099a23e673SJohannes Doerfert bool isExecutedByInitialThreadOnly(const Instruction &I) const override { 23109a23e673SJohannes Doerfert return isExecutedByInitialThreadOnly(*I.getParent()); 231118283125SJoseph Huber } 231218283125SJoseph Huber 23139a23e673SJohannes Doerfert bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override { 23141cfdcae6SJoseph Huber return isValidState() && SingleThreadedBBs.contains(&BB); 231518283125SJoseph Huber } 231618283125SJoseph Huber 231718283125SJoseph Huber /// Set of basic blocks that are executed by a single thread. 231818283125SJoseph Huber DenseSet<const BasicBlock *> SingleThreadedBBs; 231918283125SJoseph Huber 232018283125SJoseph Huber /// Total number of basic blocks in this function. 
232118283125SJoseph Huber long unsigned NumBBs; 232218283125SJoseph Huber }; 232318283125SJoseph Huber 232418283125SJoseph Huber ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { 232518283125SJoseph Huber Function *F = getAnchorScope(); 232618283125SJoseph Huber ReversePostOrderTraversal<Function *> RPOT(F); 232718283125SJoseph Huber auto NumSingleThreadedBBs = SingleThreadedBBs.size(); 232818283125SJoseph Huber 232918283125SJoseph Huber bool AllCallSitesKnown; 233018283125SJoseph Huber auto PredForCallSite = [&](AbstractCallSite ACS) { 233118283125SJoseph Huber const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>( 233218283125SJoseph Huber *this, IRPosition::function(*ACS.getInstruction()->getFunction()), 233318283125SJoseph Huber DepClassTy::REQUIRED); 23341cfdcae6SJoseph Huber return ACS.isDirectCall() && 23351cfdcae6SJoseph Huber ExecutionDomainAA.isExecutedByInitialThreadOnly( 23369a23e673SJohannes Doerfert *ACS.getInstruction()); 233718283125SJoseph Huber }; 233818283125SJoseph Huber 233918283125SJoseph Huber if (!A.checkForAllCallSites(PredForCallSite, *this, 234018283125SJoseph Huber /* RequiresAllCallSites */ true, 234118283125SJoseph Huber AllCallSitesKnown)) 234218283125SJoseph Huber SingleThreadedBBs.erase(&F->getEntryBlock()); 234318283125SJoseph Huber 234418283125SJoseph Huber // Check if the edge into the successor block compares a thread-id function to 234518283125SJoseph Huber // a constant zero. 234618283125SJoseph Huber // TODO: Use AAValueSimplify to simplify and propogate constants. 234718283125SJoseph Huber // TODO: Check more than a single use for thread ID's. 
23486fc51c9fSJoseph Huber auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { 234918283125SJoseph Huber if (!Edge || !Edge->isConditional()) 235018283125SJoseph Huber return false; 235118283125SJoseph Huber if (Edge->getSuccessor(0) != SuccessorBB) 235218283125SJoseph Huber return false; 235318283125SJoseph Huber 235418283125SJoseph Huber auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition()); 235518283125SJoseph Huber if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality()) 235618283125SJoseph Huber return false; 235718283125SJoseph Huber 23586fc51c9fSJoseph Huber // Temporarily match the pattern generated by clang for teams regions. 23596fc51c9fSJoseph Huber // TODO: Remove this once the new runtime is in place. 23606fc51c9fSJoseph Huber ConstantInt *One, *NegOne; 23616fc51c9fSJoseph Huber CmpInst::Predicate Pred; 23626fc51c9fSJoseph Huber auto &&m_ThreadID = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_tid_x>(); 23636fc51c9fSJoseph Huber auto &&m_WarpSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_warpsize>(); 23646fc51c9fSJoseph Huber auto &&m_BlockSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_ntid_x>(); 23656fc51c9fSJoseph Huber if (match(Cmp, m_Cmp(Pred, m_ThreadID, 23666fc51c9fSJoseph Huber m_And(m_Sub(m_BlockSize, m_ConstantInt(One)), 23676fc51c9fSJoseph Huber m_Xor(m_Sub(m_WarpSize, m_ConstantInt(One)), 23686fc51c9fSJoseph Huber m_ConstantInt(NegOne)))))) 23696fc51c9fSJoseph Huber if (One->isOne() && NegOne->isMinusOne() && 23706fc51c9fSJoseph Huber Pred == CmpInst::Predicate::ICMP_EQ) 23716fc51c9fSJoseph Huber return true; 23726fc51c9fSJoseph Huber 237318283125SJoseph Huber ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1)); 237418283125SJoseph Huber if (!C || !C->isZero()) 237518283125SJoseph Huber return false; 237618283125SJoseph Huber 237768abc3d2SJoseph Huber if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0))) 237868abc3d2SJoseph Huber if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x) 
237918283125SJoseph Huber return true; 238068abc3d2SJoseph Huber if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0))) 238168abc3d2SJoseph Huber if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x) 238268abc3d2SJoseph Huber return true; 238318283125SJoseph Huber 238418283125SJoseph Huber return false; 238518283125SJoseph Huber }; 238618283125SJoseph Huber 238718283125SJoseph Huber // Merge all the predecessor states into the current basic block. A basic 238818283125SJoseph Huber // block is executed by a single thread if all of its predecessors are. 238918283125SJoseph Huber auto MergePredecessorStates = [&](BasicBlock *BB) { 239018283125SJoseph Huber if (pred_begin(BB) == pred_end(BB)) 239118283125SJoseph Huber return SingleThreadedBBs.contains(BB); 239218283125SJoseph Huber 23936fc51c9fSJoseph Huber bool IsInitialThread = true; 239418283125SJoseph Huber for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB); 239518283125SJoseph Huber PredBB != PredEndBB; ++PredBB) { 23966fc51c9fSJoseph Huber if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()), 239718283125SJoseph Huber BB)) 23986fc51c9fSJoseph Huber IsInitialThread &= SingleThreadedBBs.contains(*PredBB); 239918283125SJoseph Huber } 240018283125SJoseph Huber 24016fc51c9fSJoseph Huber return IsInitialThread; 240218283125SJoseph Huber }; 240318283125SJoseph Huber 240418283125SJoseph Huber for (auto *BB : RPOT) { 240518283125SJoseph Huber if (!MergePredecessorStates(BB)) 240618283125SJoseph Huber SingleThreadedBBs.erase(BB); 240718283125SJoseph Huber } 240818283125SJoseph Huber 240918283125SJoseph Huber return (NumSingleThreadedBBs == SingleThreadedBBs.size()) 241018283125SJoseph Huber ? ChangeStatus::UNCHANGED 241118283125SJoseph Huber : ChangeStatus::CHANGED; 241218283125SJoseph Huber } 241318283125SJoseph Huber 24146fc51c9fSJoseph Huber /// Try to replace memory allocation calls called by a single thread with a 24156fc51c9fSJoseph Huber /// static buffer of shared memory. 
24166fc51c9fSJoseph Huber struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> { 24176fc51c9fSJoseph Huber using Base = StateWrapper<BooleanState, AbstractAttribute>; 24186fc51c9fSJoseph Huber AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {} 24196fc51c9fSJoseph Huber 24206fc51c9fSJoseph Huber /// Create an abstract attribute view for the position \p IRP. 24216fc51c9fSJoseph Huber static AAHeapToShared &createForPosition(const IRPosition &IRP, 24226fc51c9fSJoseph Huber Attributor &A); 24236fc51c9fSJoseph Huber 24246fc51c9fSJoseph Huber /// See AbstractAttribute::getName(). 24256fc51c9fSJoseph Huber const std::string getName() const override { return "AAHeapToShared"; } 24266fc51c9fSJoseph Huber 24276fc51c9fSJoseph Huber /// See AbstractAttribute::getIdAddr(). 24286fc51c9fSJoseph Huber const char *getIdAddr() const override { return &ID; } 24296fc51c9fSJoseph Huber 24306fc51c9fSJoseph Huber /// This function should return true if the type of the \p AA is 24316fc51c9fSJoseph Huber /// AAHeapToShared. 
24326fc51c9fSJoseph Huber static bool classof(const AbstractAttribute *AA) { 24336fc51c9fSJoseph Huber return (AA->getIdAddr() == &ID); 24346fc51c9fSJoseph Huber } 24356fc51c9fSJoseph Huber 24366fc51c9fSJoseph Huber /// Unique ID (due to the unique address) 24376fc51c9fSJoseph Huber static const char ID; 24386fc51c9fSJoseph Huber }; 24396fc51c9fSJoseph Huber 24406fc51c9fSJoseph Huber struct AAHeapToSharedFunction : public AAHeapToShared { 24416fc51c9fSJoseph Huber AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A) 24426fc51c9fSJoseph Huber : AAHeapToShared(IRP, A) {} 24436fc51c9fSJoseph Huber 24446fc51c9fSJoseph Huber const std::string getAsStr() const override { 24456fc51c9fSJoseph Huber return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) + 24466fc51c9fSJoseph Huber " malloc calls eligible."; 24476fc51c9fSJoseph Huber } 24486fc51c9fSJoseph Huber 24496fc51c9fSJoseph Huber /// See AbstractAttribute::trackStatistics(). 24506fc51c9fSJoseph Huber void trackStatistics() const override {} 24516fc51c9fSJoseph Huber 24526fc51c9fSJoseph Huber void initialize(Attributor &A) override { 24536fc51c9fSJoseph Huber auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); 24546fc51c9fSJoseph Huber auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; 24556fc51c9fSJoseph Huber 24566fc51c9fSJoseph Huber for (User *U : RFI.Declaration->users()) 24576fc51c9fSJoseph Huber if (CallBase *CB = dyn_cast<CallBase>(U)) 24586fc51c9fSJoseph Huber MallocCalls.insert(CB); 24596fc51c9fSJoseph Huber } 24606fc51c9fSJoseph Huber 24616fc51c9fSJoseph Huber ChangeStatus manifest(Attributor &A) override { 24626fc51c9fSJoseph Huber if (MallocCalls.empty()) 24636fc51c9fSJoseph Huber return ChangeStatus::UNCHANGED; 24646fc51c9fSJoseph Huber 24656fc51c9fSJoseph Huber auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); 24666fc51c9fSJoseph Huber auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared]; 24676fc51c9fSJoseph Huber 
24686fc51c9fSJoseph Huber Function *F = getAnchorScope(); 24696fc51c9fSJoseph Huber auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this, 24706fc51c9fSJoseph Huber DepClassTy::OPTIONAL); 24716fc51c9fSJoseph Huber 24726fc51c9fSJoseph Huber ChangeStatus Changed = ChangeStatus::UNCHANGED; 24736fc51c9fSJoseph Huber for (CallBase *CB : MallocCalls) { 24746fc51c9fSJoseph Huber // Skip replacing this if HeapToStack has already claimed it. 24756fc51c9fSJoseph Huber if (HS && HS->isKnownHeapToStack(*CB)) 24766fc51c9fSJoseph Huber continue; 24776fc51c9fSJoseph Huber 24786fc51c9fSJoseph Huber // Find the unique free call to remove it. 24796fc51c9fSJoseph Huber SmallVector<CallBase *, 4> FreeCalls; 24806fc51c9fSJoseph Huber for (auto *U : CB->users()) { 24816fc51c9fSJoseph Huber CallBase *C = dyn_cast<CallBase>(U); 24826fc51c9fSJoseph Huber if (C && C->getCalledFunction() == FreeCall.Declaration) 24836fc51c9fSJoseph Huber FreeCalls.push_back(C); 24846fc51c9fSJoseph Huber } 24856fc51c9fSJoseph Huber if (FreeCalls.size() != 1) 24866fc51c9fSJoseph Huber continue; 24876fc51c9fSJoseph Huber 24886fc51c9fSJoseph Huber ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0)); 24896fc51c9fSJoseph Huber 24906fc51c9fSJoseph Huber LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in " 24916fc51c9fSJoseph Huber << CB->getCaller()->getName() << " with " 24926fc51c9fSJoseph Huber << AllocSize->getZExtValue() 24936fc51c9fSJoseph Huber << " bytes of shared memory\n"); 24946fc51c9fSJoseph Huber 24956fc51c9fSJoseph Huber // Create a new shared memory buffer of the same size as the allocation 24966fc51c9fSJoseph Huber // and replace all the uses of the original allocation with it. 
24976fc51c9fSJoseph Huber Module *M = CB->getModule(); 24986fc51c9fSJoseph Huber Type *Int8Ty = Type::getInt8Ty(M->getContext()); 24996fc51c9fSJoseph Huber Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); 25006fc51c9fSJoseph Huber auto *SharedMem = new GlobalVariable( 25016fc51c9fSJoseph Huber *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, 25026fc51c9fSJoseph Huber UndefValue::get(Int8ArrTy), CB->getName(), nullptr, 25036fc51c9fSJoseph Huber GlobalValue::NotThreadLocal, 25046fc51c9fSJoseph Huber static_cast<unsigned>(AddressSpace::Shared)); 25056fc51c9fSJoseph Huber auto *NewBuffer = 25066fc51c9fSJoseph Huber ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo()); 25076fc51c9fSJoseph Huber 250830e36c9bSJoseph Huber auto Remark = [&](OptimizationRemark OR) { 250930e36c9bSJoseph Huber return OR << "Replaced globalized variable with " 251030e36c9bSJoseph Huber << ore::NV("SharedMemory", AllocSize->getZExtValue()) 251130e36c9bSJoseph Huber << ((AllocSize->getZExtValue() != 1) ? 
" bytes " : " byte ") 251230e36c9bSJoseph Huber << "of shared memory"; 251330e36c9bSJoseph Huber }; 251430e36c9bSJoseph Huber A.emitRemark<OptimizationRemark>(CB, "OpenMPReplaceGlobalization", 251530e36c9bSJoseph Huber Remark); 251630e36c9bSJoseph Huber 25176fc51c9fSJoseph Huber SharedMem->setAlignment(MaybeAlign(32)); 25186fc51c9fSJoseph Huber 25196fc51c9fSJoseph Huber A.changeValueAfterManifest(*CB, *NewBuffer); 25206fc51c9fSJoseph Huber A.deleteAfterManifest(*CB); 25216fc51c9fSJoseph Huber A.deleteAfterManifest(*FreeCalls.front()); 25226fc51c9fSJoseph Huber 25236fc51c9fSJoseph Huber NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); 25246fc51c9fSJoseph Huber Changed = ChangeStatus::CHANGED; 25256fc51c9fSJoseph Huber } 25266fc51c9fSJoseph Huber 25276fc51c9fSJoseph Huber return Changed; 25286fc51c9fSJoseph Huber } 25296fc51c9fSJoseph Huber 25306fc51c9fSJoseph Huber ChangeStatus updateImpl(Attributor &A) override { 25316fc51c9fSJoseph Huber auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); 25326fc51c9fSJoseph Huber auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; 25336fc51c9fSJoseph Huber Function *F = getAnchorScope(); 25346fc51c9fSJoseph Huber 25356fc51c9fSJoseph Huber auto NumMallocCalls = MallocCalls.size(); 25366fc51c9fSJoseph Huber 25376fc51c9fSJoseph Huber // Only consider malloc calls executed by a single thread with a constant. 
25386fc51c9fSJoseph Huber for (User *U : RFI.Declaration->users()) { 25396fc51c9fSJoseph Huber const auto &ED = A.getAAFor<AAExecutionDomain>( 25406fc51c9fSJoseph Huber *this, IRPosition::function(*F), DepClassTy::REQUIRED); 25416fc51c9fSJoseph Huber if (CallBase *CB = dyn_cast<CallBase>(U)) 25426fc51c9fSJoseph Huber if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) || 25436fc51c9fSJoseph Huber !ED.isExecutedByInitialThreadOnly(*CB)) 25446fc51c9fSJoseph Huber MallocCalls.erase(CB); 25456fc51c9fSJoseph Huber } 25466fc51c9fSJoseph Huber 25476fc51c9fSJoseph Huber if (NumMallocCalls != MallocCalls.size()) 25486fc51c9fSJoseph Huber return ChangeStatus::CHANGED; 25496fc51c9fSJoseph Huber 25506fc51c9fSJoseph Huber return ChangeStatus::UNCHANGED; 25516fc51c9fSJoseph Huber } 25526fc51c9fSJoseph Huber 25536fc51c9fSJoseph Huber /// Collection of all malloc calls in a function. 25546fc51c9fSJoseph Huber SmallPtrSet<CallBase *, 4> MallocCalls; 25556fc51c9fSJoseph Huber }; 25566fc51c9fSJoseph Huber 25579548b74aSJohannes Doerfert } // namespace 25589548b74aSJohannes Doerfert 2559b8235d2bSsstefan1 const char AAICVTracker::ID = 0; 256018283125SJoseph Huber const char AAExecutionDomain::ID = 0; 25616fc51c9fSJoseph Huber const char AAHeapToShared::ID = 0; 2562b8235d2bSsstefan1 2563b8235d2bSsstefan1 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP, 2564b8235d2bSsstefan1 Attributor &A) { 2565b8235d2bSsstefan1 AAICVTracker *AA = nullptr; 2566b8235d2bSsstefan1 switch (IRP.getPositionKind()) { 2567b8235d2bSsstefan1 case IRPosition::IRP_INVALID: 2568b8235d2bSsstefan1 case IRPosition::IRP_FLOAT: 2569b8235d2bSsstefan1 case IRPosition::IRP_ARGUMENT: 2570b8235d2bSsstefan1 case IRPosition::IRP_CALL_SITE_ARGUMENT: 25711de70a72SJohannes Doerfert llvm_unreachable("ICVTracker can only be created for function position!"); 25725dfd7cc4Ssstefan1 case IRPosition::IRP_RETURNED: 25735dfd7cc4Ssstefan1 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A); 25745dfd7cc4Ssstefan1 
break; 25755dfd7cc4Ssstefan1 case IRPosition::IRP_CALL_SITE_RETURNED: 25765dfd7cc4Ssstefan1 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A); 25775dfd7cc4Ssstefan1 break; 25785dfd7cc4Ssstefan1 case IRPosition::IRP_CALL_SITE: 25795dfd7cc4Ssstefan1 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A); 25805dfd7cc4Ssstefan1 break; 2581b8235d2bSsstefan1 case IRPosition::IRP_FUNCTION: 2582b8235d2bSsstefan1 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A); 2583b8235d2bSsstefan1 break; 2584b8235d2bSsstefan1 } 2585b8235d2bSsstefan1 2586b8235d2bSsstefan1 return *AA; 2587b8235d2bSsstefan1 } 2588b8235d2bSsstefan1 258918283125SJoseph Huber AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP, 259018283125SJoseph Huber Attributor &A) { 259118283125SJoseph Huber AAExecutionDomainFunction *AA = nullptr; 259218283125SJoseph Huber switch (IRP.getPositionKind()) { 259318283125SJoseph Huber case IRPosition::IRP_INVALID: 259418283125SJoseph Huber case IRPosition::IRP_FLOAT: 259518283125SJoseph Huber case IRPosition::IRP_ARGUMENT: 259618283125SJoseph Huber case IRPosition::IRP_CALL_SITE_ARGUMENT: 259718283125SJoseph Huber case IRPosition::IRP_RETURNED: 259818283125SJoseph Huber case IRPosition::IRP_CALL_SITE_RETURNED: 259918283125SJoseph Huber case IRPosition::IRP_CALL_SITE: 260018283125SJoseph Huber llvm_unreachable( 260118283125SJoseph Huber "AAExecutionDomain can only be created for function position!"); 260218283125SJoseph Huber case IRPosition::IRP_FUNCTION: 260318283125SJoseph Huber AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A); 260418283125SJoseph Huber break; 260518283125SJoseph Huber } 260618283125SJoseph Huber 260718283125SJoseph Huber return *AA; 260818283125SJoseph Huber } 260918283125SJoseph Huber 26106fc51c9fSJoseph Huber AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP, 26116fc51c9fSJoseph Huber Attributor &A) { 26126fc51c9fSJoseph Huber AAHeapToSharedFunction *AA = nullptr; 26136fc51c9fSJoseph Huber 
switch (IRP.getPositionKind()) { 26146fc51c9fSJoseph Huber case IRPosition::IRP_INVALID: 26156fc51c9fSJoseph Huber case IRPosition::IRP_FLOAT: 26166fc51c9fSJoseph Huber case IRPosition::IRP_ARGUMENT: 26176fc51c9fSJoseph Huber case IRPosition::IRP_CALL_SITE_ARGUMENT: 26186fc51c9fSJoseph Huber case IRPosition::IRP_RETURNED: 26196fc51c9fSJoseph Huber case IRPosition::IRP_CALL_SITE_RETURNED: 26206fc51c9fSJoseph Huber case IRPosition::IRP_CALL_SITE: 26216fc51c9fSJoseph Huber llvm_unreachable( 26226fc51c9fSJoseph Huber "AAHeapToShared can only be created for function position!"); 26236fc51c9fSJoseph Huber case IRPosition::IRP_FUNCTION: 26246fc51c9fSJoseph Huber AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A); 26256fc51c9fSJoseph Huber break; 26266fc51c9fSJoseph Huber } 26276fc51c9fSJoseph Huber 26286fc51c9fSJoseph Huber return *AA; 26296fc51c9fSJoseph Huber } 26306fc51c9fSJoseph Huber 2631b2ad63d3SJoseph Huber PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { 26325ccb7424SJoseph Huber if (!containsOpenMP(M)) 2633b2ad63d3SJoseph Huber return PreservedAnalyses::all(); 2634b2ad63d3SJoseph Huber if (DisableOpenMPOptimizations) 2635b2ad63d3SJoseph Huber return PreservedAnalyses::all(); 2636b2ad63d3SJoseph Huber 26375ccb7424SJoseph Huber KernelSet Kernels = getDeviceKernels(M); 26385ccb7424SJoseph Huber 263903d7e61cSJoseph Huber // Create internal copies of each function if this is a kernel Module. 264003d7e61cSJoseph Huber DenseSet<const Function *> InternalizedFuncs; 26415ccb7424SJoseph Huber if (isOpenMPDevice(M)) 264203d7e61cSJoseph Huber for (Function &F : M) 26435ccb7424SJoseph Huber if (!F.isDeclaration() && !Kernels.contains(&F)) 264403d7e61cSJoseph Huber if (Attributor::internalizeFunction(F, /* Force */ true)) 264503d7e61cSJoseph Huber InternalizedFuncs.insert(&F); 264603d7e61cSJoseph Huber 264703d7e61cSJoseph Huber // Look at every function definition in the Module that wasn't internalized. 
2648b2ad63d3SJoseph Huber SmallVector<Function *, 16> SCC; 264903d7e61cSJoseph Huber for (Function &F : M) 265003d7e61cSJoseph Huber if (!F.isDeclaration() && !InternalizedFuncs.contains(&F)) 265103d7e61cSJoseph Huber SCC.push_back(&F); 2652b2ad63d3SJoseph Huber 2653b2ad63d3SJoseph Huber if (SCC.empty()) 2654b2ad63d3SJoseph Huber return PreservedAnalyses::all(); 2655b2ad63d3SJoseph Huber 2656b2ad63d3SJoseph Huber FunctionAnalysisManager &FAM = 2657b2ad63d3SJoseph Huber AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 2658b2ad63d3SJoseph Huber 2659b2ad63d3SJoseph Huber AnalysisGetter AG(FAM); 2660b2ad63d3SJoseph Huber 2661b2ad63d3SJoseph Huber auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & { 2662b2ad63d3SJoseph Huber return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 2663b2ad63d3SJoseph Huber }; 2664b2ad63d3SJoseph Huber 2665b2ad63d3SJoseph Huber BumpPtrAllocator Allocator; 2666b2ad63d3SJoseph Huber CallGraphUpdater CGUpdater; 2667b2ad63d3SJoseph Huber 2668b2ad63d3SJoseph Huber SetVector<Function *> Functions(SCC.begin(), SCC.end()); 26695ccb7424SJoseph Huber OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels); 2670b2ad63d3SJoseph Huber 2671*13b2fba2SJoseph Huber unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 
128 : 32; 26724a6bd8e3SJoseph Huber Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false, 2673*13b2fba2SJoseph Huber MaxFixpointIterations, OREGetter, DEBUG_TYPE); 2674b2ad63d3SJoseph Huber 2675b2ad63d3SJoseph Huber OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); 2676b2ad63d3SJoseph Huber bool Changed = OMPOpt.run(true); 2677b2ad63d3SJoseph Huber if (Changed) 2678b2ad63d3SJoseph Huber return PreservedAnalyses::none(); 2679b2ad63d3SJoseph Huber 2680b2ad63d3SJoseph Huber return PreservedAnalyses::all(); 2681b2ad63d3SJoseph Huber } 2682b2ad63d3SJoseph Huber 2683b2ad63d3SJoseph Huber PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, 26849548b74aSJohannes Doerfert CGSCCAnalysisManager &AM, 2685b2ad63d3SJoseph Huber LazyCallGraph &CG, 2686b2ad63d3SJoseph Huber CGSCCUpdateResult &UR) { 26875ccb7424SJoseph Huber if (!containsOpenMP(*C.begin()->getFunction().getParent())) 26889548b74aSJohannes Doerfert return PreservedAnalyses::all(); 26899548b74aSJohannes Doerfert if (DisableOpenMPOptimizations) 26909548b74aSJohannes Doerfert return PreservedAnalyses::all(); 26919548b74aSJohannes Doerfert 2692ee17263aSJohannes Doerfert SmallVector<Function *, 16> SCC; 2693351d234dSRoman Lebedev // If there are kernels in the module, we have to run on all SCC's. 
2694351d234dSRoman Lebedev for (LazyCallGraph::Node &N : C) { 2695351d234dSRoman Lebedev Function *Fn = &N.getFunction(); 2696351d234dSRoman Lebedev SCC.push_back(Fn); 2697351d234dSRoman Lebedev } 2698351d234dSRoman Lebedev 26995ccb7424SJoseph Huber if (SCC.empty()) 27009548b74aSJohannes Doerfert return PreservedAnalyses::all(); 27019548b74aSJohannes Doerfert 27025ccb7424SJoseph Huber Module &M = *C.begin()->getFunction().getParent(); 27035ccb7424SJoseph Huber 27045ccb7424SJoseph Huber KernelSet Kernels = getDeviceKernels(M); 27055ccb7424SJoseph Huber 27064d4ea9acSHuber, Joseph FunctionAnalysisManager &FAM = 27074d4ea9acSHuber, Joseph AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); 27087cfd267cSsstefan1 27097cfd267cSsstefan1 AnalysisGetter AG(FAM); 27107cfd267cSsstefan1 27117cfd267cSsstefan1 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & { 27124d4ea9acSHuber, Joseph return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 27134d4ea9acSHuber, Joseph }; 27144d4ea9acSHuber, Joseph 2715b2ad63d3SJoseph Huber BumpPtrAllocator Allocator; 27169548b74aSJohannes Doerfert CallGraphUpdater CGUpdater; 27179548b74aSJohannes Doerfert CGUpdater.initialize(CG, C, AM, UR); 27187cfd267cSsstefan1 27197cfd267cSsstefan1 SetVector<Function *> Functions(SCC.begin(), SCC.end()); 27207cfd267cSsstefan1 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, 27215ccb7424SJoseph Huber /*CGSCC*/ Functions, Kernels); 27227cfd267cSsstefan1 2723*13b2fba2SJoseph Huber unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 
128 : 32; 27244a6bd8e3SJoseph Huber Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, 2725*13b2fba2SJoseph Huber MaxFixpointIterations, OREGetter, DEBUG_TYPE); 2726b8235d2bSsstefan1 2727b8235d2bSsstefan1 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); 2728b2ad63d3SJoseph Huber bool Changed = OMPOpt.run(false); 2729694ded37SGiorgis Georgakoudis if (Changed) 2730694ded37SGiorgis Georgakoudis return PreservedAnalyses::none(); 2731694ded37SGiorgis Georgakoudis 27329548b74aSJohannes Doerfert return PreservedAnalyses::all(); 27339548b74aSJohannes Doerfert } 27348b57ed09SJoseph Huber 27359548b74aSJohannes Doerfert namespace { 27369548b74aSJohannes Doerfert 2737b2ad63d3SJoseph Huber struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass { 27389548b74aSJohannes Doerfert CallGraphUpdater CGUpdater; 27399548b74aSJohannes Doerfert static char ID; 27409548b74aSJohannes Doerfert 2741b2ad63d3SJoseph Huber OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) { 2742b2ad63d3SJoseph Huber initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry()); 27439548b74aSJohannes Doerfert } 27449548b74aSJohannes Doerfert 27459548b74aSJohannes Doerfert void getAnalysisUsage(AnalysisUsage &AU) const override { 27469548b74aSJohannes Doerfert CallGraphSCCPass::getAnalysisUsage(AU); 27479548b74aSJohannes Doerfert } 27489548b74aSJohannes Doerfert 27499548b74aSJohannes Doerfert bool runOnSCC(CallGraphSCC &CGSCC) override { 27505ccb7424SJoseph Huber if (!containsOpenMP(CGSCC.getCallGraph().getModule())) 27519548b74aSJohannes Doerfert return false; 27529548b74aSJohannes Doerfert if (DisableOpenMPOptimizations || skipSCC(CGSCC)) 27539548b74aSJohannes Doerfert return false; 27549548b74aSJohannes Doerfert 2755ee17263aSJohannes Doerfert SmallVector<Function *, 16> SCC; 2756351d234dSRoman Lebedev // If there are kernels in the module, we have to run on all SCC's. 
2757351d234dSRoman Lebedev for (CallGraphNode *CGN : CGSCC) { 2758351d234dSRoman Lebedev Function *Fn = CGN->getFunction(); 2759351d234dSRoman Lebedev if (!Fn || Fn->isDeclaration()) 2760351d234dSRoman Lebedev continue; 2761ee17263aSJohannes Doerfert SCC.push_back(Fn); 2762351d234dSRoman Lebedev } 2763351d234dSRoman Lebedev 27645ccb7424SJoseph Huber if (SCC.empty()) 27659548b74aSJohannes Doerfert return false; 27669548b74aSJohannes Doerfert 27675ccb7424SJoseph Huber Module &M = CGSCC.getCallGraph().getModule(); 27685ccb7424SJoseph Huber KernelSet Kernels = getDeviceKernels(M); 27695ccb7424SJoseph Huber 27709548b74aSJohannes Doerfert CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); 27719548b74aSJohannes Doerfert CGUpdater.initialize(CG, CGSCC); 27729548b74aSJohannes Doerfert 27734d4ea9acSHuber, Joseph // Maintain a map of functions to avoid rebuilding the ORE 27744d4ea9acSHuber, Joseph DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap; 27754d4ea9acSHuber, Joseph auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & { 27764d4ea9acSHuber, Joseph std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F]; 27774d4ea9acSHuber, Joseph if (!ORE) 27784d4ea9acSHuber, Joseph ORE = std::make_unique<OptimizationRemarkEmitter>(F); 27794d4ea9acSHuber, Joseph return *ORE; 27804d4ea9acSHuber, Joseph }; 27814d4ea9acSHuber, Joseph 27827cfd267cSsstefan1 AnalysisGetter AG; 27837cfd267cSsstefan1 SetVector<Function *> Functions(SCC.begin(), SCC.end()); 27847cfd267cSsstefan1 BumpPtrAllocator Allocator; 27855ccb7424SJoseph Huber OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, 27865ccb7424SJoseph Huber Allocator, 27875ccb7424SJoseph Huber /*CGSCC*/ Functions, Kernels); 27887cfd267cSsstefan1 2789*13b2fba2SJoseph Huber unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 
128 : 32; 279030e36c9bSJoseph Huber Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, 2791*13b2fba2SJoseph Huber MaxFixpointIterations, OREGetter, DEBUG_TYPE); 2792b8235d2bSsstefan1 2793b8235d2bSsstefan1 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); 2794b2ad63d3SJoseph Huber return OMPOpt.run(false); 27959548b74aSJohannes Doerfert } 27969548b74aSJohannes Doerfert 27979548b74aSJohannes Doerfert bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); } 27989548b74aSJohannes Doerfert }; 27999548b74aSJohannes Doerfert 28009548b74aSJohannes Doerfert } // end anonymous namespace 28019548b74aSJohannes Doerfert 28025ccb7424SJoseph Huber KernelSet llvm::omp::getDeviceKernels(Module &M) { 28035ccb7424SJoseph Huber // TODO: Create a more cross-platform way of determining device kernels. 2804e8039ad4SJohannes Doerfert NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); 28055ccb7424SJoseph Huber KernelSet Kernels; 28065ccb7424SJoseph Huber 2807e8039ad4SJohannes Doerfert if (!MD) 28085ccb7424SJoseph Huber return Kernels; 2809e8039ad4SJohannes Doerfert 2810e8039ad4SJohannes Doerfert for (auto *Op : MD->operands()) { 2811e8039ad4SJohannes Doerfert if (Op->getNumOperands() < 2) 2812e8039ad4SJohannes Doerfert continue; 2813e8039ad4SJohannes Doerfert MDString *KindID = dyn_cast<MDString>(Op->getOperand(1)); 2814e8039ad4SJohannes Doerfert if (!KindID || KindID->getString() != "kernel") 2815e8039ad4SJohannes Doerfert continue; 2816e8039ad4SJohannes Doerfert 2817e8039ad4SJohannes Doerfert Function *KernelFn = 2818e8039ad4SJohannes Doerfert mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)); 2819e8039ad4SJohannes Doerfert if (!KernelFn) 2820e8039ad4SJohannes Doerfert continue; 2821e8039ad4SJohannes Doerfert 2822e8039ad4SJohannes Doerfert ++NumOpenMPTargetRegionKernels; 2823e8039ad4SJohannes Doerfert 2824e8039ad4SJohannes Doerfert Kernels.insert(KernelFn); 2825e8039ad4SJohannes Doerfert } 28265ccb7424SJoseph Huber 
28275ccb7424SJoseph Huber return Kernels; 2828e8039ad4SJohannes Doerfert } 2829e8039ad4SJohannes Doerfert 28305ccb7424SJoseph Huber bool llvm::omp::containsOpenMP(Module &M) { 28315ccb7424SJoseph Huber Metadata *MD = M.getModuleFlag("openmp"); 28325ccb7424SJoseph Huber if (!MD) 28335ccb7424SJoseph Huber return false; 2834dce6bc18SJohannes Doerfert 2835e8039ad4SJohannes Doerfert return true; 2836e8039ad4SJohannes Doerfert } 2837e8039ad4SJohannes Doerfert 28385ccb7424SJoseph Huber bool llvm::omp::isOpenMPDevice(Module &M) { 28395ccb7424SJoseph Huber Metadata *MD = M.getModuleFlag("openmp-device"); 28405ccb7424SJoseph Huber if (!MD) 28415ccb7424SJoseph Huber return false; 28425ccb7424SJoseph Huber 28435ccb7424SJoseph Huber return true; 28449548b74aSJohannes Doerfert } 28459548b74aSJohannes Doerfert 2846b2ad63d3SJoseph Huber char OpenMPOptCGSCCLegacyPass::ID = 0; 28479548b74aSJohannes Doerfert 2848b2ad63d3SJoseph Huber INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc", 28499548b74aSJohannes Doerfert "OpenMP specific optimizations", false, false) 28509548b74aSJohannes Doerfert INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) 2851b2ad63d3SJoseph Huber INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc", 28529548b74aSJohannes Doerfert "OpenMP specific optimizations", false, false) 28539548b74aSJohannes Doerfert 2854b2ad63d3SJoseph Huber Pass *llvm::createOpenMPOptCGSCCLegacyPass() { 2855b2ad63d3SJoseph Huber return new OpenMPOptCGSCCLegacyPass(); 2856b2ad63d3SJoseph Huber } 2857