1 //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // OpenMP specific optimizations: 10 // 11 // - Deduplication of runtime calls, e.g., omp_get_thread_num. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/IPO/OpenMPOpt.h" 16 17 #include "llvm/ADT/EnumeratedArray.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/Analysis/CallGraph.h" 20 #include "llvm/Analysis/CallGraphSCCPass.h" 21 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/Frontend/OpenMP/OMPConstants.h" 24 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" 25 #include "llvm/InitializePasses.h" 26 #include "llvm/Support/CommandLine.h" 27 #include "llvm/Transforms/IPO.h" 28 #include "llvm/Transforms/IPO/Attributor.h" 29 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 30 #include "llvm/Transforms/Utils/CallGraphUpdater.h" 31 #include "llvm/Transforms/Utils/CodeExtractor.h" 32 33 using namespace llvm; 34 using namespace omp; 35 36 #define DEBUG_TYPE "openmp-opt" 37 38 static cl::opt<bool> DisableOpenMPOptimizations( 39 "openmp-opt-disable", cl::ZeroOrMore, 40 cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, 41 cl::init(false)); 42 43 static cl::opt<bool> EnableParallelRegionMerging( 44 "openmp-opt-enable-merging", cl::ZeroOrMore, 45 cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, 46 cl::init(false)); 47 48 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false), 49 cl::Hidden); 50 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels", 51 cl::init(false), cl::Hidden); 52 53 static cl::opt<bool> HideMemoryTransferLatency( 54 "openmp-hide-memory-transfer-latency", 55 cl::desc("[WIP] Tries to hide the latency of host to device memory" 56 " transfers"), 57 cl::Hidden, cl::init(false)); 58 59 STATISTIC(NumOpenMPRuntimeCallsDeduplicated, 60 "Number of OpenMP runtime calls deduplicated"); 61 STATISTIC(NumOpenMPParallelRegionsDeleted, 62 "Number of OpenMP parallel regions deleted"); 63 STATISTIC(NumOpenMPRuntimeFunctionsIdentified, 64 "Number of OpenMP runtime functions identified"); 65 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified, 66 "Number of OpenMP runtime function uses identified"); 67 STATISTIC(NumOpenMPTargetRegionKernels, 68 "Number of OpenMP target region entry points (=kernels) identified"); 69 STATISTIC( 70 NumOpenMPParallelRegionsReplacedInGPUStateMachine, 71 "Number of OpenMP parallel regions replaced with ID in GPU state machines"); 72 STATISTIC(NumOpenMPParallelRegionsMerged, 73 "Number of OpenMP parallel regions merged"); 74 75 #if !defined(NDEBUG) 76 static constexpr auto TAG = "[" DEBUG_TYPE "]"; 77 #endif 78 79 namespace { 80 81 struct AAICVTracker; 82 83 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for 84 /// Attributor runs. 
85 struct OMPInformationCache : public InformationCache { 86 OMPInformationCache(Module &M, AnalysisGetter &AG, 87 BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, 88 SmallPtrSetImpl<Kernel> &Kernels) 89 : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), 90 Kernels(Kernels) { 91 92 OMPBuilder.initialize(); 93 initializeRuntimeFunctions(); 94 initializeInternalControlVars(); 95 } 96 97 /// Generic information that describes an internal control variable. 98 struct InternalControlVarInfo { 99 /// The kind, as described by InternalControlVar enum. 100 InternalControlVar Kind; 101 102 /// The name of the ICV. 103 StringRef Name; 104 105 /// Environment variable associated with this ICV. 106 StringRef EnvVarName; 107 108 /// Initial value kind. 109 ICVInitValue InitKind; 110 111 /// Initial value. 112 ConstantInt *InitValue; 113 114 /// Setter RTL function associated with this ICV. 115 RuntimeFunction Setter; 116 117 /// Getter RTL function associated with this ICV. 118 RuntimeFunction Getter; 119 120 /// RTL Function corresponding to the override clause of this ICV 121 RuntimeFunction Clause; 122 }; 123 124 /// Generic information that describes a runtime function 125 struct RuntimeFunctionInfo { 126 127 /// The kind, as described by the RuntimeFunction enum. 128 RuntimeFunction Kind; 129 130 /// The name of the function. 131 StringRef Name; 132 133 /// Flag to indicate a variadic function. 134 bool IsVarArg; 135 136 /// The return type of the function. 137 Type *ReturnType; 138 139 /// The argument types of the function. 140 SmallVector<Type *, 8> ArgumentTypes; 141 142 /// The declaration if available. 143 Function *Declaration = nullptr; 144 145 /// Uses of this runtime function per function containing the use. 146 using UseVector = SmallVector<Use *, 16>; 147 148 /// Clear UsesMap for runtime function. 149 void clearUsesMap() { UsesMap.clear(); } 150 151 /// Boolean conversion that is true if the runtime function was found. 152 operator bool() const { return Declaration; } 153 154 /// Return the vector of uses in function \p F. 155 UseVector &getOrCreateUseVector(Function *F) { 156 std::shared_ptr<UseVector> &UV = UsesMap[F]; 157 if (!UV) 158 UV = std::make_shared<UseVector>(); 159 return *UV; 160 } 161 162 /// Return the vector of uses in function \p F or `nullptr` if there are 163 /// none. 164 const UseVector *getUseVector(Function &F) const { 165 auto I = UsesMap.find(&F); 166 if (I != UsesMap.end()) 167 return I->second.get(); 168 return nullptr; 169 } 170 171 /// Return how many functions contain uses of this runtime function. 172 size_t getNumFunctionsWithUses() const { return UsesMap.size(); } 173 174 /// Return the number of arguments (or the minimal number for variadic 175 /// functions). 176 size_t getNumArgs() const { return ArgumentTypes.size(); } 177 178 /// Run the callback \p CB on each use and forget the use if the result is 179 /// true. The callback will be fed the function in which the use was 180 /// encountered as second argument. 181 void foreachUse(SmallVectorImpl<Function *> &SCC, 182 function_ref<bool(Use &, Function &)> CB) { 183 for (Function *F : SCC) 184 foreachUse(CB, F); 185 } 186 187 /// Run the callback \p CB on each use within the function \p F and forget 188 /// the use if the result is true. 
189 void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) { 190 SmallVector<unsigned, 8> ToBeDeleted; 191 ToBeDeleted.clear(); 192 193 unsigned Idx = 0; 194 UseVector &UV = getOrCreateUseVector(F); 195 196 for (Use *U : UV) { 197 if (CB(*U, *F)) 198 ToBeDeleted.push_back(Idx); 199 ++Idx; 200 } 201 202 // Remove the to-be-deleted indices in reverse order as prior 203 // modifications will not modify the smaller indices. 204 while (!ToBeDeleted.empty()) { 205 unsigned Idx = ToBeDeleted.pop_back_val(); 206 UV[Idx] = UV.back(); 207 UV.pop_back(); 208 } 209 } 210 211 private: 212 /// Map from functions to all uses of this runtime function contained in 213 /// them. 214 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap; 215 }; 216 217 /// An OpenMP-IR-Builder instance 218 OpenMPIRBuilder OMPBuilder; 219 220 /// Map from runtime function kind to the runtime function description. 221 EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction, 222 RuntimeFunction::OMPRTL___last> 223 RFIs; 224 225 /// Map from ICV kind to the ICV description. 226 EnumeratedArray<InternalControlVarInfo, InternalControlVar, 227 InternalControlVar::ICV___last> 228 ICVs; 229 230 /// Helper to initialize all internal control variable information for those 231 /// defined in OMPKinds.def. 232 void initializeInternalControlVars() { 233 #define ICV_RT_SET(_Name, RTL) \ 234 { \ 235 auto &ICV = ICVs[_Name]; \ 236 ICV.Setter = RTL; \ 237 } 238 #define ICV_RT_GET(Name, RTL) \ 239 { \ 240 auto &ICV = ICVs[Name]; \ 241 ICV.Getter = RTL; \ 242 } 243 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \ 244 { \ 245 auto &ICV = ICVs[Enum]; \ 246 ICV.Name = _Name; \ 247 ICV.Kind = Enum; \ 248 ICV.InitKind = Init; \ 249 ICV.EnvVarName = _EnvVarName; \ 250 switch (ICV.InitKind) { \ 251 case ICV_IMPLEMENTATION_DEFINED: \ 252 ICV.InitValue = nullptr; \ 253 break; \ 254 case ICV_ZERO: \ 255 ICV.InitValue = ConstantInt::get( \ 256 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \ 257 break; \ 258 case ICV_FALSE: \ 259 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \ 260 break; \ 261 case ICV_LAST: \ 262 break; \ 263 } \ 264 } 265 #include "llvm/Frontend/OpenMP/OMPKinds.def" 266 } 267 268 /// Returns true if the function declaration \p F matches the runtime 269 /// function types, that is, return type \p RTFRetType, and argument types 270 /// \p RTFArgTypes. 271 static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, 272 SmallVector<Type *, 8> &RTFArgTypes) { 273 // TODO: We should output information to the user (under debug output 274 // and via remarks). 275 276 if (!F) 277 return false; 278 if (F->getReturnType() != RTFRetType) 279 return false; 280 if (F->arg_size() != RTFArgTypes.size()) 281 return false; 282 283 auto RTFTyIt = RTFArgTypes.begin(); 284 for (Argument &Arg : F->args()) { 285 if (Arg.getType() != *RTFTyIt) 286 return false; 287 288 ++RTFTyIt; 289 } 290 291 return true; 292 } 293 294 // Helper to collect all uses of the declaration in the UsesMap. 295 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) { 296 unsigned NumUses = 0; 297 if (!RFI.Declaration) 298 return NumUses; 299 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration); 300 301 if (CollectStats) { 302 NumOpenMPRuntimeFunctionsIdentified += 1; 303 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses(); 304 } 305 306 // TODO: We directly convert uses into proper calls and unknown uses. 
307 for (Use &U : RFI.Declaration->uses()) { 308 if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { 309 if (ModuleSlice.count(UserI->getFunction())) { 310 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); 311 ++NumUses; 312 } 313 } else { 314 RFI.getOrCreateUseVector(nullptr).push_back(&U); 315 ++NumUses; 316 } 317 } 318 return NumUses; 319 } 320 321 // Helper function to recollect uses of a runtime function. 322 void recollectUsesForFunction(RuntimeFunction RTF) { 323 auto &RFI = RFIs[RTF]; 324 RFI.clearUsesMap(); 325 collectUses(RFI, /*CollectStats*/ false); 326 } 327 328 // Helper function to recollect uses of all runtime functions. 329 void recollectUses() { 330 for (int Idx = 0; Idx < RFIs.size(); ++Idx) 331 recollectUsesForFunction(static_cast<RuntimeFunction>(Idx)); 332 } 333 334 /// Helper to initialize all runtime function information for those defined 335 /// in OpenMPKinds.def. 336 void initializeRuntimeFunctions() { 337 Module &M = *((*ModuleSlice.begin())->getParent()); 338 339 // Helper macros for handling __VA_ARGS__ in OMP_RTL 340 #define OMP_TYPE(VarName, ...) \ 341 Type *VarName = OMPBuilder.VarName; \ 342 (void)VarName; 343 344 #define OMP_ARRAY_TYPE(VarName, ...) \ 345 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \ 346 (void)VarName##Ty; \ 347 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \ 348 (void)VarName##PtrTy; 349 350 #define OMP_FUNCTION_TYPE(VarName, ...) \ 351 FunctionType *VarName = OMPBuilder.VarName; \ 352 (void)VarName; \ 353 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 354 (void)VarName##Ptr; 355 356 #define OMP_STRUCT_TYPE(VarName, ...) \ 357 StructType *VarName = OMPBuilder.VarName; \ 358 (void)VarName; \ 359 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 360 (void)VarName##Ptr; 361 362 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \ 363 { \ 364 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \ 365 Function *F = M.getFunction(_Name); \ 366 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \ 367 auto &RFI = RFIs[_Enum]; \ 368 RFI.Kind = _Enum; \ 369 RFI.Name = _Name; \ 370 RFI.IsVarArg = _IsVarArg; \ 371 RFI.ReturnType = OMPBuilder._ReturnType; \ 372 RFI.ArgumentTypes = std::move(ArgsTypes); \ 373 RFI.Declaration = F; \ 374 unsigned NumUses = collectUses(RFI); \ 375 (void)NumUses; \ 376 LLVM_DEBUG({ \ 377 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \ 378 << " found\n"; \ 379 if (RFI.Declaration) \ 380 dbgs() << TAG << "-> got " << NumUses << " uses in " \ 381 << RFI.getNumFunctionsWithUses() \ 382 << " different functions.\n"; \ 383 }); \ 384 } \ 385 } 386 #include "llvm/Frontend/OpenMP/OMPKinds.def" 387 388 // TODO: We should attach the attributes defined in OMPKinds.def. 389 } 390 391 /// Collection of known kernels (\see Kernel) in the module. 392 SmallPtrSetImpl<Kernel> &Kernels; 393 }; 394 395 /// Used to map the values physically (in the IR) stored in an offload 396 /// array, to a vector in memory. 397 struct OffloadArray { 398 /// Physical array (in the IR). 399 AllocaInst *Array = nullptr; 400 /// Mapped values. 401 SmallVector<Value *, 8> StoredValues; 402 /// Last stores made in the offload array. 403 SmallVector<StoreInst *, 8> LastAccesses; 404 405 OffloadArray() = default; 406 407 /// Initializes the OffloadArray with the values stored in \p Array before 408 /// instruction \p Before is reached. Returns false if the initialization 409 /// fails. 410 /// This MUST be used immediately after the construction of the object. 
411 bool initialize(AllocaInst &Array, Instruction &Before) { 412 if (!Array.getAllocatedType()->isArrayTy()) 413 return false; 414 415 if (!getValues(Array, Before)) 416 return false; 417 418 this->Array = &Array; 419 return true; 420 } 421 422 static const unsigned DeviceIDArgNum = 1; 423 static const unsigned BasePtrsArgNum = 3; 424 static const unsigned PtrsArgNum = 4; 425 static const unsigned SizesArgNum = 5; 426 427 private: 428 /// Traverses the BasicBlock where \p Array is, collecting the stores made to 429 /// \p Array, leaving StoredValues with the values stored before the 430 /// instruction \p Before is reached. 431 bool getValues(AllocaInst &Array, Instruction &Before) { 432 // Initialize container. 433 const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements(); 434 StoredValues.assign(NumValues, nullptr); 435 LastAccesses.assign(NumValues, nullptr); 436 437 // TODO: This assumes the instruction \p Before is in the same 438 // BasicBlock as Array. Make it general, for any control flow graph. 439 BasicBlock *BB = Array.getParent(); 440 if (BB != Before.getParent()) 441 return false; 442 443 const DataLayout &DL = Array.getModule()->getDataLayout(); 444 const unsigned int PointerSize = DL.getPointerSize(); 445 446 for (Instruction &I : *BB) { 447 if (&I == &Before) 448 break; 449 450 if (!isa<StoreInst>(&I)) 451 continue; 452 453 auto *S = cast<StoreInst>(&I); 454 int64_t Offset = -1; 455 auto *Dst = 456 GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL); 457 if (Dst == &Array) { 458 int64_t Idx = Offset / PointerSize; 459 StoredValues[Idx] = getUnderlyingObject(S->getValueOperand()); 460 LastAccesses[Idx] = S; 461 } 462 } 463 464 return isFilled(); 465 } 466 467 /// Returns true if all values in StoredValues and 468 /// LastAccesses are not nullptrs. 469 bool isFilled() { 470 const unsigned NumValues = StoredValues.size(); 471 for (unsigned I = 0; I < NumValues; ++I) { 472 if (!StoredValues[I] || !LastAccesses[I]) 473 return false; 474 } 475 476 return true; 477 } 478 }; 479 480 struct OpenMPOpt { 481 482 using OptimizationRemarkGetter = 483 function_ref<OptimizationRemarkEmitter &(Function *)>; 484 485 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater, 486 OptimizationRemarkGetter OREGetter, 487 OMPInformationCache &OMPInfoCache, Attributor &A) 488 : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater), 489 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {} 490 491 /// Check if any remarks are enabled for openmp-opt 492 bool remarksEnabled() { 493 auto &Ctx = M.getContext(); 494 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE); 495 } 496 497 /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. 498 bool run() { 499 if (SCC.empty()) 500 return false; 501 502 bool Changed = false; 503 504 LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() 505 << " functions in a slice with " 506 << OMPInfoCache.ModuleSlice.size() << " functions\n"); 507 508 if (PrintICVValues) 509 printICVs(); 510 if (PrintOpenMPKernels) 511 printKernels(); 512 513 Changed |= rewriteDeviceCodeStateMachine(); 514 515 Changed |= runAttributor(); 516 517 // Recollect uses, in case Attributor deleted any. 
518 OMPInfoCache.recollectUses(); 519 520 Changed |= deleteParallelRegions(); 521 if (HideMemoryTransferLatency) 522 Changed |= hideMemTransfersLatency(); 523 if (remarksEnabled()) 524 analysisGlobalization(); 525 Changed |= deduplicateRuntimeCalls(); 526 if (EnableParallelRegionMerging) { 527 if (mergeParallelRegions()) { 528 deduplicateRuntimeCalls(); 529 Changed = true; 530 } 531 } 532 533 return Changed; 534 } 535 536 /// Print initial ICV values for testing. 537 /// FIXME: This should be done from the Attributor once it is added. 538 void printICVs() const { 539 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel, 540 ICV_proc_bind}; 541 542 for (Function *F : OMPInfoCache.ModuleSlice) { 543 for (auto ICV : ICVs) { 544 auto ICVInfo = OMPInfoCache.ICVs[ICV]; 545 auto Remark = [&](OptimizationRemark OR) { 546 return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name) 547 << " Value: " 548 << (ICVInfo.InitValue 549 ? ICVInfo.InitValue->getValue().toString(10, true) 550 : "IMPLEMENTATION_DEFINED"); 551 }; 552 553 emitRemarkOnFunction(F, "OpenMPICVTracker", Remark); 554 } 555 } 556 } 557 558 /// Print OpenMP GPU kernels for testing. 559 void printKernels() const { 560 for (Function *F : SCC) { 561 if (!OMPInfoCache.Kernels.count(F)) 562 continue; 563 564 auto Remark = [&](OptimizationRemark OR) { 565 return OR << "OpenMP GPU kernel " 566 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n"; 567 }; 568 569 emitRemarkOnFunction(F, "OpenMPGPU", Remark); 570 } 571 } 572 573 /// Return the call if \p U is a callee use in a regular call. If \p RFI is 574 /// given it has to be the callee or a nullptr is returned. 575 static CallInst *getCallIfRegularCall( 576 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 577 CallInst *CI = dyn_cast<CallInst>(U.getUser()); 578 if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && 579 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 580 return CI; 581 return nullptr; 582 } 583 584 /// Return the call if \p V is a regular call. If \p RFI is given it has to be 585 /// the callee or a nullptr is returned. 586 static CallInst *getCallIfRegularCall( 587 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 588 CallInst *CI = dyn_cast<CallInst>(&V); 589 if (CI && !CI->hasOperandBundles() && 590 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 591 return CI; 592 return nullptr; 593 } 594 595 private: 596 /// Merge parallel regions when it is safe. 597 bool mergeParallelRegions() { 598 const unsigned CallbackCalleeOperand = 2; 599 const unsigned CallbackFirstArgOperand = 3; 600 using InsertPointTy = OpenMPIRBuilder::InsertPointTy; 601 602 // Check if there are any __kmpc_fork_call calls to merge. 603 OMPInformationCache::RuntimeFunctionInfo &RFI = 604 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 605 606 if (!RFI.Declaration) 607 return false; 608 609 // Unmergable calls that prevent merging a parallel region. 
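    // Illustrative sketch of the merge performed below (added for exposition;
    // not from the original pass, value and callback names are made up). Two
    // fork calls in the same basic block, e.g.,
    //
    //   call void @__kmpc_fork_call(%struct.ident_t* @loc, i32 1,
    //                               void (i32*, i32*, ...)* @cb0, i8* %a)
    //   ; sequential code
    //   call void @__kmpc_fork_call(%struct.ident_t* @loc, i32 1,
    //                               void (i32*, i32*, ...)* @cb1, i8* %b)
    //
    // become, inside one merged parallel region, direct calls
    //
    //   call void @cb0(i32* %tid, i32* %bound.tid, i8* %a)
    //   call void @__kmpc_barrier(...)   ; replaces the implicit join barrier
    //   ; sequential code, guarded by a master construct plus a barrier
    //   call void @cb1(i32* %tid, i32* %bound.tid, i8* %b)
    //
    // The array right below lists runtime calls that, when they appear in
    // front of a region, prevent the merge.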
610 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = { 611 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind], 612 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads], 613 }; 614 615 bool Changed = false; 616 LoopInfo *LI = nullptr; 617 DominatorTree *DT = nullptr; 618 619 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap; 620 621 BasicBlock *StartBB = nullptr, *EndBB = nullptr; 622 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, 623 BasicBlock &ContinuationIP) { 624 BasicBlock *CGStartBB = CodeGenIP.getBlock(); 625 BasicBlock *CGEndBB = 626 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); 627 assert(StartBB != nullptr && "StartBB should not be null"); 628 CGStartBB->getTerminator()->setSuccessor(0, StartBB); 629 assert(EndBB != nullptr && "EndBB should not be null"); 630 EndBB->getTerminator()->setSuccessor(0, CGEndBB); 631 }; 632 633 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &, 634 Value &Inner, Value *&ReplacementValue) -> InsertPointTy { 635 ReplacementValue = &Inner; 636 return CodeGenIP; 637 }; 638 639 auto FiniCB = [&](InsertPointTy CodeGenIP) {}; 640 641 /// Create a sequential execution region within a merged parallel region, 642 /// encapsulated in a master construct with a barrier for synchronization. 643 auto CreateSequentialRegion = [&](Function *OuterFn, 644 BasicBlock *OuterPredBB, 645 Instruction *SeqStartI, 646 Instruction *SeqEndI) { 647 // Isolate the instructions of the sequential region to a separate 648 // block. 649 BasicBlock *ParentBB = SeqStartI->getParent(); 650 BasicBlock *SeqEndBB = 651 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI); 652 BasicBlock *SeqAfterBB = 653 SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI); 654 BasicBlock *SeqStartBB = 655 SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged"); 656 657 assert(ParentBB->getUniqueSuccessor() == SeqStartBB && 658 "Expected a different CFG"); 659 const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); 660 ParentBB->getTerminator()->eraseFromParent(); 661 662 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, 663 BasicBlock &ContinuationIP) { 664 BasicBlock *CGStartBB = CodeGenIP.getBlock(); 665 BasicBlock *CGEndBB = 666 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); 667 assert(SeqStartBB != nullptr && "SeqStartBB should not be null"); 668 CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB); 669 assert(SeqEndBB != nullptr && "SeqEndBB should not be null"); 670 SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB); 671 }; 672 auto FiniCB = [&](InsertPointTy CodeGenIP) {}; 673 674 // Find outputs from the sequential region to outside users and 675 // broadcast their values to them. 676 for (Instruction &I : *SeqStartBB) { 677 SmallPtrSet<Instruction *, 4> OutsideUsers; 678 for (User *Usr : I.users()) { 679 Instruction &UsrI = *cast<Instruction>(Usr); 680 // Ignore outputs to LT intrinsics, code extraction for the merged 681 // parallel region will fix them. 682 if (UsrI.isLifetimeStartOrEnd()) 683 continue; 684 685 if (UsrI.getParent() != SeqStartBB) 686 OutsideUsers.insert(&UsrI); 687 } 688 689 if (OutsideUsers.empty()) 690 continue; 691 692 // Emit an alloca in the outer region to store the broadcasted 693 // value. 
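        // Illustrative shape of the broadcast emitted below (value names are
        // made up; the ".seq.output" suffixes and "seq.par.merged" label come
        // from the code underneath):
        //
        //   entry:                      ; of the outer function
        //     %x.seq.output.alloc = alloca i32
        //   seq.par.merged:             ; the isolated sequential region
        //     %x = ...
        //     store i32 %x, i32* %x.seq.output.alloc
        //   <block of an outside user>:
        //     %x.seq.output.load = load i32, i32* %x.seq.output.alloc
        //     ; the user now consumes %x.seq.output.load instead of %x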
694 const DataLayout &DL = M.getDataLayout(); 695 AllocaInst *AllocaI = new AllocaInst( 696 I.getType(), DL.getAllocaAddrSpace(), nullptr, 697 I.getName() + ".seq.output.alloc", &OuterFn->front().front()); 698 699 // Emit a store instruction in the sequential BB to update the 700 // value. 701 new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()); 702 703 // Emit a load instruction and replace the use of the output value 704 // with it. 705 for (Instruction *UsrI : OutsideUsers) { 706 LoadInst *LoadI = new LoadInst(I.getType(), AllocaI, 707 I.getName() + ".seq.output.load", UsrI); 708 UsrI->replaceUsesOfWith(&I, LoadI); 709 } 710 } 711 712 OpenMPIRBuilder::LocationDescription Loc( 713 InsertPointTy(ParentBB, ParentBB->end()), DL); 714 InsertPointTy SeqAfterIP = 715 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB); 716 717 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel); 718 719 BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock()); 720 721 LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn 722 << "\n"); 723 }; 724 725 // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all 726 // contained in BB and only separated by instructions that can be 727 // redundantly executed in parallel. The block BB is split before the first 728 // call (in MergableCIs) and after the last so the entire region we merge 729 // into a single parallel region is contained in a single basic block 730 // without any other instructions. We use the OpenMPIRBuilder to outline 731 // that block and call the resulting function via __kmpc_fork_call. 732 auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) { 733 // TODO: Change the interface to allow single CIs expanded, e.g, to 734 // include an outer loop. 735 assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); 736 737 auto Remark = [&](OptimizationRemark OR) { 738 OR << "Parallel region at " 739 << ore::NV("OpenMPParallelMergeFront", 740 MergableCIs.front()->getDebugLoc()) 741 << " merged with parallel regions at "; 742 for (auto *CI : 743 llvm::make_range(MergableCIs.begin() + 1, MergableCIs.end())) { 744 OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()); 745 if (CI != MergableCIs.back()) 746 OR << ", "; 747 } 748 return OR; 749 }; 750 751 emitRemark<OptimizationRemark>(MergableCIs.front(), 752 "OpenMPParallelRegionMerging", Remark); 753 754 Function *OriginalFn = BB->getParent(); 755 LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size() 756 << " parallel regions in " << OriginalFn->getName() 757 << "\n"); 758 759 // Isolate the calls to merge in a separate block. 760 EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI); 761 BasicBlock *AfterBB = 762 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI); 763 StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, 764 "omp.par.merged"); 765 766 assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG"); 767 const DebugLoc DL = BB->getTerminator()->getDebugLoc(); 768 BB->getTerminator()->eraseFromParent(); 769 770 // Create sequential regions for sequential instructions that are 771 // in-between mergable parallel regions. 772 for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1; 773 It != End; ++It) { 774 Instruction *ForkCI = *It; 775 Instruction *NextForkCI = *(It + 1); 776 777 // Continue if there are not in-between instructions. 
778 if (ForkCI->getNextNode() == NextForkCI) 779 continue; 780 781 CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(), 782 NextForkCI->getPrevNode()); 783 } 784 785 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()), 786 DL); 787 IRBuilder<>::InsertPoint AllocaIP( 788 &OriginalFn->getEntryBlock(), 789 OriginalFn->getEntryBlock().getFirstInsertionPt()); 790 // Create the merged parallel region with default proc binding, to 791 // avoid overriding binding settings, and without explicit cancellation. 792 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel( 793 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, 794 OMP_PROC_BIND_default, /* IsCancellable */ false); 795 BranchInst::Create(AfterBB, AfterIP.getBlock()); 796 797 // Perform the actual outlining. 798 OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true); 799 800 Function *OutlinedFn = MergableCIs.front()->getCaller(); 801 802 // Replace the __kmpc_fork_call calls with direct calls to the outlined 803 // callbacks. 804 SmallVector<Value *, 8> Args; 805 for (auto *CI : MergableCIs) { 806 Value *Callee = 807 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); 808 FunctionType *FT = 809 cast<FunctionType>(Callee->getType()->getPointerElementType()); 810 Args.clear(); 811 Args.push_back(OutlinedFn->getArg(0)); 812 Args.push_back(OutlinedFn->getArg(1)); 813 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 814 U < E; ++U) 815 Args.push_back(CI->getArgOperand(U)); 816 817 CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); 818 if (CI->getDebugLoc()) 819 NewCI->setDebugLoc(CI->getDebugLoc()); 820 821 // Forward parameter attributes from the callback to the callee. 822 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 823 U < E; ++U) 824 for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) 825 NewCI->addParamAttr( 826 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); 827 828 // Emit an explicit barrier to replace the implicit fork-join barrier. 829 if (CI != MergableCIs.back()) { 830 // TODO: Remove barrier if the merged parallel region includes the 831 // 'nowait' clause. 832 OMPInfoCache.OMPBuilder.createBarrier( 833 InsertPointTy(NewCI->getParent(), 834 NewCI->getNextNode()->getIterator()), 835 OMPD_parallel); 836 } 837 838 auto Remark = [&](OptimizationRemark OR) { 839 return OR << "Parallel region at " 840 << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()) 841 << " merged with " 842 << ore::NV("OpenMPParallelMergeFront", 843 MergableCIs.front()->getDebugLoc()); 844 }; 845 if (CI != MergableCIs.front()) 846 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging", 847 Remark); 848 849 CI->eraseFromParent(); 850 } 851 852 assert(OutlinedFn != OriginalFn && "Outlining failed"); 853 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn); 854 CGUpdater.reanalyzeFunction(*OriginalFn); 855 856 NumOpenMPParallelRegionsMerged += MergableCIs.size(); 857 858 return true; 859 }; 860 861 // Helper function that identifes sequences of 862 // __kmpc_fork_call uses in a basic block. 
863 auto DetectPRsCB = [&](Use &U, Function &F) { 864 CallInst *CI = getCallIfRegularCall(U, &RFI); 865 BB2PRMap[CI->getParent()].insert(CI); 866 867 return false; 868 }; 869 870 BB2PRMap.clear(); 871 RFI.foreachUse(SCC, DetectPRsCB); 872 SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector; 873 // Find mergable parallel regions within a basic block that are 874 // safe to merge, that is any in-between instructions can safely 875 // execute in parallel after merging. 876 // TODO: support merging across basic-blocks. 877 for (auto &It : BB2PRMap) { 878 auto &CIs = It.getSecond(); 879 if (CIs.size() < 2) 880 continue; 881 882 BasicBlock *BB = It.getFirst(); 883 SmallVector<CallInst *, 4> MergableCIs; 884 885 /// Returns true if the instruction is mergable, false otherwise. 886 /// A terminator instruction is unmergable by definition since merging 887 /// works within a BB. Instructions before the mergable region are 888 /// mergable if they are not calls to OpenMP runtime functions that may 889 /// set different execution parameters for subsequent parallel regions. 890 /// Instructions in-between parallel regions are mergable if they are not 891 /// calls to any non-intrinsic function since that may call a non-mergable 892 /// OpenMP runtime function. 893 auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) { 894 // We do not merge across BBs, hence return false (unmergable) if the 895 // instruction is a terminator. 896 if (I.isTerminator()) 897 return false; 898 899 if (!isa<CallInst>(&I)) 900 return true; 901 902 CallInst *CI = cast<CallInst>(&I); 903 if (IsBeforeMergableRegion) { 904 Function *CalledFunction = CI->getCalledFunction(); 905 if (!CalledFunction) 906 return false; 907 // Return false (unmergable) if the call before the parallel 908 // region calls an explicit affinity (proc_bind) or number of 909 // threads (num_threads) compiler-generated function. Those settings 910 // may be incompatible with following parallel regions. 911 // TODO: ICV tracking to detect compatibility. 912 for (const auto &RFI : UnmergableCallsInfo) { 913 if (CalledFunction == RFI.Declaration) 914 return false; 915 } 916 } else { 917 // Return false (unmergable) if there is a call instruction 918 // in-between parallel regions when it is not an intrinsic. It 919 // may call an unmergable OpenMP runtime function in its callpath. 920 // TODO: Keep track of possible OpenMP calls in the callpath. 921 if (!isa<IntrinsicInst>(CI)) 922 return false; 923 } 924 925 return true; 926 }; 927 // Find maximal number of parallel region CIs that are safe to merge. 928 for (auto It = BB->begin(), End = BB->end(); It != End;) { 929 Instruction &I = *It; 930 ++It; 931 932 if (CIs.count(&I)) { 933 MergableCIs.push_back(cast<CallInst>(&I)); 934 continue; 935 } 936 937 // Continue expanding if the instruction is mergable. 938 if (IsMergable(I, MergableCIs.empty())) 939 continue; 940 941 // Forward the instruction iterator to skip the next parallel region 942 // since there is an unmergable instruction which can affect it. 943 for (; It != End; ++It) { 944 Instruction &SkipI = *It; 945 if (CIs.count(&SkipI)) { 946 LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI 947 << " due to " << I << "\n"); 948 ++It; 949 break; 950 } 951 } 952 953 // Store mergable regions found. 
954 if (MergableCIs.size() > 1) { 955 MergableCIsVector.push_back(MergableCIs); 956 LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size() 957 << " parallel regions in block " << BB->getName() 958 << " of function " << BB->getParent()->getName() 959 << "\n";); 960 } 961 962 MergableCIs.clear(); 963 } 964 965 if (!MergableCIsVector.empty()) { 966 Changed = true; 967 968 for (auto &MergableCIs : MergableCIsVector) 969 Merge(MergableCIs, BB); 970 } 971 } 972 973 if (Changed) { 974 /// Re-collect use for fork calls, emitted barrier calls, and 975 /// any emitted master/end_master calls. 976 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call); 977 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier); 978 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master); 979 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master); 980 } 981 982 return Changed; 983 } 984 985 /// Try to delete parallel regions if possible. 986 bool deleteParallelRegions() { 987 const unsigned CallbackCalleeOperand = 2; 988 989 OMPInformationCache::RuntimeFunctionInfo &RFI = 990 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 991 992 if (!RFI.Declaration) 993 return false; 994 995 bool Changed = false; 996 auto DeleteCallCB = [&](Use &U, Function &) { 997 CallInst *CI = getCallIfRegularCall(U); 998 if (!CI) 999 return false; 1000 auto *Fn = dyn_cast<Function>( 1001 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts()); 1002 if (!Fn) 1003 return false; 1004 if (!Fn->onlyReadsMemory()) 1005 return false; 1006 if (!Fn->hasFnAttribute(Attribute::WillReturn)) 1007 return false; 1008 1009 LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in " 1010 << CI->getCaller()->getName() << "\n"); 1011 1012 auto Remark = [&](OptimizationRemark OR) { 1013 return OR << "Parallel region in " 1014 << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName()) 1015 << " deleted"; 1016 }; 1017 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion", 1018 Remark); 1019 1020 CGUpdater.removeCallSite(*CI); 1021 CI->eraseFromParent(); 1022 Changed = true; 1023 ++NumOpenMPParallelRegionsDeleted; 1024 return true; 1025 }; 1026 1027 RFI.foreachUse(SCC, DeleteCallCB); 1028 1029 return Changed; 1030 } 1031 1032 /// Try to eliminate runtime calls by reusing existing ones. 1033 bool deduplicateRuntimeCalls() { 1034 bool Changed = false; 1035 1036 RuntimeFunction DeduplicableRuntimeCallIDs[] = { 1037 OMPRTL_omp_get_num_threads, 1038 OMPRTL_omp_in_parallel, 1039 OMPRTL_omp_get_cancellation, 1040 OMPRTL_omp_get_thread_limit, 1041 OMPRTL_omp_get_supported_active_levels, 1042 OMPRTL_omp_get_level, 1043 OMPRTL_omp_get_ancestor_thread_num, 1044 OMPRTL_omp_get_team_size, 1045 OMPRTL_omp_get_active_level, 1046 OMPRTL_omp_in_final, 1047 OMPRTL_omp_get_proc_bind, 1048 OMPRTL_omp_get_num_places, 1049 OMPRTL_omp_get_num_procs, 1050 OMPRTL_omp_get_place_num, 1051 OMPRTL_omp_get_partition_num_places, 1052 OMPRTL_omp_get_partition_place_nums}; 1053 1054 // Global-tid is handled separately. 1055 SmallSetVector<Value *, 16> GTIdArgs; 1056 collectGlobalThreadIdArguments(GTIdArgs); 1057 LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size() 1058 << " global thread ID arguments\n"); 1059 1060 for (Function *F : SCC) { 1061 for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) 1062 Changed |= deduplicateRuntimeCalls( 1063 *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); 1064 1065 // __kmpc_global_thread_num is special as we can replace it with an 1066 // argument in enough cases to make it worth trying. 
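      // Illustrative example (added for exposition; names are made up): for an
      // internal helper that is always passed a global thread id,
      //
      //   define internal void @helper(i32 %gtid, ...) {
      //     %tid = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
      //     ...
      //   }
      //
      // the call can be replaced by the %gtid argument, which
      // collectGlobalThreadIdArguments() below identifies as a GTId argument.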
1067 Value *GTIdArg = nullptr; 1068 for (Argument &Arg : F->args()) 1069 if (GTIdArgs.count(&Arg)) { 1070 GTIdArg = &Arg; 1071 break; 1072 } 1073 Changed |= deduplicateRuntimeCalls( 1074 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg); 1075 } 1076 1077 return Changed; 1078 } 1079 1080 /// Tries to hide the latency of runtime calls that involve host to 1081 /// device memory transfers by splitting them into their "issue" and "wait" 1082 /// versions. The "issue" is moved upwards as much as possible. The "wait" is 1083 /// moved downards as much as possible. The "issue" issues the memory transfer 1084 /// asynchronously, returning a handle. The "wait" waits in the returned 1085 /// handle for the memory transfer to finish. 1086 bool hideMemTransfersLatency() { 1087 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper]; 1088 bool Changed = false; 1089 auto SplitMemTransfers = [&](Use &U, Function &Decl) { 1090 auto *RTCall = getCallIfRegularCall(U, &RFI); 1091 if (!RTCall) 1092 return false; 1093 1094 OffloadArray OffloadArrays[3]; 1095 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays)) 1096 return false; 1097 1098 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays)); 1099 1100 // TODO: Check if can be moved upwards. 1101 bool WasSplit = false; 1102 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall); 1103 if (WaitMovementPoint) 1104 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint); 1105 1106 Changed |= WasSplit; 1107 return WasSplit; 1108 }; 1109 RFI.foreachUse(SCC, SplitMemTransfers); 1110 1111 return Changed; 1112 } 1113 1114 void analysisGlobalization() { 1115 RuntimeFunction GlobalizationRuntimeIDs[] = { 1116 OMPRTL___kmpc_data_sharing_coalesced_push_stack, 1117 OMPRTL___kmpc_data_sharing_push_stack}; 1118 1119 for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { 1120 auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; 1121 1122 auto CheckGlobalization = [&](Use &U, Function &Decl) { 1123 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) { 1124 auto Remark = [&](OptimizationRemarkAnalysis ORA) { 1125 return ORA 1126 << "Found thread data sharing on the GPU. " 1127 << "Expect degraded performance due to data globalization."; 1128 }; 1129 emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization", 1130 Remark); 1131 } 1132 1133 return false; 1134 }; 1135 1136 RFI.foreachUse(SCC, CheckGlobalization); 1137 } 1138 return; 1139 } 1140 1141 /// Maps the values stored in the offload arrays passed as arguments to 1142 /// \p RuntimeCall into the offload arrays in \p OAs. 1143 bool getValuesInOffloadArrays(CallInst &RuntimeCall, 1144 MutableArrayRef<OffloadArray> OAs) { 1145 assert(OAs.size() == 3 && "Need space for three offload arrays!"); 1146 1147 // A runtime call that involves memory offloading looks something like: 1148 // call void @__tgt_target_data_begin_mapper(arg0, arg1, 1149 // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes, 1150 // ...) 1151 // So, the idea is to access the allocas that allocate space for these 1152 // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes. 1153 // Therefore: 1154 // i8** %offload_baseptrs. 1155 Value *BasePtrsArg = 1156 RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum); 1157 // i8** %offload_ptrs. 1158 Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum); 1159 // i8** %offload_sizes. 1160 Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum); 1161 1162 // Get values stored in **offload_baseptrs. 
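    // Illustrative IR shape this walk expects (a sketch, names are made up):
    // each argument is derived from an alloca filled with stores before the
    // mapper call,
    //
    //   %offload_baseptrs = alloca [2 x i8*]
    //   %gep = getelementptr inbounds [2 x i8*],
    //                                 [2 x i8*]* %offload_baseptrs, i32 0, i32 0
    //   store i8* %base0, i8** %gep
    //   ...
    //
    // getUnderlyingObject() strips casts/GEPs on the call argument to reach
    // the alloca, and OffloadArray::initialize() records the last store into
    // each slot.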
    auto *V = getUnderlyingObject(BasePtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *BasePtrsArray = cast<AllocaInst>(V);
    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_ptrs.
    V = getUnderlyingObject(PtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *PtrsArray = cast<AllocaInst>(V);
    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_sizes.
    V = getUnderlyingObject(SizesArg);
    // If it's a [constant] global array, don't analyze it.
    if (isa<GlobalValue>(V))
      return isa<Constant>(V);
    if (!isa<AllocaInst>(V))
      return false;

    auto *SizesArray = cast<AllocaInst>(V);
    if (!OAs[2].initialize(*SizesArray, RuntimeCall))
      return false;

    return true;
  }

  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
  /// For now this is a way to test that the function getValuesInOffloadArrays
  /// is working properly.
  /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
  void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
    std::string ValuesStr;
    raw_string_ostream Printer(ValuesStr);
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
      BP->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *P : OAs[1].StoredValues) {
      P->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *S : OAs[2].StoredValues) {
      S->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
  }

  /// Returns the instruction where the "wait" counterpart of \p RuntimeCall
  /// can be moved. Returns nullptr if the movement is not possible, or not
  /// worth it.
  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
    //        Make it traverse the CFG.

    Instruction *CurrentI = &RuntimeCall;
    bool IsWorthIt = false;
    while ((CurrentI = CurrentI->getNextNode())) {

      // TODO: Once we detect the regions to be offloaded we should use the
      //       alias analysis manager to check if CurrentI may modify one of
      //       the offloaded regions.
      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
        if (IsWorthIt)
          return CurrentI;

        return nullptr;
      }

      // FIXME: For now, moving the call over anything without side effects is
      //        considered worth it.
      IsWorthIt = true;
    }

    // Return end of BasicBlock.
    return RuntimeCall.getParent()->getTerminator();
  }

  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                               Instruction &WaitMovementPoint) {
    // Create a stack-allocated handle (__tgt_async_info) at the beginning of
    // the function. It stores information about the asynchronous transfer,
    // allowing us to wait on it later.
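    // Illustrative sketch of the split performed below (added for exposition;
    // value names are made up):
    //
    //   %handle = alloca %struct.__tgt_async_info       ; in the entry block
    //   ...
    //   call void @__tgt_target_data_begin_mapper_issue(<original args>,
    //                                     %struct.__tgt_async_info* %handle)
    //   ... independent code the transfer can overlap with ...
    //   call void @__tgt_target_data_begin_mapper_wait(i64 %device_id,
    //                                     %struct.__tgt_async_info* %handle)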
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
    auto *F = RuntimeCall.getCaller();
    Instruction *FirstInst = &(F->getEntryBlock().front());
    AllocaInst *Handle = new AllocaInst(
        IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);

    // Add "issue" runtime call declaration:
    // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
    //   i8**, i8**, i64*, i64*)
    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    // Replace the RuntimeCall call site with its asynchronous version.
    SmallVector<Value *, 16> Args;
    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    CallInst *IssueCallsite =
        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
    RuntimeCall.eraseFromParent();

    // Add "wait" runtime call declaration:
    // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
        IssueCallsite->getArgOperand(
            OffloadArray::DeviceIDArgNum), // device_id.
        Handle                             // handle to wait on.
    };
    CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);

    return true;
  }

  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    // TODO: Figure out how to actually combine multiple debug locations. For
    //       now we just keep an existing one if there is a single choice.
    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;
      return NextIdent;
    }
    return nullptr;
  }

  /// Return a `struct ident_t*` value that represents the ones used in the
  /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
  /// return a local `struct ident_t*`. For now, if we cannot find a suitable
  /// return value we create one from scratch. We also do not yet combine
  /// information, e.g., the source locations, see combinedIdentStruct.
  Value *
  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
                                 Function &F, bool GlobalOnly) {
    bool SingleChoice = true;
    Value *Ident = nullptr;
    auto CombineIdentStruct = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
        return false;
      Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
                                  /* GlobalOnly */ true, SingleChoice);
      return false;
    };
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      // The IRBuilder uses the insertion block to get to the module; this is
      // unfortunate but we work around it for now.
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
        OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
            &F.getEntryBlock(), F.getEntryBlock().begin()));
      // Create a fallback location if none was found.
      // TODO: Use the debug locations of the calls instead.
      Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
    }
    return Ident;
  }

  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
  /// \p ReplVal if given.
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)
      return false;

    LLVM_DEBUG(
        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    // TODO: Use dominance to find a good position instead.
    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.getNumArgOperands();
      if (NumArgs == 0)
        return true;
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
        return false;
      for (unsigned u = 1; u < NumArgs; ++u)
        if (isa<Instruction>(CB.getArgOperand(u)))
          return false;
      return true;
    };

    if (!ReplVal) {
      for (Use *U : *UV)
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (!CanBeMoved(*CI))
            continue;

          auto Remark = [&](OptimizationRemark OR) {
            auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
            return OR << "OpenMP runtime call "
                      << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
                      << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
          };
          emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);

          CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
          ReplVal = CI;
          break;
        }
      if (!ReplVal)
        return false;
    }

    // If we use a call as a replacement value we need to make sure the ident
    // is valid at the new location. For now we just pick a global one, either
    // existing and used by one of the calls, or created from scratch.
    if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
      if (CI->getNumArgOperands() > 0 &&
          CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
        Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
                                                      /* GlobalOnly */ true);
        CI->setArgOperand(0, Ident);
      }
    }

    bool Changed = false;
    auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)
        return false;
      assert(CI->getCaller() == &F && "Unexpected call!");

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
      };
      emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);

      CGUpdater.removeCallSite(*CI);
      CI->replaceAllUsesWith(ReplVal);
      CI->eraseFromParent();
      ++NumOpenMPRuntimeCallsDeduplicated;
      Changed = true;
      return true;
    };
    RFI.foreachUse(SCC, ReplaceAndDeleteCB);

    return Changed;
  }

  /// Collect arguments that represent the global thread id in \p GTIdArgs.
  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
    // TODO: Below we basically perform a fixpoint iteration with a pessimistic
    //       initialization.
    //       We could define an AbstractAttribute instead and run the
    //       Attributor here once it can be run as an SCC pass.

    // Helper to check the argument \p ArgNo at all call sites of \p F for
    // a GTId.
    auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
      if (!F.hasLocalLinkage())
        return false;
      for (Use &U : F.uses()) {
        if (CallInst *CI = getCallIfRegularCall(U)) {
          Value *ArgOp = CI->getArgOperand(ArgNo);
          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
              getCallIfRegularCall(
                  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
            continue;
        }
        return false;
      }
      return true;
    };

    // Helper to identify uses of a GTId as GTId arguments.
    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
        if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
          if (CI->isArgOperand(&U))
            if (Function *Callee = CI->getCalledFunction())
              if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
                GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
    };

    // The argument users of __kmpc_global_thread_num calls are GTIds.
    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
        AddUserArgs(*CI);
      return false;
    });

    // Transitively search for more arguments by looking at the users of the
    // ones we know already. During the search the GTIdArgs vector is extended,
    // so we can neither cache the size nor use a range-based for loop.
    for (unsigned u = 0; u < GTIdArgs.size(); ++u)
      AddUserArgs(*GTIdArgs[u]);
  }

  /// Kernel (=GPU) optimizations and utility functions
  ///
  ///{{

  /// Check if \p F is a kernel, hence an entry point for target offloading.
  bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }

  /// Cache to remember the unique kernel for a function.
  DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;

  /// Find the unique kernel that will execute \p F, if any.
  Kernel getUniqueKernelFor(Function &F);

  /// Find the unique kernel that will execute \p I, if any.
  Kernel getUniqueKernelFor(Instruction &I) {
    return getUniqueKernelFor(*I.getFunction());
  }

  /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
  /// the cases where we can avoid taking the address of a function.
  bool rewriteDeviceCodeStateMachine();

  ///
  ///}}

  /// Emit a remark generically.
  ///
  /// This template function can be used to generically emit a remark. The
  /// RemarkKind should be one of the following:
  ///   - OptimizationRemark to indicate a successful optimization attempt
  ///   - OptimizationRemarkMissed to report a failed optimization attempt
  ///   - OptimizationRemarkAnalysis to provide additional information about an
  ///     optimization attempt
  ///
  /// The remark is built using a callback function provided by the caller that
  /// takes a RemarkKind as input and returns a RemarkKind.
1522 template <typename RemarkKind, 1523 typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>> 1524 void emitRemark(Instruction *Inst, StringRef RemarkName, 1525 RemarkCallBack &&RemarkCB) const { 1526 Function *F = Inst->getParent()->getParent(); 1527 auto &ORE = OREGetter(F); 1528 1529 ORE.emit( 1530 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); }); 1531 } 1532 1533 /// Emit a remark on a function. Since only OptimizationRemark is supporting 1534 /// this, it can't be made generic. 1535 void 1536 emitRemarkOnFunction(Function *F, StringRef RemarkName, 1537 function_ref<OptimizationRemark(OptimizationRemark &&)> 1538 &&RemarkCB) const { 1539 auto &ORE = OREGetter(F); 1540 1541 ORE.emit([&]() { 1542 return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F)); 1543 }); 1544 } 1545 1546 /// The underlying module. 1547 Module &M; 1548 1549 /// The SCC we are operating on. 1550 SmallVectorImpl<Function *> &SCC; 1551 1552 /// Callback to update the call graph, the first argument is a removed call, 1553 /// the second an optional replacement call. 1554 CallGraphUpdater &CGUpdater; 1555 1556 /// Callback to get an OptimizationRemarkEmitter from a Function * 1557 OptimizationRemarkGetter OREGetter; 1558 1559 /// OpenMP-specific information cache. Also Used for Attributor runs. 1560 OMPInformationCache &OMPInfoCache; 1561 1562 /// Attributor instance. 1563 Attributor &A; 1564 1565 /// Helper function to run Attributor on SCC. 1566 bool runAttributor() { 1567 if (SCC.empty()) 1568 return false; 1569 1570 registerAAs(); 1571 1572 ChangeStatus Changed = A.run(); 1573 1574 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size() 1575 << " functions, result: " << Changed << ".\n"); 1576 1577 return Changed == ChangeStatus::CHANGED; 1578 } 1579 1580 /// Populate the Attributor with abstract attribute opportunities in the 1581 /// function. 1582 void registerAAs() { 1583 if (SCC.empty()) 1584 return; 1585 1586 // Create CallSite AA for all Getters. 1587 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) { 1588 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)]; 1589 1590 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter]; 1591 1592 auto CreateAA = [&](Use &U, Function &Caller) { 1593 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI); 1594 if (!CI) 1595 return false; 1596 1597 auto &CB = cast<CallBase>(*CI); 1598 1599 IRPosition CBPos = IRPosition::callsite_function(CB); 1600 A.getOrCreateAAFor<AAICVTracker>(CBPos); 1601 return false; 1602 }; 1603 1604 GetterRFI.foreachUse(SCC, CreateAA); 1605 } 1606 } 1607 }; 1608 1609 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { 1610 if (!OMPInfoCache.ModuleSlice.count(&F)) 1611 return nullptr; 1612 1613 // Use a scope to keep the lifetime of the CachedKernel short. 1614 { 1615 Optional<Kernel> &CachedKernel = UniqueKernelMap[&F]; 1616 if (CachedKernel) 1617 return *CachedKernel; 1618 1619 // TODO: We should use an AA to create an (optimistic and callback 1620 // call-aware) call graph. For now we stick to simple patterns that 1621 // are less powerful, basically the worst fixpoint. 
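    // Illustrative uses that still allow a unique kernel to be found for F
    // (see GetUniqueKernelForUse below; names are made up):
    //   - a direct call:      call void @F(i32* %a, i32* %b)
    //   - an equality check:  %c = icmp eq i8* %work_fn,
    //                                        bitcast (void ()* @F to i8*)
    //   - passing @F to __kmpc_kernel_prepare_parallel
    // Any other kind of use means no unique kernel can be determined.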
1622 if (isKernel(F)) { 1623 CachedKernel = Kernel(&F); 1624 return *CachedKernel; 1625 } 1626 1627 CachedKernel = nullptr; 1628 if (!F.hasLocalLinkage()) { 1629 1630 // See https://openmp.llvm.org/remarks/OptimizationRemarks.html 1631 auto Remark = [&](OptimizationRemark OR) { 1632 return OR << "[OMP100] Potentially unknown OpenMP target region caller"; 1633 }; 1634 emitRemarkOnFunction(&F, "OMP100", Remark); 1635 1636 return nullptr; 1637 } 1638 } 1639 1640 auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel { 1641 if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) { 1642 // Allow use in equality comparisons. 1643 if (Cmp->isEquality()) 1644 return getUniqueKernelFor(*Cmp); 1645 return nullptr; 1646 } 1647 if (auto *CB = dyn_cast<CallBase>(U.getUser())) { 1648 // Allow direct calls. 1649 if (CB->isCallee(&U)) 1650 return getUniqueKernelFor(*CB); 1651 // Allow the use in __kmpc_kernel_prepare_parallel calls. 1652 if (Function *Callee = CB->getCalledFunction()) 1653 if (Callee->getName() == "__kmpc_kernel_prepare_parallel") 1654 return getUniqueKernelFor(*CB); 1655 return nullptr; 1656 } 1657 // Disallow every other use. 1658 return nullptr; 1659 }; 1660 1661 // TODO: In the future we want to track more than just a unique kernel. 1662 SmallPtrSet<Kernel, 2> PotentialKernels; 1663 OMPInformationCache::foreachUse(F, [&](const Use &U) { 1664 PotentialKernels.insert(GetUniqueKernelForUse(U)); 1665 }); 1666 1667 Kernel K = nullptr; 1668 if (PotentialKernels.size() == 1) 1669 K = *PotentialKernels.begin(); 1670 1671 // Cache the result. 1672 UniqueKernelMap[&F] = K; 1673 1674 return K; 1675 } 1676 1677 bool OpenMPOpt::rewriteDeviceCodeStateMachine() { 1678 OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI = 1679 OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel]; 1680 1681 bool Changed = false; 1682 if (!KernelPrepareParallelRFI) 1683 return Changed; 1684 1685 for (Function *F : SCC) { 1686 1687 // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at 1688 // all. 1689 bool UnknownUse = false; 1690 bool KernelPrepareUse = false; 1691 unsigned NumDirectCalls = 0; 1692 1693 SmallVector<Use *, 2> ToBeReplacedStateMachineUses; 1694 OMPInformationCache::foreachUse(*F, [&](Use &U) { 1695 if (auto *CB = dyn_cast<CallBase>(U.getUser())) 1696 if (CB->isCallee(&U)) { 1697 ++NumDirectCalls; 1698 return; 1699 } 1700 1701 if (isa<ICmpInst>(U.getUser())) { 1702 ToBeReplacedStateMachineUses.push_back(&U); 1703 return; 1704 } 1705 if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall( 1706 *U.getUser(), &KernelPrepareParallelRFI)) { 1707 KernelPrepareUse = true; 1708 ToBeReplacedStateMachineUses.push_back(&U); 1709 return; 1710 } 1711 UnknownUse = true; 1712 }); 1713 1714 // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel 1715 // use. 1716 if (!KernelPrepareUse) 1717 continue; 1718 1719 { 1720 auto Remark = [&](OptimizationRemark OR) { 1721 return OR << "Found a parallel region that is called in a target " 1722 "region but not part of a combined target construct nor " 1723 "nesed inside a target construct without intermediate " 1724 "code. This can lead to excessive register usage for " 1725 "unrelated target regions in the same translation unit " 1726 "due to spurious call edges assumed by ptxas."; 1727 }; 1728 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark); 1729 } 1730 1731 // If this ever hits, we should investigate. 1732 // TODO: Checking the number of uses is not a necessary restriction and 1733 // should be lifted. 
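    // The pattern this expects (a sketch, names are made up): one direct call
    // to F from the state machine in the kernel, plus exactly two other uses
    // collected above, the pointer equality check
    //   %eq = icmp eq i8* %work_fn, bitcast (void (...)* @F to i8*)
    // and the argument to __kmpc_kernel_prepare_parallel. Those two uses are
    // the ones replaced by the @F.ID global further down.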
    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() != 2) {
      {
        auto Remark = [&](OptimizationRemark OR) {
          return OR << "Parallel region is used in "
                    << (UnknownUse ? "unknown" : "unexpected")
                    << " ways; will not attempt to rewrite the state machine.";
        };
        emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
      }
      continue;
    }

    // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
    // up if the function is not called from a unique kernel.
    Kernel K = getUniqueKernelFor(*F);
    if (!K) {
      {
        auto Remark = [&](OptimizationRemark OR) {
          return OR << "Parallel region is not known to be called from a "
                       "unique single target region; maybe the surrounding "
                       "function has external linkage? Will not attempt to "
                       "rewrite the state machine use.";
        };
        emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
                             Remark);
      }
      continue;
    }

    // We now know F is a parallel body function called only from the kernel K.
    // We also identified the state machine uses in which we replace the
    // function pointer with a new global symbol for identification purposes.
    // This ensures only direct calls to the function are left.

    {
      auto RemarkParallelRegion = [&](OptimizationRemark OR) {
        return OR << "Specialize a parallel region that is only reached from "
                     "a single target region to avoid spurious call edges and "
                     "excessive register usage in other target regions. "
                     "(parallel region ID: "
                  << ore::NV("OpenMPParallelRegion", F->getName())
                  << ", kernel ID: "
                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
      };
      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
                           RemarkParallelRegion);
      auto RemarkKernel = [&](OptimizationRemark OR) {
        return OR << "Target region containing the parallel region that is "
                     "specialized. (parallel region ID: "
                  << ore::NV("OpenMPParallelRegion", F->getName())
                  << ", kernel ID: "
                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
      };
      emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
    }

    Module &M = *F->getParent();
    Type *Int8Ty = Type::getInt8Ty(M.getContext());

    auto *ID = new GlobalVariable(
        M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
        UndefValue::get(Int8Ty), F->getName() + ".ID");

    for (Use *U : ToBeReplacedStateMachineUses)
      U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;

    Changed = true;
  }

  return Changed;
}

/// Abstract Attribute for tracking ICV values.
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();
  }

  /// Returns true if value is assumed to be tracked.
  bool isAssumedTracked() const { return getAssumed(); }

  /// Returns true if value is known to be tracked.
  bool isKnownTracked() const { return getAssumed(); }

  /// Create an abstract attribute view for the position \p IRP.
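  /// The concrete tracker variant (function, function returned, call site, or
  /// call site returned) is chosen based on the position kind; the object is
  /// allocated in the Attributor's allocator.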
  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

  /// Return the value with which \p I can be replaced for the specific \p ICV.
  virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                                const Instruction *I,
                                                Attributor &A) const {
    return None;
  }

  /// Return an assumed unique ICV value if a single candidate is found. If
  /// there cannot be one, return nullptr. If it is not clear yet, return
  /// None.
  virtual Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const = 0;

  // Currently only nthreads is being tracked.
  // This array will only grow over time.
  InternalControlVar TrackableICVs[1] = {ICV_nthreads};

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAICVTracker"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAICVTracker.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

struct AAICVTrackerFunction : public AAICVTracker {
  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override { return "ICVTrackerFunction"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICVs to their values at specific program points.
  EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus HasChanged = ChangeStatus::UNCHANGED;

    Function *F = getAnchorScope();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    for (InternalControlVar ICV : TrackableICVs) {
      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      auto &ValuesMap = ICVReplacementValuesMap[ICV];
      auto TrackValues = [&](Use &U, Function &) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
        if (!CI)
          return false;

        // FIXME: Handle setters with more than one argument.
        // Track the new value.
        if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
          HasChanged = ChangeStatus::CHANGED;

        return false;
      };

      auto CallCheck = [&](Instruction &I) {
        Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
        if (ReplVal.hasValue() &&
            ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
          HasChanged = ChangeStatus::CHANGED;

        return true;
      };

      // Track all changes of an ICV.
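      // Direct setter calls are gathered from the setter's RFI use list; all
      // other call sites are inspected through the Attributor (via
      // getValueForCall) so values assumed for callees are picked up as well.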
      SetterRFI.foreachUse(TrackValues, F);

      A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
                                /* CheckBBLivenessOnly */ true);

      // TODO: Figure out a way to avoid adding an entry to
      //       ICVReplacementValuesMap.
      Instruction *Entry = &F->getEntryBlock().front();
      if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
        ValuesMap.insert(std::make_pair(Entry, nullptr));
    }

    return HasChanged;
  }

  /// Helper to check if \p I is a call and get the value for it if it is
  /// unique.
  Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
                                    InternalControlVar &ICV) const {

    const auto *CB = dyn_cast<CallBase>(I);
    if (!CB || CB->hasFnAttr("no_openmp") ||
        CB->hasFnAttr("no_openmp_routines"))
      return None;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    // Indirect call, assume the ICV changes.
    if (CalledFunction == nullptr)
      return nullptr;
    if (CalledFunction == GetterRFI.Declaration)
      return None;
    if (CalledFunction == SetterRFI.Declaration) {
      if (ICVReplacementValuesMap[ICV].count(I))
        return ICVReplacementValuesMap[ICV].lookup(I);

      return nullptr;
    }

    // Since we don't know, assume it changes the ICV.
    if (CalledFunction->isDeclaration())
      return nullptr;

    const auto &ICVTrackingAA =
        A.getAAFor<AAICVTracker>(*this, IRPosition::callsite_returned(*CB));

    if (ICVTrackingAA.isAssumedTracked())
      return ICVTrackingAA.getUniqueReplacementValue(ICV);

    // If we don't know, assume it changes.
    return nullptr;
  }

  // We don't check for a unique value for a function, so return None.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return None;
  }

  /// Return the value with which \p I can be replaced for the specific \p ICV.
  Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                        const Instruction *I,
                                        Attributor &A) const override {
    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    SmallVector<const Instruction *, 16> Worklist;
    SmallPtrSet<const Instruction *, 16> Visited;
    Worklist.push_back(I);

    Optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      const Instruction *CurrInst = Worklist.pop_back_val();
      if (!Visited.insert(CurrInst).second)
        continue;

      const BasicBlock *CurrBB = CurrInst->getParent();

      // Go up and look for all potential setters/calls that might change the
      // ICV.
      while ((CurrInst = CurrInst->getPrevNode())) {
        if (ValuesMap.count(CurrInst)) {
          Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          // Unknown value so far, track the new one.
          if (!ReplVal.hasValue()) {
            ReplVal = NewReplVal;
            break;
          }

          // If we found a new value, we can't know the ICV value anymore.
          if (NewReplVal.hasValue())
            if (ReplVal != NewReplVal)
              return nullptr;

          break;
        }

        Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
        if (!NewReplVal.hasValue())
          continue;

        // Unknown value so far, track the new one.
        if (!ReplVal.hasValue()) {
          ReplVal = NewReplVal;
          break;
        }

        // At this point NewReplVal is known to hold a value. If it differs
        // from the value tracked so far, we can't know the ICV value anymore.
        if (ReplVal != NewReplVal)
          return nullptr;
      }

      // If we are in the same BB and we have a value, we are done.
      if (CurrBB == I->getParent() && ReplVal.hasValue())
        return ReplVal;

      // Go through all predecessors and add terminators for analysis.
      for (const BasicBlock *Pred : predecessors(CurrBB))
        if (const Instruction *Terminator = Pred->getTerminator())
          Worklist.push_back(Terminator);
    }

    return ReplVal;
  }
};

struct AAICVTrackerFunctionReturned : AAICVTracker {
  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerFunctionReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICVs to their values at specific program points.
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the unique replacement value assumed for \p ICV, if any.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()));

    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> UniqueICVValue;

      auto CheckReturnInst = [&](Instruction &I) {
        Optional<Value *> NewReplVal =
            ICVTrackingAA.getReplacementValue(ICV, &I, A);

        // If we found a second ICV value, there is no unique returned value.
        if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
          return false;

        UniqueICVValue = NewReplVal;

        return true;
      };

      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
                                     /* CheckBBLivenessOnly */ true))
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)
        continue;

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;
    }

    return Changed;
  }
};

struct AAICVTrackerCallSite : AAICVTracker {
  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();

    // We only initialize this AA for getters, so we need to know which ICV it
    // gets.
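    // Only the ICVs listed in TrackableICVs (currently just nthreads) are
    // matched against the getter declarations below; any other associated
    // function ends up at a pessimistic fixpoint.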
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    for (InternalControlVar ICV : TrackableICVs) {
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;
        return;
      }
    }

    // Unknown ICV.
    indicatePessimisticFixpoint();
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!ReplVal.hasValue() || !ReplVal.getValue())
      return ChangeStatus::UNCHANGED;

    A.changeValueAfterManifest(*getCtxI(), **ReplVal);
    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;
  }

  // FIXME: come up with better string.
  const std::string getAsStr() const override { return "ICVTrackerCallSite"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  InternalControlVar AssociatedICV;
  Optional<Value *> ReplVal;

  ChangeStatus updateImpl(Attributor &A) override {
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()));

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    Optional<Value *> NewReplVal =
        ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;
  }

  // Return the value with which the associated value can be replaced for the
  // specific \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ReplVal;
  }
};

struct AAICVTrackerCallSiteReturned : AAICVTracker {
  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerCallSiteReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICVs to their values at specific program points.
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the value with which the associated value can be replaced for the
  /// specific \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::returned(*getAssociatedFunction()));

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> NewReplVal =
          ICVTrackingAA.getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)
        continue;

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;
    }
    return Changed;
  }
};
} // namespace

const char AAICVTracker::ID = 0;

AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  AAICVTracker *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    llvm_unreachable("ICVTracker can only be created for function position!");
  case IRPosition::IRP_RETURNED:
    AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
    break;
  case IRPosition::IRP_CALL_SITE_RETURNED:
    AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
    break;
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
    break;
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
    break;
  }

  return *AA;
}

PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
                                     CGSCCAnalysisManager &AM,
                                     LazyCallGraph &CG, CGSCCUpdateResult &UR) {
  if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
    return PreservedAnalyses::all();

  if (DisableOpenMPOptimizations)
    return PreservedAnalyses::all();

  SmallVector<Function *, 16> SCC;
  // If there are kernels in the module, we have to run on all SCCs.
  bool SCCIsInteresting = !OMPInModule.getKernels().empty();
  for (LazyCallGraph::Node &N : C) {
    Function *Fn = &N.getFunction();
    SCC.push_back(Fn);

    // Do we already know that the SCC contains kernels,
    // or that OpenMP functions are called from this SCC?
    if (SCCIsInteresting)
      continue;
    // If not, let's check that.
    SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
  }

  if (!SCCIsInteresting || SCC.empty())
    return PreservedAnalyses::all();

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();

  AnalysisGetter AG(FAM);

  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };

  CallGraphUpdater CGUpdater;
  CGUpdater.initialize(CG, C, AM, UR);

  SetVector<Function *> Functions(SCC.begin(), SCC.end());
  BumpPtrAllocator Allocator;
  OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
                                /*CGSCC*/ Functions, OMPInModule.getKernels());

  Attributor A(Functions, InfoCache, CGUpdater);

  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
  bool Changed = OMPOpt.run();
  if (Changed)
    return PreservedAnalyses::none();

  return PreservedAnalyses::all();
}

namespace {

struct OpenMPOptLegacyPass : public CallGraphSCCPass {
  CallGraphUpdater CGUpdater;
  OpenMPInModule OMPInModule;
  static char ID;

  OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
    initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  bool doInitialization(CallGraph &CG) override {
    // Disable the pass if there is no OpenMP (runtime call) in the module.
    containsOpenMP(CG.getModule(), OMPInModule);
    return false;
  }

  bool runOnSCC(CallGraphSCC &CGSCC) override {
    if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
      return false;
    if (DisableOpenMPOptimizations || skipSCC(CGSCC))
      return false;

    SmallVector<Function *, 16> SCC;
    // If there are kernels in the module, we have to run on all SCCs.
    bool SCCIsInteresting = !OMPInModule.getKernels().empty();
    for (CallGraphNode *CGN : CGSCC) {
      Function *Fn = CGN->getFunction();
      if (!Fn || Fn->isDeclaration())
        continue;
      SCC.push_back(Fn);

      // Do we already know that the SCC contains kernels,
      // or that OpenMP functions are called from this SCC?
      if (SCCIsInteresting)
        continue;
      // If not, let's check that.
      SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
    }

    if (!SCCIsInteresting || SCC.empty())
      return false;

    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
    CGUpdater.initialize(CG, CGSCC);

    // Maintain a map of functions to avoid rebuilding the ORE.
    DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
    auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
      std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
      if (!ORE)
        ORE = std::make_unique<OptimizationRemarkEmitter>(F);
      return *ORE;
    };

    AnalysisGetter AG;
    SetVector<Function *> Functions(SCC.begin(), SCC.end());
    BumpPtrAllocator Allocator;
    OMPInformationCache InfoCache(
        *(Functions.back()->getParent()), AG, Allocator,
        /*CGSCC*/ Functions, OMPInModule.getKernels());

    Attributor A(Functions, InfoCache, CGUpdater);

    OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
    return OMPOpt.run();
  }

  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
};

} // end anonymous namespace

void OpenMPInModule::identifyKernels(Module &M) {

  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
  if (!MD)
    return;

  for (auto *Op : MD->operands()) {
    if (Op->getNumOperands() < 2)
      continue;
    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
    if (!KindID || KindID->getString() != "kernel")
      continue;

    Function *KernelFn =
        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
    if (!KernelFn)
      continue;

    ++NumOpenMPTargetRegionKernels;

    Kernels.insert(KernelFn);
  }
}

bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
  if (OMPInModule.isKnown())
    return OMPInModule;

  auto RecordFunctionsContainingUsesOf = [&](Function *F) {
    for (User *U : F->users())
      if (auto *I = dyn_cast<Instruction>(U))
        OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
  };

  // MSVC doesn't like long if-else chains for some reason and instead just
  // issues an error. Work around it.
  do {
#define OMP_RTL(_Enum, _Name, ...)                                            \
  if (Function *F = M.getFunction(_Name)) {                                   \
    RecordFunctionsContainingUsesOf(F);                                       \
    OMPInModule = true;                                                       \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  } while (false);

  // Identify kernels once. TODO: We should split the OMPInformationCache into
  // a module and an SCC part. The kernel information, among other things,
  // could go into the module part.
  if (OMPInModule.isKnown() && OMPInModule) {
    OMPInModule.identifyKernels(M);
    return true;
  }

  return OMPInModule = false;
}

char OpenMPOptLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
                      "OpenMP specific optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
                    "OpenMP specific optimizations", false, false)

Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }