1 //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // OpenMP specific optimizations: 10 // 11 // - Deduplication of runtime calls, e.g., omp_get_thread_num. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/IPO/OpenMPOpt.h" 16 17 #include "llvm/ADT/EnumeratedArray.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/Analysis/CallGraph.h" 20 #include "llvm/Analysis/CallGraphSCCPass.h" 21 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/Frontend/OpenMP/OMPConstants.h" 24 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" 25 #include "llvm/InitializePasses.h" 26 #include "llvm/Support/CommandLine.h" 27 #include "llvm/Transforms/IPO.h" 28 #include "llvm/Transforms/IPO/Attributor.h" 29 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 30 #include "llvm/Transforms/Utils/CallGraphUpdater.h" 31 #include "llvm/Transforms/Utils/CodeExtractor.h" 32 33 using namespace llvm; 34 using namespace omp; 35 36 #define DEBUG_TYPE "openmp-opt" 37 38 static cl::opt<bool> DisableOpenMPOptimizations( 39 "openmp-opt-disable", cl::ZeroOrMore, 40 cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, 41 cl::init(false)); 42 43 static cl::opt<bool> EnableParallelRegionMerging( 44 "openmp-opt-enable-merging", cl::ZeroOrMore, 45 cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, 46 cl::init(false)); 47 48 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false), 49 cl::Hidden); 50 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels", 51 cl::init(false), cl::Hidden); 52 53 static cl::opt<bool> HideMemoryTransferLatency( 54 "openmp-hide-memory-transfer-latency", 55 cl::desc("[WIP] Tries to hide the latency of host to device memory" 56 " transfers"), 57 cl::Hidden, cl::init(false)); 58 59 STATISTIC(NumOpenMPRuntimeCallsDeduplicated, 60 "Number of OpenMP runtime calls deduplicated"); 61 STATISTIC(NumOpenMPParallelRegionsDeleted, 62 "Number of OpenMP parallel regions deleted"); 63 STATISTIC(NumOpenMPRuntimeFunctionsIdentified, 64 "Number of OpenMP runtime functions identified"); 65 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified, 66 "Number of OpenMP runtime function uses identified"); 67 STATISTIC(NumOpenMPTargetRegionKernels, 68 "Number of OpenMP target region entry points (=kernels) identified"); 69 STATISTIC( 70 NumOpenMPParallelRegionsReplacedInGPUStateMachine, 71 "Number of OpenMP parallel regions replaced with ID in GPU state machines"); 72 STATISTIC(NumOpenMPParallelRegionsMerged, 73 "Number of OpenMP parallel regions merged"); 74 75 #if !defined(NDEBUG) 76 static constexpr auto TAG = "[" DEBUG_TYPE "]"; 77 #endif 78 79 namespace { 80 81 struct AAICVTracker; 82 83 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for 84 /// Attributor runs. 
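/// The cache is filled once per module: initializeRuntimeFunctions() matches
/// the declarations in the module against the signatures listed in
/// OMPKinds.def and records their uses, and initializeInternalControlVars()
/// fills the ICV table (name, environment variable, initial value, and the
/// associated getter/setter runtime functions).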
85 struct OMPInformationCache : public InformationCache { 86 OMPInformationCache(Module &M, AnalysisGetter &AG, 87 BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, 88 SmallPtrSetImpl<Kernel> &Kernels) 89 : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), 90 Kernels(Kernels) { 91 92 OMPBuilder.initialize(); 93 initializeRuntimeFunctions(); 94 initializeInternalControlVars(); 95 } 96 97 /// Generic information that describes an internal control variable. 98 struct InternalControlVarInfo { 99 /// The kind, as described by InternalControlVar enum. 100 InternalControlVar Kind; 101 102 /// The name of the ICV. 103 StringRef Name; 104 105 /// Environment variable associated with this ICV. 106 StringRef EnvVarName; 107 108 /// Initial value kind. 109 ICVInitValue InitKind; 110 111 /// Initial value. 112 ConstantInt *InitValue; 113 114 /// Setter RTL function associated with this ICV. 115 RuntimeFunction Setter; 116 117 /// Getter RTL function associated with this ICV. 118 RuntimeFunction Getter; 119 120 /// RTL Function corresponding to the override clause of this ICV 121 RuntimeFunction Clause; 122 }; 123 124 /// Generic information that describes a runtime function 125 struct RuntimeFunctionInfo { 126 127 /// The kind, as described by the RuntimeFunction enum. 128 RuntimeFunction Kind; 129 130 /// The name of the function. 131 StringRef Name; 132 133 /// Flag to indicate a variadic function. 134 bool IsVarArg; 135 136 /// The return type of the function. 137 Type *ReturnType; 138 139 /// The argument types of the function. 140 SmallVector<Type *, 8> ArgumentTypes; 141 142 /// The declaration if available. 143 Function *Declaration = nullptr; 144 145 /// Uses of this runtime function per function containing the use. 146 using UseVector = SmallVector<Use *, 16>; 147 148 /// Clear UsesMap for runtime function. 149 void clearUsesMap() { UsesMap.clear(); } 150 151 /// Boolean conversion that is true if the runtime function was found. 152 operator bool() const { return Declaration; } 153 154 /// Return the vector of uses in function \p F. 155 UseVector &getOrCreateUseVector(Function *F) { 156 std::shared_ptr<UseVector> &UV = UsesMap[F]; 157 if (!UV) 158 UV = std::make_shared<UseVector>(); 159 return *UV; 160 } 161 162 /// Return the vector of uses in function \p F or `nullptr` if there are 163 /// none. 164 const UseVector *getUseVector(Function &F) const { 165 auto I = UsesMap.find(&F); 166 if (I != UsesMap.end()) 167 return I->second.get(); 168 return nullptr; 169 } 170 171 /// Return how many functions contain uses of this runtime function. 172 size_t getNumFunctionsWithUses() const { return UsesMap.size(); } 173 174 /// Return the number of arguments (or the minimal number for variadic 175 /// functions). 176 size_t getNumArgs() const { return ArgumentTypes.size(); } 177 178 /// Run the callback \p CB on each use and forget the use if the result is 179 /// true. The callback will be fed the function in which the use was 180 /// encountered as second argument. 181 void foreachUse(SmallVectorImpl<Function *> &SCC, 182 function_ref<bool(Use &, Function &)> CB) { 183 for (Function *F : SCC) 184 foreachUse(CB, F); 185 } 186 187 /// Run the callback \p CB on each use within the function \p F and forget 188 /// the use if the result is true. 
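/// An illustrative callback (the shape used throughout this file): return
/// true to forget a use once it has been handled, e.g.,
///   RFI.foreachUse(SCC, [&](Use &U, Function &F) {
///     if (CallInst *CI = getCallIfRegularCall(U, &RFI))
///       /* rewrite or inspect CI */;
///     return /* forget this use? */ false;
///   });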
189 void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) { 190 SmallVector<unsigned, 8> ToBeDeleted; 191 ToBeDeleted.clear(); 192 193 unsigned Idx = 0; 194 UseVector &UV = getOrCreateUseVector(F); 195 196 for (Use *U : UV) { 197 if (CB(*U, *F)) 198 ToBeDeleted.push_back(Idx); 199 ++Idx; 200 } 201 202 // Remove the to-be-deleted indices in reverse order as prior 203 // modifications will not modify the smaller indices. 204 while (!ToBeDeleted.empty()) { 205 unsigned Idx = ToBeDeleted.pop_back_val(); 206 UV[Idx] = UV.back(); 207 UV.pop_back(); 208 } 209 } 210 211 private: 212 /// Map from functions to all uses of this runtime function contained in 213 /// them. 214 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap; 215 }; 216 217 /// An OpenMP-IR-Builder instance 218 OpenMPIRBuilder OMPBuilder; 219 220 /// Map from runtime function kind to the runtime function description. 221 EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction, 222 RuntimeFunction::OMPRTL___last> 223 RFIs; 224 225 /// Map from ICV kind to the ICV description. 226 EnumeratedArray<InternalControlVarInfo, InternalControlVar, 227 InternalControlVar::ICV___last> 228 ICVs; 229 230 /// Helper to initialize all internal control variable information for those 231 /// defined in OMPKinds.def. 232 void initializeInternalControlVars() { 233 #define ICV_RT_SET(_Name, RTL) \ 234 { \ 235 auto &ICV = ICVs[_Name]; \ 236 ICV.Setter = RTL; \ 237 } 238 #define ICV_RT_GET(Name, RTL) \ 239 { \ 240 auto &ICV = ICVs[Name]; \ 241 ICV.Getter = RTL; \ 242 } 243 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \ 244 { \ 245 auto &ICV = ICVs[Enum]; \ 246 ICV.Name = _Name; \ 247 ICV.Kind = Enum; \ 248 ICV.InitKind = Init; \ 249 ICV.EnvVarName = _EnvVarName; \ 250 switch (ICV.InitKind) { \ 251 case ICV_IMPLEMENTATION_DEFINED: \ 252 ICV.InitValue = nullptr; \ 253 break; \ 254 case ICV_ZERO: \ 255 ICV.InitValue = ConstantInt::get( \ 256 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \ 257 break; \ 258 case ICV_FALSE: \ 259 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \ 260 break; \ 261 case ICV_LAST: \ 262 break; \ 263 } \ 264 } 265 #include "llvm/Frontend/OpenMP/OMPKinds.def" 266 } 267 268 /// Returns true if the function declaration \p F matches the runtime 269 /// function types, that is, return type \p RTFRetType, and argument types 270 /// \p RTFArgTypes. 271 static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, 272 SmallVector<Type *, 8> &RTFArgTypes) { 273 // TODO: We should output information to the user (under debug output 274 // and via remarks). 275 276 if (!F) 277 return false; 278 if (F->getReturnType() != RTFRetType) 279 return false; 280 if (F->arg_size() != RTFArgTypes.size()) 281 return false; 282 283 auto RTFTyIt = RTFArgTypes.begin(); 284 for (Argument &Arg : F->args()) { 285 if (Arg.getType() != *RTFTyIt) 286 return false; 287 288 ++RTFTyIt; 289 } 290 291 return true; 292 } 293 294 // Helper to collect all uses of the declaration in the UsesMap. 295 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) { 296 unsigned NumUses = 0; 297 if (!RFI.Declaration) 298 return NumUses; 299 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration); 300 301 if (CollectStats) { 302 NumOpenMPRuntimeFunctionsIdentified += 1; 303 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses(); 304 } 305 306 // TODO: We directly convert uses into proper calls and unknown uses. 
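// Uses whose user is an instruction are keyed by the containing function,
// provided that function is part of the module slice; all other uses (e.g.,
// in global initializers) are collected under the nullptr key.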
307 for (Use &U : RFI.Declaration->uses()) { 308 if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { 309 if (ModuleSlice.count(UserI->getFunction())) { 310 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); 311 ++NumUses; 312 } 313 } else { 314 RFI.getOrCreateUseVector(nullptr).push_back(&U); 315 ++NumUses; 316 } 317 } 318 return NumUses; 319 } 320 321 // Helper function to recollect uses of a runtime function. 322 void recollectUsesForFunction(RuntimeFunction RTF) { 323 auto &RFI = RFIs[RTF]; 324 RFI.clearUsesMap(); 325 collectUses(RFI, /*CollectStats*/ false); 326 } 327 328 // Helper function to recollect uses of all runtime functions. 329 void recollectUses() { 330 for (int Idx = 0; Idx < RFIs.size(); ++Idx) 331 recollectUsesForFunction(static_cast<RuntimeFunction>(Idx)); 332 } 333 334 /// Helper to initialize all runtime function information for those defined 335 /// in OpenMPKinds.def. 336 void initializeRuntimeFunctions() { 337 Module &M = *((*ModuleSlice.begin())->getParent()); 338 339 // Helper macros for handling __VA_ARGS__ in OMP_RTL 340 #define OMP_TYPE(VarName, ...) \ 341 Type *VarName = OMPBuilder.VarName; \ 342 (void)VarName; 343 344 #define OMP_ARRAY_TYPE(VarName, ...) \ 345 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \ 346 (void)VarName##Ty; \ 347 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \ 348 (void)VarName##PtrTy; 349 350 #define OMP_FUNCTION_TYPE(VarName, ...) \ 351 FunctionType *VarName = OMPBuilder.VarName; \ 352 (void)VarName; \ 353 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 354 (void)VarName##Ptr; 355 356 #define OMP_STRUCT_TYPE(VarName, ...) \ 357 StructType *VarName = OMPBuilder.VarName; \ 358 (void)VarName; \ 359 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 360 (void)VarName##Ptr; 361 362 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \ 363 { \ 364 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \ 365 Function *F = M.getFunction(_Name); \ 366 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \ 367 auto &RFI = RFIs[_Enum]; \ 368 RFI.Kind = _Enum; \ 369 RFI.Name = _Name; \ 370 RFI.IsVarArg = _IsVarArg; \ 371 RFI.ReturnType = OMPBuilder._ReturnType; \ 372 RFI.ArgumentTypes = std::move(ArgsTypes); \ 373 RFI.Declaration = F; \ 374 unsigned NumUses = collectUses(RFI); \ 375 (void)NumUses; \ 376 LLVM_DEBUG({ \ 377 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \ 378 << " found\n"; \ 379 if (RFI.Declaration) \ 380 dbgs() << TAG << "-> got " << NumUses << " uses in " \ 381 << RFI.getNumFunctionsWithUses() \ 382 << " different functions.\n"; \ 383 }); \ 384 } \ 385 } 386 #include "llvm/Frontend/OpenMP/OMPKinds.def" 387 388 // TODO: We should attach the attributes defined in OMPKinds.def. 389 } 390 391 /// Collection of known kernels (\see Kernel) in the module. 392 SmallPtrSetImpl<Kernel> &Kernels; 393 }; 394 395 /// Used to map the values physically (in the IR) stored in an offload 396 /// array, to a vector in memory. 397 struct OffloadArray { 398 /// Physical array (in the IR). 399 AllocaInst *Array = nullptr; 400 /// Mapped values. 401 SmallVector<Value *, 8> StoredValues; 402 /// Last stores made in the offload array. 403 SmallVector<StoreInst *, 8> LastAccesses; 404 405 OffloadArray() = default; 406 407 /// Initializes the OffloadArray with the values stored in \p Array before 408 /// instruction \p Before is reached. Returns false if the initialization 409 /// fails. 410 /// This MUST be used immediately after the construction of the object. 
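/// Illustrative use (names assumed), given the alloca behind an offload
/// array argument of a mapper call:
///   OffloadArray OA;
///   if (OA.initialize(*ArrayAlloca, MapperCall))
///     /* OA.StoredValues[I] now mirrors the value stored at index I */;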
411 bool initialize(AllocaInst &Array, Instruction &Before) { 412 if (!Array.getAllocatedType()->isArrayTy()) 413 return false; 414 415 if (!getValues(Array, Before)) 416 return false; 417 418 this->Array = &Array; 419 return true; 420 } 421 422 static const unsigned DeviceIDArgNum = 1; 423 static const unsigned BasePtrsArgNum = 3; 424 static const unsigned PtrsArgNum = 4; 425 static const unsigned SizesArgNum = 5; 426 427 private: 428 /// Traverses the BasicBlock where \p Array is, collecting the stores made to 429 /// \p Array, leaving StoredValues with the values stored before the 430 /// instruction \p Before is reached. 431 bool getValues(AllocaInst &Array, Instruction &Before) { 432 // Initialize container. 433 const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements(); 434 StoredValues.assign(NumValues, nullptr); 435 LastAccesses.assign(NumValues, nullptr); 436 437 // TODO: This assumes the instruction \p Before is in the same 438 // BasicBlock as Array. Make it general, for any control flow graph. 439 BasicBlock *BB = Array.getParent(); 440 if (BB != Before.getParent()) 441 return false; 442 443 const DataLayout &DL = Array.getModule()->getDataLayout(); 444 const unsigned int PointerSize = DL.getPointerSize(); 445 446 for (Instruction &I : *BB) { 447 if (&I == &Before) 448 break; 449 450 if (!isa<StoreInst>(&I)) 451 continue; 452 453 auto *S = cast<StoreInst>(&I); 454 int64_t Offset = -1; 455 auto *Dst = 456 GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL); 457 if (Dst == &Array) { 458 int64_t Idx = Offset / PointerSize; 459 StoredValues[Idx] = getUnderlyingObject(S->getValueOperand()); 460 LastAccesses[Idx] = S; 461 } 462 } 463 464 return isFilled(); 465 } 466 467 /// Returns true if all values in StoredValues and 468 /// LastAccesses are not nullptrs. 469 bool isFilled() { 470 const unsigned NumValues = StoredValues.size(); 471 for (unsigned I = 0; I < NumValues; ++I) { 472 if (!StoredValues[I] || !LastAccesses[I]) 473 return false; 474 } 475 476 return true; 477 } 478 }; 479 480 struct OpenMPOpt { 481 482 using OptimizationRemarkGetter = 483 function_ref<OptimizationRemarkEmitter &(Function *)>; 484 485 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater, 486 OptimizationRemarkGetter OREGetter, 487 OMPInformationCache &OMPInfoCache, Attributor &A) 488 : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater), 489 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {} 490 491 /// Check if any remarks are enabled for openmp-opt 492 bool remarksEnabled() { 493 auto &Ctx = M.getContext(); 494 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE); 495 } 496 497 /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. 498 bool run() { 499 if (SCC.empty()) 500 return false; 501 502 bool Changed = false; 503 504 LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() 505 << " functions in a slice with " 506 << OMPInfoCache.ModuleSlice.size() << " functions\n"); 507 508 if (PrintICVValues) 509 printICVs(); 510 if (PrintOpenMPKernels) 511 printKernels(); 512 513 Changed |= rewriteDeviceCodeStateMachine(); 514 515 Changed |= runAttributor(); 516 517 // Recollect uses, in case Attributor deleted any. 
518 OMPInfoCache.recollectUses(); 519 520 Changed |= deleteParallelRegions(); 521 if (HideMemoryTransferLatency) 522 Changed |= hideMemTransfersLatency(); 523 if (remarksEnabled()) 524 analysisGlobalization(); 525 Changed |= deduplicateRuntimeCalls(); 526 if (EnableParallelRegionMerging) { 527 if (mergeParallelRegions()) { 528 deduplicateRuntimeCalls(); 529 Changed = true; 530 } 531 } 532 533 return Changed; 534 } 535 536 /// Print initial ICV values for testing. 537 /// FIXME: This should be done from the Attributor once it is added. 538 void printICVs() const { 539 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel, 540 ICV_proc_bind}; 541 542 for (Function *F : OMPInfoCache.ModuleSlice) { 543 for (auto ICV : ICVs) { 544 auto ICVInfo = OMPInfoCache.ICVs[ICV]; 545 auto Remark = [&](OptimizationRemark OR) { 546 return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name) 547 << " Value: " 548 << (ICVInfo.InitValue 549 ? ICVInfo.InitValue->getValue().toString(10, true) 550 : "IMPLEMENTATION_DEFINED"); 551 }; 552 553 emitRemarkOnFunction(F, "OpenMPICVTracker", Remark); 554 } 555 } 556 } 557 558 /// Print OpenMP GPU kernels for testing. 559 void printKernels() const { 560 for (Function *F : SCC) { 561 if (!OMPInfoCache.Kernels.count(F)) 562 continue; 563 564 auto Remark = [&](OptimizationRemark OR) { 565 return OR << "OpenMP GPU kernel " 566 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n"; 567 }; 568 569 emitRemarkOnFunction(F, "OpenMPGPU", Remark); 570 } 571 } 572 573 /// Return the call if \p U is a callee use in a regular call. If \p RFI is 574 /// given it has to be the callee or a nullptr is returned. 575 static CallInst *getCallIfRegularCall( 576 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 577 CallInst *CI = dyn_cast<CallInst>(U.getUser()); 578 if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && 579 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 580 return CI; 581 return nullptr; 582 } 583 584 /// Return the call if \p V is a regular call. If \p RFI is given it has to be 585 /// the callee or a nullptr is returned. 586 static CallInst *getCallIfRegularCall( 587 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 588 CallInst *CI = dyn_cast<CallInst>(&V); 589 if (CI && !CI->hasOperandBundles() && 590 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 591 return CI; 592 return nullptr; 593 } 594 595 private: 596 /// Merge parallel regions when it is safe. 597 bool mergeParallelRegions() { 598 const unsigned CallbackCalleeOperand = 2; 599 const unsigned CallbackFirstArgOperand = 3; 600 using InsertPointTy = OpenMPIRBuilder::InsertPointTy; 601 602 // Check if there are any __kmpc_fork_call calls to merge. 603 OMPInformationCache::RuntimeFunctionInfo &RFI = 604 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 605 606 if (!RFI.Declaration) 607 return false; 608 609 // Unmergable calls that prevent merging a parallel region. 
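// These runtime calls set execution parameters (proc_bind, num_threads) for
// the next parallel region; the frontend typically emits them right before
// the corresponding __kmpc_fork_call, so encountering one makes the following
// region unsafe to merge without ICV tracking.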
610 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = { 611 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind], 612 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads], 613 }; 614 615 bool Changed = false; 616 LoopInfo *LI = nullptr; 617 DominatorTree *DT = nullptr; 618 619 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap; 620 621 BasicBlock *StartBB = nullptr, *EndBB = nullptr; 622 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, 623 BasicBlock &ContinuationIP) { 624 BasicBlock *CGStartBB = CodeGenIP.getBlock(); 625 BasicBlock *CGEndBB = 626 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); 627 assert(StartBB != nullptr && "StartBB should not be null"); 628 CGStartBB->getTerminator()->setSuccessor(0, StartBB); 629 assert(EndBB != nullptr && "EndBB should not be null"); 630 EndBB->getTerminator()->setSuccessor(0, CGEndBB); 631 }; 632 633 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &, 634 Value &Inner, Value *&ReplacementValue) -> InsertPointTy { 635 ReplacementValue = &Inner; 636 return CodeGenIP; 637 }; 638 639 auto FiniCB = [&](InsertPointTy CodeGenIP) {}; 640 641 /// Create a sequential execution region within a merged parallel region, 642 /// encapsulated in a master construct with a barrier for synchronization. 643 auto CreateSequentialRegion = [&](Function *OuterFn, 644 BasicBlock *OuterPredBB, 645 Instruction *SeqStartI, 646 Instruction *SeqEndI) { 647 // Isolate the instructions of the sequential region to a separate 648 // block. 649 BasicBlock *ParentBB = SeqStartI->getParent(); 650 BasicBlock *SeqEndBB = 651 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI); 652 BasicBlock *SeqAfterBB = 653 SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI); 654 BasicBlock *SeqStartBB = 655 SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged"); 656 657 assert(ParentBB->getUniqueSuccessor() == SeqStartBB && 658 "Expected a different CFG"); 659 const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); 660 ParentBB->getTerminator()->eraseFromParent(); 661 662 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, 663 BasicBlock &ContinuationIP) { 664 BasicBlock *CGStartBB = CodeGenIP.getBlock(); 665 BasicBlock *CGEndBB = 666 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); 667 assert(SeqStartBB != nullptr && "SeqStartBB should not be null"); 668 CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB); 669 assert(SeqEndBB != nullptr && "SeqEndBB should not be null"); 670 SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB); 671 }; 672 auto FiniCB = [&](InsertPointTy CodeGenIP) {}; 673 674 // Find outputs from the sequential region to outside users and 675 // broadcast their values to them. 676 for (Instruction &I : *SeqStartBB) { 677 SmallPtrSet<Instruction *, 4> OutsideUsers; 678 for (User *Usr : I.users()) { 679 Instruction &UsrI = *cast<Instruction>(Usr); 680 // Ignore outputs to LT intrinsics, code extraction for the merged 681 // parallel region will fix them. 682 if (UsrI.isLifetimeStartOrEnd()) 683 continue; 684 685 if (UsrI.getParent() != SeqStartBB) 686 OutsideUsers.insert(&UsrI); 687 } 688 689 if (OutsideUsers.empty()) 690 continue; 691 692 // Emit an alloca in the outer region to store the broadcasted 693 // value. 
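// Roughly (illustrative IR, using the suffixes chosen below), a value %v
// defined in the sequential block and used outside of it becomes:
//   entry: %v.seq.output.alloc = alloca <ty>
//   seq:   store <ty> %v, ... %v.seq.output.alloc
//   user:  %v.seq.output.load = load <ty>, ... %v.seq.output.alloc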
694 const DataLayout &DL = M.getDataLayout(); 695 AllocaInst *AllocaI = new AllocaInst( 696 I.getType(), DL.getAllocaAddrSpace(), nullptr, 697 I.getName() + ".seq.output.alloc", &OuterFn->front().front()); 698 699 // Emit a store instruction in the sequential BB to update the 700 // value. 701 new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()); 702 703 // Emit a load instruction and replace the use of the output value 704 // with it. 705 for (Instruction *UsrI : OutsideUsers) { 706 LoadInst *LoadI = new LoadInst( 707 I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI); 708 UsrI->replaceUsesOfWith(&I, LoadI); 709 } 710 } 711 712 OpenMPIRBuilder::LocationDescription Loc( 713 InsertPointTy(ParentBB, ParentBB->end()), DL); 714 InsertPointTy SeqAfterIP = 715 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB); 716 717 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel); 718 719 BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock()); 720 721 LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn 722 << "\n"); 723 }; 724 725 // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all 726 // contained in BB and only separated by instructions that can be 727 // redundantly executed in parallel. The block BB is split before the first 728 // call (in MergableCIs) and after the last so the entire region we merge 729 // into a single parallel region is contained in a single basic block 730 // without any other instructions. We use the OpenMPIRBuilder to outline 731 // that block and call the resulting function via __kmpc_fork_call. 732 auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) { 733 // TODO: Change the interface to allow single CIs expanded, e.g, to 734 // include an outer loop. 735 assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); 736 737 auto Remark = [&](OptimizationRemark OR) { 738 OR << "Parallel region at " 739 << ore::NV("OpenMPParallelMergeFront", 740 MergableCIs.front()->getDebugLoc()) 741 << " merged with parallel regions at "; 742 for (auto *CI : llvm::drop_begin(MergableCIs)) { 743 OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()); 744 if (CI != MergableCIs.back()) 745 OR << ", "; 746 } 747 return OR; 748 }; 749 750 emitRemark<OptimizationRemark>(MergableCIs.front(), 751 "OpenMPParallelRegionMerging", Remark); 752 753 Function *OriginalFn = BB->getParent(); 754 LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size() 755 << " parallel regions in " << OriginalFn->getName() 756 << "\n"); 757 758 // Isolate the calls to merge in a separate block. 759 EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI); 760 BasicBlock *AfterBB = 761 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI); 762 StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, 763 "omp.par.merged"); 764 765 assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG"); 766 const DebugLoc DL = BB->getTerminator()->getDebugLoc(); 767 BB->getTerminator()->eraseFromParent(); 768 769 // Create sequential regions for sequential instructions that are 770 // in-between mergable parallel regions. 771 for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1; 772 It != End; ++It) { 773 Instruction *ForkCI = *It; 774 Instruction *NextForkCI = *(It + 1); 775 776 // Continue if there are not in-between instructions. 
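// Otherwise, wrap the in-between instructions in a sequential region
// (a master construct followed by a barrier, see CreateSequentialRegion) so
// they still execute exactly once inside the merged parallel region.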
777 if (ForkCI->getNextNode() == NextForkCI) 778 continue; 779 780 CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(), 781 NextForkCI->getPrevNode()); 782 } 783 784 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()), 785 DL); 786 IRBuilder<>::InsertPoint AllocaIP( 787 &OriginalFn->getEntryBlock(), 788 OriginalFn->getEntryBlock().getFirstInsertionPt()); 789 // Create the merged parallel region with default proc binding, to 790 // avoid overriding binding settings, and without explicit cancellation. 791 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel( 792 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, 793 OMP_PROC_BIND_default, /* IsCancellable */ false); 794 BranchInst::Create(AfterBB, AfterIP.getBlock()); 795 796 // Perform the actual outlining. 797 OMPInfoCache.OMPBuilder.finalize(OriginalFn, 798 /* AllowExtractorSinking */ true); 799 800 Function *OutlinedFn = MergableCIs.front()->getCaller(); 801 802 // Replace the __kmpc_fork_call calls with direct calls to the outlined 803 // callbacks. 804 SmallVector<Value *, 8> Args; 805 for (auto *CI : MergableCIs) { 806 Value *Callee = 807 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); 808 FunctionType *FT = 809 cast<FunctionType>(Callee->getType()->getPointerElementType()); 810 Args.clear(); 811 Args.push_back(OutlinedFn->getArg(0)); 812 Args.push_back(OutlinedFn->getArg(1)); 813 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 814 U < E; ++U) 815 Args.push_back(CI->getArgOperand(U)); 816 817 CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); 818 if (CI->getDebugLoc()) 819 NewCI->setDebugLoc(CI->getDebugLoc()); 820 821 // Forward parameter attributes from the callback to the callee. 822 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 823 U < E; ++U) 824 for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) 825 NewCI->addParamAttr( 826 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); 827 828 // Emit an explicit barrier to replace the implicit fork-join barrier. 829 if (CI != MergableCIs.back()) { 830 // TODO: Remove barrier if the merged parallel region includes the 831 // 'nowait' clause. 832 OMPInfoCache.OMPBuilder.createBarrier( 833 InsertPointTy(NewCI->getParent(), 834 NewCI->getNextNode()->getIterator()), 835 OMPD_parallel); 836 } 837 838 auto Remark = [&](OptimizationRemark OR) { 839 return OR << "Parallel region at " 840 << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()) 841 << " merged with " 842 << ore::NV("OpenMPParallelMergeFront", 843 MergableCIs.front()->getDebugLoc()); 844 }; 845 if (CI != MergableCIs.front()) 846 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging", 847 Remark); 848 849 CI->eraseFromParent(); 850 } 851 852 assert(OutlinedFn != OriginalFn && "Outlining failed"); 853 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn); 854 CGUpdater.reanalyzeFunction(*OriginalFn); 855 856 NumOpenMPParallelRegionsMerged += MergableCIs.size(); 857 858 return true; 859 }; 860 861 // Helper function that identifes sequences of 862 // __kmpc_fork_call uses in a basic block. 
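// Each detected __kmpc_fork_call is grouped by its parent basic block in
// BB2PRMap; the per-block sets are scanned below for runs of calls that are
// safe to merge.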
863 auto DetectPRsCB = [&](Use &U, Function &F) { 864 CallInst *CI = getCallIfRegularCall(U, &RFI); 865 BB2PRMap[CI->getParent()].insert(CI); 866 867 return false; 868 }; 869 870 BB2PRMap.clear(); 871 RFI.foreachUse(SCC, DetectPRsCB); 872 SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector; 873 // Find mergable parallel regions within a basic block that are 874 // safe to merge, that is any in-between instructions can safely 875 // execute in parallel after merging. 876 // TODO: support merging across basic-blocks. 877 for (auto &It : BB2PRMap) { 878 auto &CIs = It.getSecond(); 879 if (CIs.size() < 2) 880 continue; 881 882 BasicBlock *BB = It.getFirst(); 883 SmallVector<CallInst *, 4> MergableCIs; 884 885 /// Returns true if the instruction is mergable, false otherwise. 886 /// A terminator instruction is unmergable by definition since merging 887 /// works within a BB. Instructions before the mergable region are 888 /// mergable if they are not calls to OpenMP runtime functions that may 889 /// set different execution parameters for subsequent parallel regions. 890 /// Instructions in-between parallel regions are mergable if they are not 891 /// calls to any non-intrinsic function since that may call a non-mergable 892 /// OpenMP runtime function. 893 auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) { 894 // We do not merge across BBs, hence return false (unmergable) if the 895 // instruction is a terminator. 896 if (I.isTerminator()) 897 return false; 898 899 if (!isa<CallInst>(&I)) 900 return true; 901 902 CallInst *CI = cast<CallInst>(&I); 903 if (IsBeforeMergableRegion) { 904 Function *CalledFunction = CI->getCalledFunction(); 905 if (!CalledFunction) 906 return false; 907 // Return false (unmergable) if the call before the parallel 908 // region calls an explicit affinity (proc_bind) or number of 909 // threads (num_threads) compiler-generated function. Those settings 910 // may be incompatible with following parallel regions. 911 // TODO: ICV tracking to detect compatibility. 912 for (const auto &RFI : UnmergableCallsInfo) { 913 if (CalledFunction == RFI.Declaration) 914 return false; 915 } 916 } else { 917 // Return false (unmergable) if there is a call instruction 918 // in-between parallel regions when it is not an intrinsic. It 919 // may call an unmergable OpenMP runtime function in its callpath. 920 // TODO: Keep track of possible OpenMP calls in the callpath. 921 if (!isa<IntrinsicInst>(CI)) 922 return false; 923 } 924 925 return true; 926 }; 927 // Find maximal number of parallel region CIs that are safe to merge. 928 for (auto It = BB->begin(), End = BB->end(); It != End;) { 929 Instruction &I = *It; 930 ++It; 931 932 if (CIs.count(&I)) { 933 MergableCIs.push_back(cast<CallInst>(&I)); 934 continue; 935 } 936 937 // Continue expanding if the instruction is mergable. 938 if (IsMergable(I, MergableCIs.empty())) 939 continue; 940 941 // Forward the instruction iterator to skip the next parallel region 942 // since there is an unmergable instruction which can affect it. 943 for (; It != End; ++It) { 944 Instruction &SkipI = *It; 945 if (CIs.count(&SkipI)) { 946 LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI 947 << " due to " << I << "\n"); 948 ++It; 949 break; 950 } 951 } 952 953 // Store mergable regions found. 
954 if (MergableCIs.size() > 1) { 955 MergableCIsVector.push_back(MergableCIs); 956 LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size() 957 << " parallel regions in block " << BB->getName() 958 << " of function " << BB->getParent()->getName() 959 << "\n";); 960 } 961 962 MergableCIs.clear(); 963 } 964 965 if (!MergableCIsVector.empty()) { 966 Changed = true; 967 968 for (auto &MergableCIs : MergableCIsVector) 969 Merge(MergableCIs, BB); 970 } 971 } 972 973 if (Changed) { 974 /// Re-collect use for fork calls, emitted barrier calls, and 975 /// any emitted master/end_master calls. 976 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call); 977 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier); 978 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master); 979 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master); 980 } 981 982 return Changed; 983 } 984 985 /// Try to delete parallel regions if possible. 986 bool deleteParallelRegions() { 987 const unsigned CallbackCalleeOperand = 2; 988 989 OMPInformationCache::RuntimeFunctionInfo &RFI = 990 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 991 992 if (!RFI.Declaration) 993 return false; 994 995 bool Changed = false; 996 auto DeleteCallCB = [&](Use &U, Function &) { 997 CallInst *CI = getCallIfRegularCall(U); 998 if (!CI) 999 return false; 1000 auto *Fn = dyn_cast<Function>( 1001 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts()); 1002 if (!Fn) 1003 return false; 1004 if (!Fn->onlyReadsMemory()) 1005 return false; 1006 if (!Fn->hasFnAttribute(Attribute::WillReturn)) 1007 return false; 1008 1009 LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in " 1010 << CI->getCaller()->getName() << "\n"); 1011 1012 auto Remark = [&](OptimizationRemark OR) { 1013 return OR << "Parallel region in " 1014 << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName()) 1015 << " deleted"; 1016 }; 1017 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion", 1018 Remark); 1019 1020 CGUpdater.removeCallSite(*CI); 1021 CI->eraseFromParent(); 1022 Changed = true; 1023 ++NumOpenMPParallelRegionsDeleted; 1024 return true; 1025 }; 1026 1027 RFI.foreachUse(SCC, DeleteCallCB); 1028 1029 return Changed; 1030 } 1031 1032 /// Try to eliminate runtime calls by reusing existing ones. 1033 bool deduplicateRuntimeCalls() { 1034 bool Changed = false; 1035 1036 RuntimeFunction DeduplicableRuntimeCallIDs[] = { 1037 OMPRTL_omp_get_num_threads, 1038 OMPRTL_omp_in_parallel, 1039 OMPRTL_omp_get_cancellation, 1040 OMPRTL_omp_get_thread_limit, 1041 OMPRTL_omp_get_supported_active_levels, 1042 OMPRTL_omp_get_level, 1043 OMPRTL_omp_get_ancestor_thread_num, 1044 OMPRTL_omp_get_team_size, 1045 OMPRTL_omp_get_active_level, 1046 OMPRTL_omp_in_final, 1047 OMPRTL_omp_get_proc_bind, 1048 OMPRTL_omp_get_num_places, 1049 OMPRTL_omp_get_num_procs, 1050 OMPRTL_omp_get_place_num, 1051 OMPRTL_omp_get_partition_num_places, 1052 OMPRTL_omp_get_partition_place_nums}; 1053 1054 // Global-tid is handled separately. 1055 SmallSetVector<Value *, 16> GTIdArgs; 1056 collectGlobalThreadIdArguments(GTIdArgs); 1057 LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size() 1058 << " global thread ID arguments\n"); 1059 1060 for (Function *F : SCC) { 1061 for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) 1062 Changed |= deduplicateRuntimeCalls( 1063 *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); 1064 1065 // __kmpc_global_thread_num is special as we can replace it with an 1066 // argument in enough cases to make it worth trying. 
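// Illustrative example: if argument %gtid of F is always passed the result
// of __kmpc_global_thread_num at every call site (see
// collectGlobalThreadIdArguments), then __kmpc_global_thread_num calls inside
// F can be replaced by %gtid.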
1067 Value *GTIdArg = nullptr; 1068 for (Argument &Arg : F->args()) 1069 if (GTIdArgs.count(&Arg)) { 1070 GTIdArg = &Arg; 1071 break; 1072 } 1073 Changed |= deduplicateRuntimeCalls( 1074 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg); 1075 } 1076 1077 return Changed; 1078 } 1079 1080 /// Tries to hide the latency of runtime calls that involve host to 1081 /// device memory transfers by splitting them into their "issue" and "wait" 1082 /// versions. The "issue" is moved upwards as much as possible. The "wait" is 1083 /// moved downards as much as possible. The "issue" issues the memory transfer 1084 /// asynchronously, returning a handle. The "wait" waits in the returned 1085 /// handle for the memory transfer to finish. 1086 bool hideMemTransfersLatency() { 1087 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper]; 1088 bool Changed = false; 1089 auto SplitMemTransfers = [&](Use &U, Function &Decl) { 1090 auto *RTCall = getCallIfRegularCall(U, &RFI); 1091 if (!RTCall) 1092 return false; 1093 1094 OffloadArray OffloadArrays[3]; 1095 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays)) 1096 return false; 1097 1098 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays)); 1099 1100 // TODO: Check if can be moved upwards. 1101 bool WasSplit = false; 1102 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall); 1103 if (WaitMovementPoint) 1104 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint); 1105 1106 Changed |= WasSplit; 1107 return WasSplit; 1108 }; 1109 RFI.foreachUse(SCC, SplitMemTransfers); 1110 1111 return Changed; 1112 } 1113 1114 void analysisGlobalization() { 1115 RuntimeFunction GlobalizationRuntimeIDs[] = { 1116 OMPRTL___kmpc_data_sharing_coalesced_push_stack, 1117 OMPRTL___kmpc_data_sharing_push_stack}; 1118 1119 for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { 1120 auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; 1121 1122 auto CheckGlobalization = [&](Use &U, Function &Decl) { 1123 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) { 1124 auto Remark = [&](OptimizationRemarkAnalysis ORA) { 1125 return ORA 1126 << "Found thread data sharing on the GPU. " 1127 << "Expect degraded performance due to data globalization."; 1128 }; 1129 emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization", 1130 Remark); 1131 } 1132 1133 return false; 1134 }; 1135 1136 RFI.foreachUse(SCC, CheckGlobalization); 1137 } 1138 } 1139 1140 /// Maps the values stored in the offload arrays passed as arguments to 1141 /// \p RuntimeCall into the offload arrays in \p OAs. 1142 bool getValuesInOffloadArrays(CallInst &RuntimeCall, 1143 MutableArrayRef<OffloadArray> OAs) { 1144 assert(OAs.size() == 3 && "Need space for three offload arrays!"); 1145 1146 // A runtime call that involves memory offloading looks something like: 1147 // call void @__tgt_target_data_begin_mapper(arg0, arg1, 1148 // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes, 1149 // ...) 1150 // So, the idea is to access the allocas that allocate space for these 1151 // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes. 1152 // Therefore: 1153 // i8** %offload_baseptrs. 1154 Value *BasePtrsArg = 1155 RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum); 1156 // i8** %offload_ptrs. 1157 Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum); 1158 // i8** %offload_sizes. 1159 Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum); 1160 1161 // Get values stored in **offload_baseptrs. 
1162 auto *V = getUnderlyingObject(BasePtrsArg); 1163 if (!isa<AllocaInst>(V)) 1164 return false; 1165 auto *BasePtrsArray = cast<AllocaInst>(V); 1166 if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall)) 1167 return false; 1168 1169 // Get values stored in **offload_baseptrs. 1170 V = getUnderlyingObject(PtrsArg); 1171 if (!isa<AllocaInst>(V)) 1172 return false; 1173 auto *PtrsArray = cast<AllocaInst>(V); 1174 if (!OAs[1].initialize(*PtrsArray, RuntimeCall)) 1175 return false; 1176 1177 // Get values stored in **offload_sizes. 1178 V = getUnderlyingObject(SizesArg); 1179 // If it's a [constant] global array don't analyze it. 1180 if (isa<GlobalValue>(V)) 1181 return isa<Constant>(V); 1182 if (!isa<AllocaInst>(V)) 1183 return false; 1184 1185 auto *SizesArray = cast<AllocaInst>(V); 1186 if (!OAs[2].initialize(*SizesArray, RuntimeCall)) 1187 return false; 1188 1189 return true; 1190 } 1191 1192 /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG. 1193 /// For now this is a way to test that the function getValuesInOffloadArrays 1194 /// is working properly. 1195 /// TODO: Move this to a unittest when unittests are available for OpenMPOpt. 1196 void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) { 1197 assert(OAs.size() == 3 && "There are three offload arrays to debug!"); 1198 1199 LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n"); 1200 std::string ValuesStr; 1201 raw_string_ostream Printer(ValuesStr); 1202 std::string Separator = " --- "; 1203 1204 for (auto *BP : OAs[0].StoredValues) { 1205 BP->print(Printer); 1206 Printer << Separator; 1207 } 1208 LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n"); 1209 ValuesStr.clear(); 1210 1211 for (auto *P : OAs[1].StoredValues) { 1212 P->print(Printer); 1213 Printer << Separator; 1214 } 1215 LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n"); 1216 ValuesStr.clear(); 1217 1218 for (auto *S : OAs[2].StoredValues) { 1219 S->print(Printer); 1220 Printer << Separator; 1221 } 1222 LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n"); 1223 } 1224 1225 /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be 1226 /// moved. Returns nullptr if the movement is not possible, or not worth it. 1227 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) { 1228 // FIXME: This traverses only the BasicBlock where RuntimeCall is. 1229 // Make it traverse the CFG. 1230 1231 Instruction *CurrentI = &RuntimeCall; 1232 bool IsWorthIt = false; 1233 while ((CurrentI = CurrentI->getNextNode())) { 1234 1235 // TODO: Once we detect the regions to be offloaded we should use the 1236 // alias analysis manager to check if CurrentI may modify one of 1237 // the offloaded regions. 1238 if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) { 1239 if (IsWorthIt) 1240 return CurrentI; 1241 1242 return nullptr; 1243 } 1244 1245 // FIXME: For now if we move it over anything without side effect 1246 // is worth it. 1247 IsWorthIt = true; 1248 } 1249 1250 // Return end of BasicBlock. 1251 return RuntimeCall.getParent()->getTerminator(); 1252 } 1253 1254 /// Splits \p RuntimeCall into its "issue" and "wait" counterparts. 1255 bool splitTargetDataBeginRTC(CallInst &RuntimeCall, 1256 Instruction &WaitMovementPoint) { 1257 // Create stack allocated handle (__tgt_async_info) at the beginning of the 1258 // function. Used for storing information of the async transfer, allowing to 1259 // wait on it later. 
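// Roughly (illustrative IR), the split turns
//   call void @__tgt_target_data_begin_mapper(...)
// into
//   %handle = alloca %struct.__tgt_async_info         ; in the entry block
//   call void @__tgt_target_data_begin_issue(<original args>, %handle)
//   ; ... code without conflicting side effects ...
//   call void @__tgt_target_data_begin_wait(i64 %device_id, %handle)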
1260 auto &IRBuilder = OMPInfoCache.OMPBuilder; 1261 auto *F = RuntimeCall.getCaller(); 1262 Instruction *FirstInst = &(F->getEntryBlock().front()); 1263 AllocaInst *Handle = new AllocaInst( 1264 IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); 1265 1266 // Add "issue" runtime call declaration: 1267 // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, 1268 // i8**, i8**, i64*, i64*) 1269 FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction( 1270 M, OMPRTL___tgt_target_data_begin_mapper_issue); 1271 1272 // Change RuntimeCall call site for its asynchronous version. 1273 SmallVector<Value *, 16> Args; 1274 for (auto &Arg : RuntimeCall.args()) 1275 Args.push_back(Arg.get()); 1276 Args.push_back(Handle); 1277 1278 CallInst *IssueCallsite = 1279 CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); 1280 RuntimeCall.eraseFromParent(); 1281 1282 // Add "wait" runtime call declaration: 1283 // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info) 1284 FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction( 1285 M, OMPRTL___tgt_target_data_begin_mapper_wait); 1286 1287 Value *WaitParams[2] = { 1288 IssueCallsite->getArgOperand( 1289 OffloadArray::DeviceIDArgNum), // device_id. 1290 Handle // handle to wait on. 1291 }; 1292 CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); 1293 1294 return true; 1295 } 1296 1297 static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, 1298 bool GlobalOnly, bool &SingleChoice) { 1299 if (CurrentIdent == NextIdent) 1300 return CurrentIdent; 1301 1302 // TODO: Figure out how to actually combine multiple debug locations. For 1303 // now we just keep an existing one if there is a single choice. 1304 if (!GlobalOnly || isa<GlobalValue>(NextIdent)) { 1305 SingleChoice = !CurrentIdent; 1306 return NextIdent; 1307 } 1308 return nullptr; 1309 } 1310 1311 /// Return an `struct ident_t*` value that represents the ones used in the 1312 /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not 1313 /// return a local `struct ident_t*`. For now, if we cannot find a suitable 1314 /// return value we create one from scratch. We also do not yet combine 1315 /// information, e.g., the source locations, see combinedIdentStruct. 1316 Value * 1317 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI, 1318 Function &F, bool GlobalOnly) { 1319 bool SingleChoice = true; 1320 Value *Ident = nullptr; 1321 auto CombineIdentStruct = [&](Use &U, Function &Caller) { 1322 CallInst *CI = getCallIfRegularCall(U, &RFI); 1323 if (!CI || &F != &Caller) 1324 return false; 1325 Ident = combinedIdentStruct(Ident, CI->getArgOperand(0), 1326 /* GlobalOnly */ true, SingleChoice); 1327 return false; 1328 }; 1329 RFI.foreachUse(SCC, CombineIdentStruct); 1330 1331 if (!Ident || !SingleChoice) { 1332 // The IRBuilder uses the insertion block to get to the module, this is 1333 // unfortunate but we work around it for now. 1334 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock()) 1335 OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy( 1336 &F.getEntryBlock(), F.getEntryBlock().begin())); 1337 // Create a fallback location if non was found. 1338 // TODO: Use the debug locations of the calls instead. 
1339 Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
1340 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
1341 }
1342 return Ident;
1343 }
1344
1345 /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
1346 /// \p ReplVal if given.
1347 bool deduplicateRuntimeCalls(Function &F,
1348 OMPInformationCache::RuntimeFunctionInfo &RFI,
1349 Value *ReplVal = nullptr) {
1350 auto *UV = RFI.getUseVector(F);
1351 if (!UV || UV->size() + (ReplVal != nullptr) < 2)
1352 return false;
1353
1354 LLVM_DEBUG(
1355 dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
1356 << (ReplVal ? " with an existing value\n" : "\n") << "\n");
1357
1358 assert((!ReplVal || (isa<Argument>(ReplVal) &&
1359 cast<Argument>(ReplVal)->getParent() == &F)) &&
1360 "Unexpected replacement value!");
1361
1362 // TODO: Use dominance to find a good position instead.
1363 auto CanBeMoved = [this](CallBase &CB) {
1364 unsigned NumArgs = CB.getNumArgOperands();
1365 if (NumArgs == 0)
1366 return true;
1367 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1368 return false;
1369 for (unsigned u = 1; u < NumArgs; ++u)
1370 if (isa<Instruction>(CB.getArgOperand(u)))
1371 return false;
1372 return true;
1373 };
1374
1375 if (!ReplVal) {
1376 for (Use *U : *UV)
1377 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1378 if (!CanBeMoved(*CI))
1379 continue;
1380
1381 auto Remark = [&](OptimizationRemark OR) {
1382 auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
1383 return OR << "OpenMP runtime call "
1384 << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
1385 << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
1386 };
1387 emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
1388
1389 CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
1390 ReplVal = CI;
1391 break;
1392 }
1393 if (!ReplVal)
1394 return false;
1395 }
1396
1397 // If we use a call as a replacement value we need to make sure the ident is
1398 // valid at the new location. For now we just pick a global one, either
1399 // existing and used by one of the calls, or created from scratch.
1400 if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1401 if (CI->getNumArgOperands() > 0 &&
1402 CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
1403 Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1404 /* GlobalOnly */ true);
1405 CI->setArgOperand(0, Ident);
1406 }
1407 }
1408
1409 bool Changed = false;
1410 auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
1411 CallInst *CI = getCallIfRegularCall(U, &RFI);
1412 if (!CI || CI == ReplVal || &F != &Caller)
1413 return false;
1414 assert(CI->getCaller() == &F && "Unexpected call!");
1415
1416 auto Remark = [&](OptimizationRemark OR) {
1417 return OR << "OpenMP runtime call "
1418 << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
1419 };
1420 emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
1421
1422 CGUpdater.removeCallSite(*CI);
1423 CI->replaceAllUsesWith(ReplVal);
1424 CI->eraseFromParent();
1425 ++NumOpenMPRuntimeCallsDeduplicated;
1426 Changed = true;
1427 return true;
1428 };
1429 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1430
1431 return Changed;
1432 }
1433
1434 /// Collect arguments that represent the global thread id in \p GTIdArgs.
1435 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
1436 // TODO: Below we basically perform a fixpoint iteration with a pessimistic
1437 // initialization. We could define an AbstractAttribute instead and
1438 // run the Attributor here once it can be run as an SCC pass.
1439
1440 // Helper to check the argument \p ArgNo at all call sites of \p F for
1441 // a GTId.
1442 auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1443 if (!F.hasLocalLinkage())
1444 return false;
1445 for (Use &U : F.uses()) {
1446 if (CallInst *CI = getCallIfRegularCall(U)) {
1447 Value *ArgOp = CI->getArgOperand(ArgNo);
1448 if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
1449 getCallIfRegularCall(
1450 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1451 continue;
1452 }
1453 return false;
1454 }
1455 return true;
1456 };
1457
1458 // Helper to identify uses of a GTId as GTId arguments.
1459 auto AddUserArgs = [&](Value &GTId) {
1460 for (Use &U : GTId.uses())
1461 if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
1462 if (CI->isArgOperand(&U))
1463 if (Function *Callee = CI->getCalledFunction())
1464 if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
1465 GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
1466 };
1467
1468 // The argument users of __kmpc_global_thread_num calls are GTIds.
1469 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1470 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1471
1472 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
1473 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1474 AddUserArgs(*CI);
1475 return false;
1476 });
1477
1478 // Transitively search for more arguments by looking at the users of the
1479 // ones we know already. During the search the GTIdArgs vector is extended
1480 // so we cannot cache the size nor can we use a range based for.
1481 for (unsigned u = 0; u < GTIdArgs.size(); ++u)
1482 AddUserArgs(*GTIdArgs[u]);
1483 }
1484
1485 /// Kernel (=GPU) optimizations and utility functions
1486 ///
1487 ///{{
1488
1489 /// Check if \p F is a kernel, hence entry point for target offloading.
1490 bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
1491
1492 /// Cache to remember the unique kernel for a function.
1493 DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
1494
1495 /// Find the unique kernel that will execute \p F, if any.
1496 Kernel getUniqueKernelFor(Function &F);
1497
1498 /// Find the unique kernel that will execute \p I, if any.
1499 Kernel getUniqueKernelFor(Instruction &I) {
1500 return getUniqueKernelFor(*I.getFunction());
1501 }
1502
1503 /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
1504 /// the cases where we can avoid taking the address of a function.
1505 bool rewriteDeviceCodeStateMachine();
1506
1507 ///
1508 ///}}
1509
1510 /// Emit a remark generically
1511 ///
1512 /// This template function can be used to generically emit a remark. The
1513 /// RemarkKind should be one of the following:
1514 /// - OptimizationRemark to indicate a successful optimization attempt
1515 /// - OptimizationRemarkMissed to report a failed optimization attempt
1516 /// - OptimizationRemarkAnalysis to provide additional information about an
1517 /// optimization attempt
1518 ///
1519 /// The remark is built using a callback function provided by the caller that
1520 /// takes a RemarkKind as input and returns a RemarkKind.
1521 template <typename RemarkKind, 1522 typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>> 1523 void emitRemark(Instruction *Inst, StringRef RemarkName, 1524 RemarkCallBack &&RemarkCB) const { 1525 Function *F = Inst->getParent()->getParent(); 1526 auto &ORE = OREGetter(F); 1527 1528 ORE.emit( 1529 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); }); 1530 } 1531 1532 /// Emit a remark on a function. Since only OptimizationRemark is supporting 1533 /// this, it can't be made generic. 1534 void 1535 emitRemarkOnFunction(Function *F, StringRef RemarkName, 1536 function_ref<OptimizationRemark(OptimizationRemark &&)> 1537 &&RemarkCB) const { 1538 auto &ORE = OREGetter(F); 1539 1540 ORE.emit([&]() { 1541 return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F)); 1542 }); 1543 } 1544 1545 /// The underlying module. 1546 Module &M; 1547 1548 /// The SCC we are operating on. 1549 SmallVectorImpl<Function *> &SCC; 1550 1551 /// Callback to update the call graph, the first argument is a removed call, 1552 /// the second an optional replacement call. 1553 CallGraphUpdater &CGUpdater; 1554 1555 /// Callback to get an OptimizationRemarkEmitter from a Function * 1556 OptimizationRemarkGetter OREGetter; 1557 1558 /// OpenMP-specific information cache. Also Used for Attributor runs. 1559 OMPInformationCache &OMPInfoCache; 1560 1561 /// Attributor instance. 1562 Attributor &A; 1563 1564 /// Helper function to run Attributor on SCC. 1565 bool runAttributor() { 1566 if (SCC.empty()) 1567 return false; 1568 1569 registerAAs(); 1570 1571 ChangeStatus Changed = A.run(); 1572 1573 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size() 1574 << " functions, result: " << Changed << ".\n"); 1575 1576 return Changed == ChangeStatus::CHANGED; 1577 } 1578 1579 /// Populate the Attributor with abstract attribute opportunities in the 1580 /// function. 1581 void registerAAs() { 1582 if (SCC.empty()) 1583 return; 1584 1585 // Create CallSite AA for all Getters. 1586 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) { 1587 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)]; 1588 1589 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter]; 1590 1591 auto CreateAA = [&](Use &U, Function &Caller) { 1592 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI); 1593 if (!CI) 1594 return false; 1595 1596 auto &CB = cast<CallBase>(*CI); 1597 1598 IRPosition CBPos = IRPosition::callsite_function(CB); 1599 A.getOrCreateAAFor<AAICVTracker>(CBPos); 1600 return false; 1601 }; 1602 1603 GetterRFI.foreachUse(SCC, CreateAA); 1604 } 1605 } 1606 }; 1607 1608 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { 1609 if (!OMPInfoCache.ModuleSlice.count(&F)) 1610 return nullptr; 1611 1612 // Use a scope to keep the lifetime of the CachedKernel short. 1613 { 1614 Optional<Kernel> &CachedKernel = UniqueKernelMap[&F]; 1615 if (CachedKernel) 1616 return *CachedKernel; 1617 1618 // TODO: We should use an AA to create an (optimistic and callback 1619 // call-aware) call graph. For now we stick to simple patterns that 1620 // are less powerful, basically the worst fixpoint. 
1621 if (isKernel(F)) { 1622 CachedKernel = Kernel(&F); 1623 return *CachedKernel; 1624 } 1625 1626 CachedKernel = nullptr; 1627 if (!F.hasLocalLinkage()) { 1628 1629 // See https://openmp.llvm.org/remarks/OptimizationRemarks.html 1630 auto Remark = [&](OptimizationRemark OR) { 1631 return OR << "[OMP100] Potentially unknown OpenMP target region caller"; 1632 }; 1633 emitRemarkOnFunction(&F, "OMP100", Remark); 1634 1635 return nullptr; 1636 } 1637 } 1638 1639 auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel { 1640 if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) { 1641 // Allow use in equality comparisons. 1642 if (Cmp->isEquality()) 1643 return getUniqueKernelFor(*Cmp); 1644 return nullptr; 1645 } 1646 if (auto *CB = dyn_cast<CallBase>(U.getUser())) { 1647 // Allow direct calls. 1648 if (CB->isCallee(&U)) 1649 return getUniqueKernelFor(*CB); 1650 // Allow the use in __kmpc_kernel_prepare_parallel calls. 1651 if (Function *Callee = CB->getCalledFunction()) 1652 if (Callee->getName() == "__kmpc_kernel_prepare_parallel") 1653 return getUniqueKernelFor(*CB); 1654 return nullptr; 1655 } 1656 // Disallow every other use. 1657 return nullptr; 1658 }; 1659 1660 // TODO: In the future we want to track more than just a unique kernel. 1661 SmallPtrSet<Kernel, 2> PotentialKernels; 1662 OMPInformationCache::foreachUse(F, [&](const Use &U) { 1663 PotentialKernels.insert(GetUniqueKernelForUse(U)); 1664 }); 1665 1666 Kernel K = nullptr; 1667 if (PotentialKernels.size() == 1) 1668 K = *PotentialKernels.begin(); 1669 1670 // Cache the result. 1671 UniqueKernelMap[&F] = K; 1672 1673 return K; 1674 } 1675 1676 bool OpenMPOpt::rewriteDeviceCodeStateMachine() { 1677 OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI = 1678 OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel]; 1679 1680 bool Changed = false; 1681 if (!KernelPrepareParallelRFI) 1682 return Changed; 1683 1684 for (Function *F : SCC) { 1685 1686 // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at 1687 // all. 1688 bool UnknownUse = false; 1689 bool KernelPrepareUse = false; 1690 unsigned NumDirectCalls = 0; 1691 1692 SmallVector<Use *, 2> ToBeReplacedStateMachineUses; 1693 OMPInformationCache::foreachUse(*F, [&](Use &U) { 1694 if (auto *CB = dyn_cast<CallBase>(U.getUser())) 1695 if (CB->isCallee(&U)) { 1696 ++NumDirectCalls; 1697 return; 1698 } 1699 1700 if (isa<ICmpInst>(U.getUser())) { 1701 ToBeReplacedStateMachineUses.push_back(&U); 1702 return; 1703 } 1704 if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall( 1705 *U.getUser(), &KernelPrepareParallelRFI)) { 1706 KernelPrepareUse = true; 1707 ToBeReplacedStateMachineUses.push_back(&U); 1708 return; 1709 } 1710 UnknownUse = true; 1711 }); 1712 1713 // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel 1714 // use. 1715 if (!KernelPrepareUse) 1716 continue; 1717 1718 { 1719 auto Remark = [&](OptimizationRemark OR) { 1720 return OR << "Found a parallel region that is called in a target " 1721 "region but not part of a combined target construct nor " 1722 "nesed inside a target construct without intermediate " 1723 "code. This can lead to excessive register usage for " 1724 "unrelated target regions in the same translation unit " 1725 "due to spurious call edges assumed by ptxas."; 1726 }; 1727 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark); 1728 } 1729 1730 // If this ever hits, we should investigate. 1731 // TODO: Checking the number of uses is not a necessary restriction and 1732 // should be lifted. 
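// The pattern expected here is a single direct call (the invocation inside
// the kernel state machine) plus exactly two state machine uses: the argument
// passed to __kmpc_kernel_prepare_parallel and the pointer equality compare
// that selects this parallel region.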
1733 if (UnknownUse || NumDirectCalls != 1 ||
1734 ToBeReplacedStateMachineUses.size() != 2) {
1735 {
1736 auto Remark = [&](OptimizationRemark OR) {
1737 return OR << "Parallel region is used in "
1738 << (UnknownUse ? "unknown" : "unexpected")
1739 << " ways; will not attempt to rewrite the state machine.";
1740 };
1741 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
1742 }
1743 continue;
1744 }
1745
1746 // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
1747 // up if the function is not called from a unique kernel.
1748 Kernel K = getUniqueKernelFor(*F);
1749 if (!K) {
1750 {
1751 auto Remark = [&](OptimizationRemark OR) {
1752 return OR << "Parallel region is not known to be called from a "
1753 "unique single target region, maybe the surrounding "
1754 "function has external linkage?; will not attempt to "
1755 "rewrite the state machine use.";
1756 };
1757 emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
1758 Remark);
1759 }
1760 continue;
1761 }
1762
1763 // We now know F is a parallel body function called only from the kernel K.
1764 // We also identified the state machine uses in which we replace the
1765 // function pointer by a new global symbol for identification purposes. This
1766 // ensures only direct calls to the function are left.
1767
1768 {
1769 auto RemarkParallelRegion = [&](OptimizationRemark OR) {
1770 return OR << "Specialize parallel region that is only reached from a "
1771 "single target region to avoid spurious call edges and "
1772 "excessive register usage in other target regions. "
1773 "(parallel region ID: "
1774 << ore::NV("OpenMPParallelRegion", F->getName())
1775 << ", kernel ID: "
1776 << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1777 };
1778 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
1779 RemarkParallelRegion);
1780 auto RemarkKernel = [&](OptimizationRemark OR) {
1781 return OR << "Target region containing the parallel region that is "
1782 "specialized. (parallel region ID: "
1783 << ore::NV("OpenMPParallelRegion", F->getName())
1784 << ", kernel ID: "
1785 << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1786 };
1787 emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
1788 }
1789
1790 Module &M = *F->getParent();
1791 Type *Int8Ty = Type::getInt8Ty(M.getContext());
1792
1793 auto *ID = new GlobalVariable(
1794 M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
1795 UndefValue::get(Int8Ty), F->getName() + ".ID");
1796
1797 for (Use *U : ToBeReplacedStateMachineUses)
1798 U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
1799
1800 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
1801
1802 Changed = true;
1803 }
1804
1805 return Changed;
1806 }
1807
1808 /// Abstract Attribute for tracking ICV values.
1809 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
1810 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1811 AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1812
1813 void initialize(Attributor &A) override {
1814 Function *F = getAnchorScope();
1815 if (!F || !A.isFunctionIPOAmendable(*F))
1816 indicatePessimisticFixpoint();
1817 }
1818
1819 /// Returns true if value is assumed to be tracked.
1820 bool isAssumedTracked() const { return getAssumed(); }
1821
1822 /// Returns true if value is known to be tracked.
1823 bool isKnownTracked() const { return getAssumed(); }
1824
1825 /// Create an abstract attribute view for the position \p IRP.
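/// A minimal usage sketch (illustrative only; the position to seed is chosen
/// by the caller, e.g., the pass's AA registration):
/// \code
///   const auto &ICVTracker =
///       A.getOrCreateAAFor<AAICVTracker>(IRPosition::function(F));
///   Optional<Value *> NThreads =
///       ICVTracker.getUniqueReplacementValue(ICV_nthreads);
/// \endcode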
1826 static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
1827
1828 /// Return the value with which \p I can be replaced for specific \p ICV.
1829 virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
1830 const Instruction *I,
1831 Attributor &A) const {
1832 return None;
1833 }
1834
1835 /// Return an assumed unique ICV value if a single candidate is found. If
1836 /// there cannot be one, return nullptr. If it is not clear yet, return the
1837 /// Optional::NoneType.
1838 virtual Optional<Value *>
1839 getUniqueReplacementValue(InternalControlVar ICV) const = 0;
1840
1841 // Currently only nthreads is being tracked.
1842 // This array will only grow with time.
1843 InternalControlVar TrackableICVs[1] = {ICV_nthreads};
1844
1845 /// See AbstractAttribute::getName()
1846 const std::string getName() const override { return "AAICVTracker"; }
1847
1848 /// See AbstractAttribute::getIdAddr()
1849 const char *getIdAddr() const override { return &ID; }
1850
1851 /// This function should return true if the type of the \p AA is AAICVTracker.
1852 static bool classof(const AbstractAttribute *AA) {
1853 return (AA->getIdAddr() == &ID);
1854 }
1855
1856 static const char ID;
1857 };
1858
1859 struct AAICVTrackerFunction : public AAICVTracker {
1860 AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
1861 : AAICVTracker(IRP, A) {}
1862
1863 // FIXME: come up with better string.
1864 const std::string getAsStr() const override { return "ICVTrackerFunction"; }
1865
1866 // FIXME: come up with some stats.
1867 void trackStatistics() const override {}
1868
1869 /// We don't manifest anything for this AA.
1870 ChangeStatus manifest(Attributor &A) override {
1871 return ChangeStatus::UNCHANGED;
1872 }
1873
1874 // Map of ICV to their values at specific program point.
1875 EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
1876 InternalControlVar::ICV___last>
1877 ICVReplacementValuesMap;
1878
1879 ChangeStatus updateImpl(Attributor &A) override {
1880 ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
1881
1882 Function *F = getAnchorScope();
1883
1884 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1885
1886 for (InternalControlVar ICV : TrackableICVs) {
1887 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1888
1889 auto &ValuesMap = ICVReplacementValuesMap[ICV];
1890 auto TrackValues = [&](Use &U, Function &) {
1891 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
1892 if (!CI)
1893 return false;
1894
1895 // FIXME: handle setters with more than one argument.
1896 /// Track new value.
1897 if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
1898 HasChanged = ChangeStatus::CHANGED;
1899
1900 return false;
1901 };
1902
1903 auto CallCheck = [&](Instruction &I) {
1904 Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
1905 if (ReplVal.hasValue() &&
1906 ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
1907 HasChanged = ChangeStatus::CHANGED;
1908
1909 return true;
1910 };
1911
1912 // Track all changes of an ICV.
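// Illustrative example (source-level, assuming the usual libomp entry
// points): for the nthreads ICV the setter RFI is omp_set_num_threads, so a
// call like
//   omp_set_num_threads(4);
// is recorded in ValuesMap with the constant 4 (argument operand 0) as the
// value the ICV holds after this program point.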
1913 SetterRFI.foreachUse(TrackValues, F);
1914
1915 A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
1916 /* CheckBBLivenessOnly */ true);
1917
1918 /// TODO: Figure out a way to avoid adding entry in
1919 /// ICVReplacementValuesMap
1920 Instruction *Entry = &F->getEntryBlock().front();
1921 if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
1922 ValuesMap.insert(std::make_pair(Entry, nullptr));
1923 }
1924
1925 return HasChanged;
1926 }
1927
1928 /// Helper to check if \p I is a call and get the value for it if it is
1929 /// unique.
1930 Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
1931 InternalControlVar &ICV) const {
1932
1933 const auto *CB = dyn_cast<CallBase>(I);
1934 if (!CB || CB->hasFnAttr("no_openmp") ||
1935 CB->hasFnAttr("no_openmp_routines"))
1936 return None;
1937
1938 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1939 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
1940 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1941 Function *CalledFunction = CB->getCalledFunction();
1942
1943 // Indirect call, assume ICV changes.
1944 if (CalledFunction == nullptr)
1945 return nullptr;
1946 if (CalledFunction == GetterRFI.Declaration)
1947 return None;
1948 if (CalledFunction == SetterRFI.Declaration) {
1949 if (ICVReplacementValuesMap[ICV].count(I))
1950 return ICVReplacementValuesMap[ICV].lookup(I);
1951
1952 return nullptr;
1953 }
1954
1955 // Since we don't know, assume it changes the ICV.
1956 if (CalledFunction->isDeclaration())
1957 return nullptr;
1958
1959 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
1960 *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
1961
1962 if (ICVTrackingAA.isAssumedTracked())
1963 return ICVTrackingAA.getUniqueReplacementValue(ICV);
1964
1965 // If we don't know, assume it changes.
1966 return nullptr;
1967 }
1968
1969 // We don't check for a unique value for a function, so return None.
1970 Optional<Value *>
1971 getUniqueReplacementValue(InternalControlVar ICV) const override {
1972 return None;
1973 }
1974
1975 /// Return the value with which \p I can be replaced for specific \p ICV.
1976 Optional<Value *> getReplacementValue(InternalControlVar ICV,
1977 const Instruction *I,
1978 Attributor &A) const override {
1979 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
1980 if (ValuesMap.count(I))
1981 return ValuesMap.lookup(I);
1982
1983 SmallVector<const Instruction *, 16> Worklist;
1984 SmallPtrSet<const Instruction *, 16> Visited;
1985 Worklist.push_back(I);
1986
1987 Optional<Value *> ReplVal;
1988
1989 while (!Worklist.empty()) {
1990 const Instruction *CurrInst = Worklist.pop_back_val();
1991 if (!Visited.insert(CurrInst).second)
1992 continue;
1993
1994 const BasicBlock *CurrBB = CurrInst->getParent();
1995
1996 // Go up and look for all potential setters/calls that might change the
1997 // ICV.
1998 while ((CurrInst = CurrInst->getPrevNode())) {
1999 if (ValuesMap.count(CurrInst)) {
2000 Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2001 // Unknown value, track new.
2002 if (!ReplVal.hasValue()) {
2003 ReplVal = NewReplVal;
2004 break;
2005 }
2006
2007 // If we found a new value, we can't know the ICV value anymore.
2008 if (NewReplVal.hasValue())
2009 if (ReplVal != NewReplVal)
2010 return nullptr;
2011
2012 break;
2013 }
2014
2015 Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
2016 if (!NewReplVal.hasValue())
2017 continue;
2018
2019 // Unknown value, track new.
2020 if (!ReplVal.hasValue()) {
2021 ReplVal = NewReplVal;
2022 break;
2023 }
2024
2025
2026 // We found a new value, we can't know the ICV value anymore.
2027 if (ReplVal != NewReplVal)
2028 return nullptr;
2029 }
2030
2031 // If we are in the same BB and we have a value, we are done.
2032 if (CurrBB == I->getParent() && ReplVal.hasValue())
2033 return ReplVal;
2034
2035 // Go through all predecessors and add terminators for analysis.
2036 for (const BasicBlock *Pred : predecessors(CurrBB))
2037 if (const Instruction *Terminator = Pred->getTerminator())
2038 Worklist.push_back(Terminator);
2039 }
2040
2041 return ReplVal;
2042 }
2043 };
2044
2045 struct AAICVTrackerFunctionReturned : AAICVTracker {
2046 AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2047 : AAICVTracker(IRP, A) {}
2048
2049 // FIXME: come up with better string.
2050 const std::string getAsStr() const override {
2051 return "ICVTrackerFunctionReturned";
2052 }
2053
2054 // FIXME: come up with some stats.
2055 void trackStatistics() const override {}
2056
2057 /// We don't manifest anything for this AA.
2058 ChangeStatus manifest(Attributor &A) override {
2059 return ChangeStatus::UNCHANGED;
2060 }
2061
2062 // Map of ICV to their values at specific program point.
2063 EnumeratedArray<Optional<Value *>, InternalControlVar,
2064 InternalControlVar::ICV___last>
2065 ICVReplacementValuesMap;
2066
2067 /// Return the unique returned ICV value for \p ICV, if known.
2068 Optional<Value *>
2069 getUniqueReplacementValue(InternalControlVar ICV) const override {
2070 return ICVReplacementValuesMap[ICV];
2071 }
2072
2073 ChangeStatus updateImpl(Attributor &A) override {
2074 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2075 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2076 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2077
2078 if (!ICVTrackingAA.isAssumedTracked())
2079 return indicatePessimisticFixpoint();
2080
2081 for (InternalControlVar ICV : TrackableICVs) {
2082 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2083 Optional<Value *> UniqueICVValue;
2084
2085 auto CheckReturnInst = [&](Instruction &I) {
2086 Optional<Value *> NewReplVal =
2087 ICVTrackingAA.getReplacementValue(ICV, &I, A);
2088
2089 // If we found a second ICV value, there is no unique returned value.
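// Illustrative example (hypothetical user code): in
//   int foo() { omp_set_num_threads(4); return bar(); }
// every return is reached with nthreads set to 4, so 4 becomes the unique
// returned ICV value; if different returns see different values, the check
// below fails and nullptr (unknown) is recorded instead.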
2090 if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal) 2091 return false; 2092 2093 UniqueICVValue = NewReplVal; 2094 2095 return true; 2096 }; 2097 2098 if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}, 2099 /* CheckBBLivenessOnly */ true)) 2100 UniqueICVValue = nullptr; 2101 2102 if (UniqueICVValue == ReplVal) 2103 continue; 2104 2105 ReplVal = UniqueICVValue; 2106 Changed = ChangeStatus::CHANGED; 2107 } 2108 2109 return Changed; 2110 } 2111 }; 2112 2113 struct AAICVTrackerCallSite : AAICVTracker { 2114 AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A) 2115 : AAICVTracker(IRP, A) {} 2116 2117 void initialize(Attributor &A) override { 2118 Function *F = getAnchorScope(); 2119 if (!F || !A.isFunctionIPOAmendable(*F)) 2120 indicatePessimisticFixpoint(); 2121 2122 // We only initialize this AA for getters, so we need to know which ICV it 2123 // gets. 2124 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); 2125 for (InternalControlVar ICV : TrackableICVs) { 2126 auto ICVInfo = OMPInfoCache.ICVs[ICV]; 2127 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter]; 2128 if (Getter.Declaration == getAssociatedFunction()) { 2129 AssociatedICV = ICVInfo.Kind; 2130 return; 2131 } 2132 } 2133 2134 /// Unknown ICV. 2135 indicatePessimisticFixpoint(); 2136 } 2137 2138 ChangeStatus manifest(Attributor &A) override { 2139 if (!ReplVal.hasValue() || !ReplVal.getValue()) 2140 return ChangeStatus::UNCHANGED; 2141 2142 A.changeValueAfterManifest(*getCtxI(), **ReplVal); 2143 A.deleteAfterManifest(*getCtxI()); 2144 2145 return ChangeStatus::CHANGED; 2146 } 2147 2148 // FIXME: come up with better string. 2149 const std::string getAsStr() const override { return "ICVTrackerCallSite"; } 2150 2151 // FIXME: come up with some stats. 2152 void trackStatistics() const override {} 2153 2154 InternalControlVar AssociatedICV; 2155 Optional<Value *> ReplVal; 2156 2157 ChangeStatus updateImpl(Attributor &A) override { 2158 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( 2159 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); 2160 2161 // We don't have any information, so we assume it changes the ICV. 2162 if (!ICVTrackingAA.isAssumedTracked()) 2163 return indicatePessimisticFixpoint(); 2164 2165 Optional<Value *> NewReplVal = 2166 ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A); 2167 2168 if (ReplVal == NewReplVal) 2169 return ChangeStatus::UNCHANGED; 2170 2171 ReplVal = NewReplVal; 2172 return ChangeStatus::CHANGED; 2173 } 2174 2175 // Return the value with which associated value can be replaced for specific 2176 // \p ICV. 2177 Optional<Value *> 2178 getUniqueReplacementValue(InternalControlVar ICV) const override { 2179 return ReplVal; 2180 } 2181 }; 2182 2183 struct AAICVTrackerCallSiteReturned : AAICVTracker { 2184 AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A) 2185 : AAICVTracker(IRP, A) {} 2186 2187 // FIXME: come up with better string. 2188 const std::string getAsStr() const override { 2189 return "ICVTrackerCallSiteReturned"; 2190 } 2191 2192 // FIXME: come up with some stats. 2193 void trackStatistics() const override {} 2194 2195 /// We don't manifest anything for this AA. 2196 ChangeStatus manifest(Attributor &A) override { 2197 return ChangeStatus::UNCHANGED; 2198 } 2199 2200 // Map of ICV to their values at specific program point. 
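// Note on the encoding used here and in the sibling trackers: an unset
// Optional means the value is not known yet, an explicit nullptr means the
// ICV may change (no unique value), and any other Value* is the assumed
// unique replacement.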
2201 EnumeratedArray<Optional<Value *>, InternalControlVar, 2202 InternalControlVar::ICV___last> 2203 ICVReplacementValuesMap; 2204 2205 /// Return the value with which associated value can be replaced for specific 2206 /// \p ICV. 2207 Optional<Value *> 2208 getUniqueReplacementValue(InternalControlVar ICV) const override { 2209 return ICVReplacementValuesMap[ICV]; 2210 } 2211 2212 ChangeStatus updateImpl(Attributor &A) override { 2213 ChangeStatus Changed = ChangeStatus::UNCHANGED; 2214 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( 2215 *this, IRPosition::returned(*getAssociatedFunction()), 2216 DepClassTy::REQUIRED); 2217 2218 // We don't have any information, so we assume it changes the ICV. 2219 if (!ICVTrackingAA.isAssumedTracked()) 2220 return indicatePessimisticFixpoint(); 2221 2222 for (InternalControlVar ICV : TrackableICVs) { 2223 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; 2224 Optional<Value *> NewReplVal = 2225 ICVTrackingAA.getUniqueReplacementValue(ICV); 2226 2227 if (ReplVal == NewReplVal) 2228 continue; 2229 2230 ReplVal = NewReplVal; 2231 Changed = ChangeStatus::CHANGED; 2232 } 2233 return Changed; 2234 } 2235 }; 2236 } // namespace 2237 2238 const char AAICVTracker::ID = 0; 2239 2240 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP, 2241 Attributor &A) { 2242 AAICVTracker *AA = nullptr; 2243 switch (IRP.getPositionKind()) { 2244 case IRPosition::IRP_INVALID: 2245 case IRPosition::IRP_FLOAT: 2246 case IRPosition::IRP_ARGUMENT: 2247 case IRPosition::IRP_CALL_SITE_ARGUMENT: 2248 llvm_unreachable("ICVTracker can only be created for function position!"); 2249 case IRPosition::IRP_RETURNED: 2250 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A); 2251 break; 2252 case IRPosition::IRP_CALL_SITE_RETURNED: 2253 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A); 2254 break; 2255 case IRPosition::IRP_CALL_SITE: 2256 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A); 2257 break; 2258 case IRPosition::IRP_FUNCTION: 2259 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A); 2260 break; 2261 } 2262 2263 return *AA; 2264 } 2265 2266 PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C, 2267 CGSCCAnalysisManager &AM, 2268 LazyCallGraph &CG, CGSCCUpdateResult &UR) { 2269 if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule)) 2270 return PreservedAnalyses::all(); 2271 2272 if (DisableOpenMPOptimizations) 2273 return PreservedAnalyses::all(); 2274 2275 SmallVector<Function *, 16> SCC; 2276 // If there are kernels in the module, we have to run on all SCC's. 2277 bool SCCIsInteresting = !OMPInModule.getKernels().empty(); 2278 for (LazyCallGraph::Node &N : C) { 2279 Function *Fn = &N.getFunction(); 2280 SCC.push_back(Fn); 2281 2282 // Do we already know that the SCC contains kernels, 2283 // or that OpenMP functions are called from this SCC? 2284 if (SCCIsInteresting) 2285 continue; 2286 // If not, let's check that. 
2287 SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn); 2288 } 2289 2290 if (!SCCIsInteresting || SCC.empty()) 2291 return PreservedAnalyses::all(); 2292 2293 FunctionAnalysisManager &FAM = 2294 AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); 2295 2296 AnalysisGetter AG(FAM); 2297 2298 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & { 2299 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 2300 }; 2301 2302 CallGraphUpdater CGUpdater; 2303 CGUpdater.initialize(CG, C, AM, UR); 2304 2305 SetVector<Function *> Functions(SCC.begin(), SCC.end()); 2306 BumpPtrAllocator Allocator; 2307 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, 2308 /*CGSCC*/ Functions, OMPInModule.getKernels()); 2309 2310 Attributor A(Functions, InfoCache, CGUpdater); 2311 2312 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); 2313 bool Changed = OMPOpt.run(); 2314 if (Changed) 2315 return PreservedAnalyses::none(); 2316 2317 return PreservedAnalyses::all(); 2318 } 2319 2320 namespace { 2321 2322 struct OpenMPOptLegacyPass : public CallGraphSCCPass { 2323 CallGraphUpdater CGUpdater; 2324 OpenMPInModule OMPInModule; 2325 static char ID; 2326 2327 OpenMPOptLegacyPass() : CallGraphSCCPass(ID) { 2328 initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry()); 2329 } 2330 2331 void getAnalysisUsage(AnalysisUsage &AU) const override { 2332 CallGraphSCCPass::getAnalysisUsage(AU); 2333 } 2334 2335 bool doInitialization(CallGraph &CG) override { 2336 // Disable the pass if there is no OpenMP (runtime call) in the module. 2337 containsOpenMP(CG.getModule(), OMPInModule); 2338 return false; 2339 } 2340 2341 bool runOnSCC(CallGraphSCC &CGSCC) override { 2342 if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule)) 2343 return false; 2344 if (DisableOpenMPOptimizations || skipSCC(CGSCC)) 2345 return false; 2346 2347 SmallVector<Function *, 16> SCC; 2348 // If there are kernels in the module, we have to run on all SCC's. 2349 bool SCCIsInteresting = !OMPInModule.getKernels().empty(); 2350 for (CallGraphNode *CGN : CGSCC) { 2351 Function *Fn = CGN->getFunction(); 2352 if (!Fn || Fn->isDeclaration()) 2353 continue; 2354 SCC.push_back(Fn); 2355 2356 // Do we already know that the SCC contains kernels, 2357 // or that OpenMP functions are called from this SCC? 2358 if (SCCIsInteresting) 2359 continue; 2360 // If not, let's check that. 
2361 SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn); 2362 } 2363 2364 if (!SCCIsInteresting || SCC.empty()) 2365 return false; 2366 2367 CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); 2368 CGUpdater.initialize(CG, CGSCC); 2369 2370 // Maintain a map of functions to avoid rebuilding the ORE 2371 DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap; 2372 auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & { 2373 std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F]; 2374 if (!ORE) 2375 ORE = std::make_unique<OptimizationRemarkEmitter>(F); 2376 return *ORE; 2377 }; 2378 2379 AnalysisGetter AG; 2380 SetVector<Function *> Functions(SCC.begin(), SCC.end()); 2381 BumpPtrAllocator Allocator; 2382 OMPInformationCache InfoCache( 2383 *(Functions.back()->getParent()), AG, Allocator, 2384 /*CGSCC*/ Functions, OMPInModule.getKernels()); 2385 2386 Attributor A(Functions, InfoCache, CGUpdater); 2387 2388 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); 2389 return OMPOpt.run(); 2390 } 2391 2392 bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); } 2393 }; 2394 2395 } // end anonymous namespace 2396 2397 void OpenMPInModule::identifyKernels(Module &M) { 2398 2399 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); 2400 if (!MD) 2401 return; 2402 2403 for (auto *Op : MD->operands()) { 2404 if (Op->getNumOperands() < 2) 2405 continue; 2406 MDString *KindID = dyn_cast<MDString>(Op->getOperand(1)); 2407 if (!KindID || KindID->getString() != "kernel") 2408 continue; 2409 2410 Function *KernelFn = 2411 mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)); 2412 if (!KernelFn) 2413 continue; 2414 2415 ++NumOpenMPTargetRegionKernels; 2416 2417 Kernels.insert(KernelFn); 2418 } 2419 } 2420 2421 bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) { 2422 if (OMPInModule.isKnown()) 2423 return OMPInModule; 2424 2425 auto RecordFunctionsContainingUsesOf = [&](Function *F) { 2426 for (User *U : F->users()) 2427 if (auto *I = dyn_cast<Instruction>(U)) 2428 OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction()); 2429 }; 2430 2431 // MSVC doesn't like long if-else chains for some reason and instead just 2432 // issues an error. Work around it.. 2433 do { 2434 #define OMP_RTL(_Enum, _Name, ...) \ 2435 if (Function *F = M.getFunction(_Name)) { \ 2436 RecordFunctionsContainingUsesOf(F); \ 2437 OMPInModule = true; \ 2438 } 2439 #include "llvm/Frontend/OpenMP/OMPKinds.def" 2440 } while (false); 2441 2442 // Identify kernels once. TODO: We should split the OMPInformationCache into a 2443 // module and an SCC part. The kernel information, among other things, could 2444 // go into the module part. 2445 if (OMPInModule.isKnown() && OMPInModule) { 2446 OMPInModule.identifyKernels(M); 2447 return true; 2448 } 2449 2450 return OMPInModule = false; 2451 } 2452 2453 char OpenMPOptLegacyPass::ID = 0; 2454 2455 INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt", 2456 "OpenMP specific optimizations", false, false) 2457 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) 2458 INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt", 2459 "OpenMP specific optimizations", false, false) 2460 2461 Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); } 2462
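// Usage sketch (illustrative): with the registration above, the legacy pass
// is available under -openmpopt, e.g.
//   opt -openmpopt in.ll -S -o out.ll
// The new-PM equivalent is the CGSCC pass whose registered name is assumed
// here to match DEBUG_TYPE; see PassRegistry.def for the authoritative
// spelling:
//   opt -passes='openmp-opt' in.ll -S -o out.ll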