1 //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // OpenMP specific optimizations: 10 // 11 // - Deduplication of runtime calls, e.g., omp_get_thread_num. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/IPO/OpenMPOpt.h" 16 17 #include "llvm/ADT/EnumeratedArray.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/Analysis/CallGraph.h" 20 #include "llvm/Analysis/CallGraphSCCPass.h" 21 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 22 #include "llvm/Analysis/ValueTracking.h" 23 #include "llvm/Frontend/OpenMP/OMPConstants.h" 24 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" 25 #include "llvm/InitializePasses.h" 26 #include "llvm/Support/CommandLine.h" 27 #include "llvm/Transforms/IPO.h" 28 #include "llvm/Transforms/IPO/Attributor.h" 29 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 30 #include "llvm/Transforms/Utils/CallGraphUpdater.h" 31 32 using namespace llvm; 33 using namespace omp; 34 35 #define DEBUG_TYPE "openmp-opt" 36 37 static cl::opt<bool> DisableOpenMPOptimizations( 38 "openmp-opt-disable", cl::ZeroOrMore, 39 cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, 40 cl::init(false)); 41 42 static cl::opt<bool> EnableParallelRegionMerging( 43 "openmp-opt-enable-merging", cl::ZeroOrMore, 44 cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, 45 cl::init(false)); 46 47 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false), 48 cl::Hidden); 49 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels", 50 cl::init(false), cl::Hidden); 51 52 static cl::opt<bool> HideMemoryTransferLatency( 53 "openmp-hide-memory-transfer-latency", 54 cl::desc("[WIP] Tries to hide the latency of host to device memory" 55 " transfers"), 56 cl::Hidden, cl::init(false)); 57 58 STATISTIC(NumOpenMPRuntimeCallsDeduplicated, 59 "Number of OpenMP runtime calls deduplicated"); 60 STATISTIC(NumOpenMPParallelRegionsDeleted, 61 "Number of OpenMP parallel regions deleted"); 62 STATISTIC(NumOpenMPRuntimeFunctionsIdentified, 63 "Number of OpenMP runtime functions identified"); 64 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified, 65 "Number of OpenMP runtime function uses identified"); 66 STATISTIC(NumOpenMPTargetRegionKernels, 67 "Number of OpenMP target region entry points (=kernels) identified"); 68 STATISTIC( 69 NumOpenMPParallelRegionsReplacedInGPUStateMachine, 70 "Number of OpenMP parallel regions replaced with ID in GPU state machines"); 71 STATISTIC(NumOpenMPParallelRegionsMerged, 72 "Number of OpenMP parallel regions merged"); 73 74 #if !defined(NDEBUG) 75 static constexpr auto TAG = "[" DEBUG_TYPE "]"; 76 #endif 77 78 namespace { 79 80 struct AAICVTracker; 81 82 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for 83 /// Attributor runs. 
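/// For example, RFIs[OMPRTL___kmpc_fork_call] describes the declaration and
/// the collected uses of __kmpc_fork_call, while ICVs[ICV_nthreads] describes
/// the nthreads ICV together with its getter and setter runtime functions.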
84 struct OMPInformationCache : public InformationCache { 85 OMPInformationCache(Module &M, AnalysisGetter &AG, 86 BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, 87 SmallPtrSetImpl<Kernel> &Kernels) 88 : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), 89 Kernels(Kernels) { 90 91 OMPBuilder.initialize(); 92 initializeRuntimeFunctions(); 93 initializeInternalControlVars(); 94 } 95 96 /// Generic information that describes an internal control variable. 97 struct InternalControlVarInfo { 98 /// The kind, as described by InternalControlVar enum. 99 InternalControlVar Kind; 100 101 /// The name of the ICV. 102 StringRef Name; 103 104 /// Environment variable associated with this ICV. 105 StringRef EnvVarName; 106 107 /// Initial value kind. 108 ICVInitValue InitKind; 109 110 /// Initial value. 111 ConstantInt *InitValue; 112 113 /// Setter RTL function associated with this ICV. 114 RuntimeFunction Setter; 115 116 /// Getter RTL function associated with this ICV. 117 RuntimeFunction Getter; 118 119 /// RTL Function corresponding to the override clause of this ICV 120 RuntimeFunction Clause; 121 }; 122 123 /// Generic information that describes a runtime function 124 struct RuntimeFunctionInfo { 125 126 /// The kind, as described by the RuntimeFunction enum. 127 RuntimeFunction Kind; 128 129 /// The name of the function. 130 StringRef Name; 131 132 /// Flag to indicate a variadic function. 133 bool IsVarArg; 134 135 /// The return type of the function. 136 Type *ReturnType; 137 138 /// The argument types of the function. 139 SmallVector<Type *, 8> ArgumentTypes; 140 141 /// The declaration if available. 142 Function *Declaration = nullptr; 143 144 /// Uses of this runtime function per function containing the use. 145 using UseVector = SmallVector<Use *, 16>; 146 147 /// Clear UsesMap for runtime function. 148 void clearUsesMap() { UsesMap.clear(); } 149 150 /// Boolean conversion that is true if the runtime function was found. 151 operator bool() const { return Declaration; } 152 153 /// Return the vector of uses in function \p F. 154 UseVector &getOrCreateUseVector(Function *F) { 155 std::shared_ptr<UseVector> &UV = UsesMap[F]; 156 if (!UV) 157 UV = std::make_shared<UseVector>(); 158 return *UV; 159 } 160 161 /// Return the vector of uses in function \p F or `nullptr` if there are 162 /// none. 163 const UseVector *getUseVector(Function &F) const { 164 auto I = UsesMap.find(&F); 165 if (I != UsesMap.end()) 166 return I->second.get(); 167 return nullptr; 168 } 169 170 /// Return how many functions contain uses of this runtime function. 171 size_t getNumFunctionsWithUses() const { return UsesMap.size(); } 172 173 /// Return the number of arguments (or the minimal number for variadic 174 /// functions). 175 size_t getNumArgs() const { return ArgumentTypes.size(); } 176 177 /// Run the callback \p CB on each use and forget the use if the result is 178 /// true. The callback will be fed the function in which the use was 179 /// encountered as second argument. 180 void foreachUse(SmallVectorImpl<Function *> &SCC, 181 function_ref<bool(Use &, Function &)> CB) { 182 for (Function *F : SCC) 183 foreachUse(CB, F); 184 } 185 186 /// Run the callback \p CB on each use within the function \p F and forget 187 /// the use if the result is true. 
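    ///
    /// Illustrative use on a RuntimeFunctionInfo instance `RFI` (the callback
    /// body here is hypothetical); returning true forgets the use:
    ///
    ///   RFI.foreachUse([](Use &U, Function &F) { return false; }, &F);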
188 void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) { 189 SmallVector<unsigned, 8> ToBeDeleted; 190 ToBeDeleted.clear(); 191 192 unsigned Idx = 0; 193 UseVector &UV = getOrCreateUseVector(F); 194 195 for (Use *U : UV) { 196 if (CB(*U, *F)) 197 ToBeDeleted.push_back(Idx); 198 ++Idx; 199 } 200 201 // Remove the to-be-deleted indices in reverse order as prior 202 // modifications will not modify the smaller indices. 203 while (!ToBeDeleted.empty()) { 204 unsigned Idx = ToBeDeleted.pop_back_val(); 205 UV[Idx] = UV.back(); 206 UV.pop_back(); 207 } 208 } 209 210 private: 211 /// Map from functions to all uses of this runtime function contained in 212 /// them. 213 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap; 214 }; 215 216 /// An OpenMP-IR-Builder instance 217 OpenMPIRBuilder OMPBuilder; 218 219 /// Map from runtime function kind to the runtime function description. 220 EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction, 221 RuntimeFunction::OMPRTL___last> 222 RFIs; 223 224 /// Map from ICV kind to the ICV description. 225 EnumeratedArray<InternalControlVarInfo, InternalControlVar, 226 InternalControlVar::ICV___last> 227 ICVs; 228 229 /// Helper to initialize all internal control variable information for those 230 /// defined in OMPKinds.def. 231 void initializeInternalControlVars() { 232 #define ICV_RT_SET(_Name, RTL) \ 233 { \ 234 auto &ICV = ICVs[_Name]; \ 235 ICV.Setter = RTL; \ 236 } 237 #define ICV_RT_GET(Name, RTL) \ 238 { \ 239 auto &ICV = ICVs[Name]; \ 240 ICV.Getter = RTL; \ 241 } 242 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \ 243 { \ 244 auto &ICV = ICVs[Enum]; \ 245 ICV.Name = _Name; \ 246 ICV.Kind = Enum; \ 247 ICV.InitKind = Init; \ 248 ICV.EnvVarName = _EnvVarName; \ 249 switch (ICV.InitKind) { \ 250 case ICV_IMPLEMENTATION_DEFINED: \ 251 ICV.InitValue = nullptr; \ 252 break; \ 253 case ICV_ZERO: \ 254 ICV.InitValue = ConstantInt::get( \ 255 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \ 256 break; \ 257 case ICV_FALSE: \ 258 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \ 259 break; \ 260 case ICV_LAST: \ 261 break; \ 262 } \ 263 } 264 #include "llvm/Frontend/OpenMP/OMPKinds.def" 265 } 266 267 /// Returns true if the function declaration \p F matches the runtime 268 /// function types, that is, return type \p RTFRetType, and argument types 269 /// \p RTFArgTypes. 270 static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, 271 SmallVector<Type *, 8> &RTFArgTypes) { 272 // TODO: We should output information to the user (under debug output 273 // and via remarks). 274 275 if (!F) 276 return false; 277 if (F->getReturnType() != RTFRetType) 278 return false; 279 if (F->arg_size() != RTFArgTypes.size()) 280 return false; 281 282 auto RTFTyIt = RTFArgTypes.begin(); 283 for (Argument &Arg : F->args()) { 284 if (Arg.getType() != *RTFTyIt) 285 return false; 286 287 ++RTFTyIt; 288 } 289 290 return true; 291 } 292 293 // Helper to collect all uses of the declaration in the UsesMap. 294 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) { 295 unsigned NumUses = 0; 296 if (!RFI.Declaration) 297 return NumUses; 298 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration); 299 300 if (CollectStats) { 301 NumOpenMPRuntimeFunctionsIdentified += 1; 302 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses(); 303 } 304 305 // TODO: We directly convert uses into proper calls and unknown uses. 
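    // Instruction users inside the module slice are recorded under their
    // containing function, instruction users outside the slice are ignored,
    // and non-instruction users (e.g., constant expressions) are kept in the
    // `nullptr` bucket of the UsesMap.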
306 for (Use &U : RFI.Declaration->uses()) { 307 if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { 308 if (ModuleSlice.count(UserI->getFunction())) { 309 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); 310 ++NumUses; 311 } 312 } else { 313 RFI.getOrCreateUseVector(nullptr).push_back(&U); 314 ++NumUses; 315 } 316 } 317 return NumUses; 318 } 319 320 // Helper function to recollect uses of all runtime functions. 321 void recollectUses() { 322 for (int Idx = 0; Idx < RFIs.size(); ++Idx) { 323 auto &RFI = RFIs[static_cast<RuntimeFunction>(Idx)]; 324 RFI.clearUsesMap(); 325 collectUses(RFI, /*CollectStats*/ false); 326 } 327 } 328 329 /// Helper to initialize all runtime function information for those defined 330 /// in OpenMPKinds.def. 331 void initializeRuntimeFunctions() { 332 Module &M = *((*ModuleSlice.begin())->getParent()); 333 334 // Helper macros for handling __VA_ARGS__ in OMP_RTL 335 #define OMP_TYPE(VarName, ...) \ 336 Type *VarName = OMPBuilder.VarName; \ 337 (void)VarName; 338 339 #define OMP_ARRAY_TYPE(VarName, ...) \ 340 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \ 341 (void)VarName##Ty; \ 342 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \ 343 (void)VarName##PtrTy; 344 345 #define OMP_FUNCTION_TYPE(VarName, ...) \ 346 FunctionType *VarName = OMPBuilder.VarName; \ 347 (void)VarName; \ 348 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 349 (void)VarName##Ptr; 350 351 #define OMP_STRUCT_TYPE(VarName, ...) \ 352 StructType *VarName = OMPBuilder.VarName; \ 353 (void)VarName; \ 354 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ 355 (void)VarName##Ptr; 356 357 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \ 358 { \ 359 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \ 360 Function *F = M.getFunction(_Name); \ 361 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \ 362 auto &RFI = RFIs[_Enum]; \ 363 RFI.Kind = _Enum; \ 364 RFI.Name = _Name; \ 365 RFI.IsVarArg = _IsVarArg; \ 366 RFI.ReturnType = OMPBuilder._ReturnType; \ 367 RFI.ArgumentTypes = std::move(ArgsTypes); \ 368 RFI.Declaration = F; \ 369 unsigned NumUses = collectUses(RFI); \ 370 (void)NumUses; \ 371 LLVM_DEBUG({ \ 372 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \ 373 << " found\n"; \ 374 if (RFI.Declaration) \ 375 dbgs() << TAG << "-> got " << NumUses << " uses in " \ 376 << RFI.getNumFunctionsWithUses() \ 377 << " different functions.\n"; \ 378 }); \ 379 } \ 380 } 381 #include "llvm/Frontend/OpenMP/OMPKinds.def" 382 383 // TODO: We should attach the attributes defined in OMPKinds.def. 384 } 385 386 /// Collection of known kernels (\see Kernel) in the module. 387 SmallPtrSetImpl<Kernel> &Kernels; 388 }; 389 390 /// Used to map the values physically (in the IR) stored in an offload 391 /// array, to a vector in memory. 392 struct OffloadArray { 393 /// Physical array (in the IR). 394 AllocaInst *Array = nullptr; 395 /// Mapped values. 396 SmallVector<Value *, 8> StoredValues; 397 /// Last stores made in the offload array. 398 SmallVector<StoreInst *, 8> LastAccesses; 399 400 OffloadArray() = default; 401 402 /// Initializes the OffloadArray with the values stored in \p Array before 403 /// instruction \p Before is reached. Returns false if the initialization 404 /// fails. 405 /// This MUST be used immediately after the construction of the object. 
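  ///
  /// A rough sketch of the IR this handles (names illustrative):
  ///
  ///   %offload_baseptrs = alloca [2 x i8*]
  ///   %gep0 = getelementptr ... %offload_baseptrs, i64 0, i64 0
  ///   store i8* %base0, i8** %gep0
  ///   ...
  ///   call void @__tgt_target_data_begin_mapper(...)
  ///
  /// Initializing with %offload_baseptrs and the mapper call records the
  /// underlying objects of the stored values in StoredValues and the stores
  /// themselves in LastAccesses.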
406 bool initialize(AllocaInst &Array, Instruction &Before) { 407 if (!Array.getAllocatedType()->isArrayTy()) 408 return false; 409 410 if (!getValues(Array, Before)) 411 return false; 412 413 this->Array = &Array; 414 return true; 415 } 416 417 static const unsigned DeviceIDArgNum = 1; 418 static const unsigned BasePtrsArgNum = 3; 419 static const unsigned PtrsArgNum = 4; 420 static const unsigned SizesArgNum = 5; 421 422 private: 423 /// Traverses the BasicBlock where \p Array is, collecting the stores made to 424 /// \p Array, leaving StoredValues with the values stored before the 425 /// instruction \p Before is reached. 426 bool getValues(AllocaInst &Array, Instruction &Before) { 427 // Initialize container. 428 const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements(); 429 StoredValues.assign(NumValues, nullptr); 430 LastAccesses.assign(NumValues, nullptr); 431 432 // TODO: This assumes the instruction \p Before is in the same 433 // BasicBlock as Array. Make it general, for any control flow graph. 434 BasicBlock *BB = Array.getParent(); 435 if (BB != Before.getParent()) 436 return false; 437 438 const DataLayout &DL = Array.getModule()->getDataLayout(); 439 const unsigned int PointerSize = DL.getPointerSize(); 440 441 for (Instruction &I : *BB) { 442 if (&I == &Before) 443 break; 444 445 if (!isa<StoreInst>(&I)) 446 continue; 447 448 auto *S = cast<StoreInst>(&I); 449 int64_t Offset = -1; 450 auto *Dst = 451 GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL); 452 if (Dst == &Array) { 453 int64_t Idx = Offset / PointerSize; 454 StoredValues[Idx] = getUnderlyingObject(S->getValueOperand()); 455 LastAccesses[Idx] = S; 456 } 457 } 458 459 return isFilled(); 460 } 461 462 /// Returns true if all values in StoredValues and 463 /// LastAccesses are not nullptrs. 464 bool isFilled() { 465 const unsigned NumValues = StoredValues.size(); 466 for (unsigned I = 0; I < NumValues; ++I) { 467 if (!StoredValues[I] || !LastAccesses[I]) 468 return false; 469 } 470 471 return true; 472 } 473 }; 474 475 struct OpenMPOpt { 476 477 using OptimizationRemarkGetter = 478 function_ref<OptimizationRemarkEmitter &(Function *)>; 479 480 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater, 481 OptimizationRemarkGetter OREGetter, 482 OMPInformationCache &OMPInfoCache, Attributor &A) 483 : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater), 484 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {} 485 486 /// Check if any remarks are enabled for openmp-opt 487 bool remarksEnabled() { 488 auto &Ctx = M.getContext(); 489 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE); 490 } 491 492 /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. 493 bool run() { 494 if (SCC.empty()) 495 return false; 496 497 bool Changed = false; 498 499 LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() 500 << " functions in a slice with " 501 << OMPInfoCache.ModuleSlice.size() << " functions\n"); 502 503 if (PrintICVValues) 504 printICVs(); 505 if (PrintOpenMPKernels) 506 printKernels(); 507 508 Changed |= rewriteDeviceCodeStateMachine(); 509 510 Changed |= runAttributor(); 511 512 // Recollect uses, in case Attributor deleted any. 
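    // Deleted calls would otherwise leave dangling Use pointers in the
    // per-function use vectors, so every RuntimeFunctionInfo is rebuilt from
    // the current IR.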
513 OMPInfoCache.recollectUses(); 514 515 Changed |= deleteParallelRegions(); 516 if (HideMemoryTransferLatency) 517 Changed |= hideMemTransfersLatency(); 518 if (remarksEnabled()) 519 analysisGlobalization(); 520 Changed |= deduplicateRuntimeCalls(); 521 if (EnableParallelRegionMerging) { 522 if (mergeParallelRegions()) { 523 deduplicateRuntimeCalls(); 524 Changed = true; 525 } 526 } 527 528 return Changed; 529 } 530 531 /// Print initial ICV values for testing. 532 /// FIXME: This should be done from the Attributor once it is added. 533 void printICVs() const { 534 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel, 535 ICV_proc_bind}; 536 537 for (Function *F : OMPInfoCache.ModuleSlice) { 538 for (auto ICV : ICVs) { 539 auto ICVInfo = OMPInfoCache.ICVs[ICV]; 540 auto Remark = [&](OptimizationRemark OR) { 541 return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name) 542 << " Value: " 543 << (ICVInfo.InitValue 544 ? ICVInfo.InitValue->getValue().toString(10, true) 545 : "IMPLEMENTATION_DEFINED"); 546 }; 547 548 emitRemarkOnFunction(F, "OpenMPICVTracker", Remark); 549 } 550 } 551 } 552 553 /// Print OpenMP GPU kernels for testing. 554 void printKernels() const { 555 for (Function *F : SCC) { 556 if (!OMPInfoCache.Kernels.count(F)) 557 continue; 558 559 auto Remark = [&](OptimizationRemark OR) { 560 return OR << "OpenMP GPU kernel " 561 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n"; 562 }; 563 564 emitRemarkOnFunction(F, "OpenMPGPU", Remark); 565 } 566 } 567 568 /// Return the call if \p U is a callee use in a regular call. If \p RFI is 569 /// given it has to be the callee or a nullptr is returned. 570 static CallInst *getCallIfRegularCall( 571 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 572 CallInst *CI = dyn_cast<CallInst>(U.getUser()); 573 if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && 574 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 575 return CI; 576 return nullptr; 577 } 578 579 /// Return the call if \p V is a regular call. If \p RFI is given it has to be 580 /// the callee or a nullptr is returned. 581 static CallInst *getCallIfRegularCall( 582 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { 583 CallInst *CI = dyn_cast<CallInst>(&V); 584 if (CI && !CI->hasOperandBundles() && 585 (!RFI || CI->getCalledFunction() == RFI->Declaration)) 586 return CI; 587 return nullptr; 588 } 589 590 private: 591 /// Merge parallel regions when it is safe. 592 bool mergeParallelRegions() { 593 const unsigned CallbackCalleeOperand = 2; 594 const unsigned CallbackFirstArgOperand = 3; 595 using InsertPointTy = OpenMPIRBuilder::InsertPointTy; 596 597 // Check if there are any __kmpc_fork_call calls to merge. 598 OMPInformationCache::RuntimeFunctionInfo &RFI = 599 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 600 601 if (!RFI.Declaration) 602 return false; 603 604 // Check if there any __kmpc_push_proc_bind calls for explicit affinities. 605 OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI = 606 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind]; 607 608 // Defensively abort if explicit affinities are set. 609 // TODO: Track ICV proc_bind to merge when mergable regions have the same 610 // affinity. 
611 if (ProcBindRFI.Declaration) 612 return false; 613 614 bool Changed = false; 615 LoopInfo *LI = nullptr; 616 DominatorTree *DT = nullptr; 617 618 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap; 619 620 BasicBlock *StartBB = nullptr, *EndBB = nullptr; 621 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, 622 BasicBlock &ContinuationIP) { 623 BasicBlock *CGStartBB = CodeGenIP.getBlock(); 624 BasicBlock *CGEndBB = 625 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); 626 assert(StartBB != nullptr && "StartBB should not be null"); 627 CGStartBB->getTerminator()->setSuccessor(0, StartBB); 628 assert(EndBB != nullptr && "EndBB should not be null"); 629 EndBB->getTerminator()->setSuccessor(0, CGEndBB); 630 }; 631 632 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &, 633 Value &Inner, Value *&ReplacementValue) -> InsertPointTy { 634 ReplacementValue = &Inner; 635 return CodeGenIP; 636 }; 637 638 auto FiniCB = [&](InsertPointTy CodeGenIP) {}; 639 640 // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all 641 // contained in BB and only separated by instructions that can be 642 // redundantly executed in parallel. The block BB is split before the first 643 // call (in MergableCIs) and after the last so the entire region we merge 644 // into a single parallel region is contained in a single basic block 645 // without any other instructions. We use the OpenMPIRBuilder to outline 646 // that block and call the resulting function via __kmpc_fork_call. 647 auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) { 648 // TODO: Change the interface to allow single CIs expanded, e.g, to 649 // include an outer loop. 650 assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); 651 652 auto Remark = [&](OptimizationRemark OR) { 653 OR << "Parallel region at " 654 << ore::NV("OpenMPParallelMergeFront", 655 MergableCIs.front()->getDebugLoc()) 656 << " merged with parallel regions at "; 657 for (auto *CI : 658 llvm::make_range(MergableCIs.begin() + 1, MergableCIs.end())) { 659 OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()); 660 if (CI != MergableCIs.back()) 661 OR << ", "; 662 } 663 return OR; 664 }; 665 666 emitRemark<OptimizationRemark>(MergableCIs.front(), 667 "OpenMPParallelRegionMerging", Remark); 668 669 Function *OriginalFn = BB->getParent(); 670 LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size() 671 << " parallel regions in " << OriginalFn->getName() 672 << "\n"); 673 674 // Isolate the calls to merge in a separate block. 675 EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI); 676 BasicBlock *AfterBB = 677 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI); 678 StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, 679 "omp.par.merged"); 680 681 assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG"); 682 const DebugLoc DL = BB->getTerminator()->getDebugLoc(); 683 BB->getTerminator()->eraseFromParent(); 684 685 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()), 686 DL); 687 IRBuilder<>::InsertPoint AllocaIP( 688 &OriginalFn->getEntryBlock(), 689 OriginalFn->getEntryBlock().getFirstInsertionPt()); 690 // Create the merged parallel region with default proc binding, to 691 // avoid overriding binding settings, and without explicit cancellation. 
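      // After createParallel and the branch created below, the CFG looks
      // roughly like this (sketch): BB reaches the outlined body
      // (StartBB ... EndBB) via the emitted __kmpc_fork_call, and the block
      // at AfterIP then branches to AfterBB.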
692 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel( 693 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, 694 OMP_PROC_BIND_default, /* IsCancellable */ false); 695 BranchInst::Create(AfterBB, AfterIP.getBlock()); 696 697 // Perform the actual outlining. 698 OMPInfoCache.OMPBuilder.finalize(); 699 700 Function *OutlinedFn = MergableCIs.front()->getCaller(); 701 702 // Replace the __kmpc_fork_call calls with direct calls to the outlined 703 // callbacks. 704 SmallVector<Value *, 8> Args; 705 for (auto *CI : MergableCIs) { 706 Value *Callee = 707 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); 708 FunctionType *FT = 709 cast<FunctionType>(Callee->getType()->getPointerElementType()); 710 Args.clear(); 711 Args.push_back(OutlinedFn->getArg(0)); 712 Args.push_back(OutlinedFn->getArg(1)); 713 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 714 U < E; ++U) 715 Args.push_back(CI->getArgOperand(U)); 716 717 CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); 718 if (CI->getDebugLoc()) 719 NewCI->setDebugLoc(CI->getDebugLoc()); 720 721 // Forward parameter attributes from the callback to the callee. 722 for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); 723 U < E; ++U) 724 for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) 725 NewCI->addParamAttr( 726 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); 727 728 // Emit an explicit barrier to replace the implicit fork-join barrier. 729 if (CI != MergableCIs.back()) { 730 // TODO: Remove barrier if the merged parallel region includes the 731 // 'nowait' clause. 732 OMPInfoCache.OMPBuilder.createBarrier( 733 InsertPointTy(NewCI->getParent(), 734 NewCI->getNextNode()->getIterator()), 735 OMPD_parallel); 736 } 737 738 auto Remark = [&](OptimizationRemark OR) { 739 return OR << "Parallel region at " 740 << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()) 741 << " merged with " 742 << ore::NV("OpenMPParallelMergeFront", 743 MergableCIs.front()->getDebugLoc()); 744 }; 745 if (CI != MergableCIs.front()) 746 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging", 747 Remark); 748 749 CI->eraseFromParent(); 750 } 751 752 assert(OutlinedFn != OriginalFn && "Outlining failed"); 753 CGUpdater.registerOutlinedFunction(*OutlinedFn); 754 CGUpdater.reanalyzeFunction(*OriginalFn); 755 756 NumOpenMPParallelRegionsMerged += MergableCIs.size(); 757 758 return true; 759 }; 760 761 // Helper function that identifes sequences of 762 // __kmpc_fork_call uses in a basic block. 763 auto DetectPRsCB = [&](Use &U, Function &F) { 764 CallInst *CI = getCallIfRegularCall(U, &RFI); 765 BB2PRMap[CI->getParent()].insert(CI); 766 767 return false; 768 }; 769 770 BB2PRMap.clear(); 771 RFI.foreachUse(SCC, DetectPRsCB); 772 SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector; 773 // Find mergable parallel regions within a basic block that are 774 // safe to merge, that is any in-between instructions can safely 775 // execute in parallel after merging. 776 // TODO: support merging across basic-blocks. 777 for (auto &It : BB2PRMap) { 778 auto &CIs = It.getSecond(); 779 if (CIs.size() < 2) 780 continue; 781 782 BasicBlock *BB = It.getFirst(); 783 SmallVector<CallInst *, 4> MergableCIs; 784 785 // Find maximal number of parallel region CIs that are safe to merge. 
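      // Walk the block: fork calls extend the current candidate group, any
      // instruction that is not safe to execute speculatively ends it, and
      // groups with at least two calls are queued for merging.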
786 for (Instruction &I : *BB) { 787 if (CIs.count(&I)) { 788 MergableCIs.push_back(cast<CallInst>(&I)); 789 continue; 790 } 791 792 if (isSafeToSpeculativelyExecute(&I, &I, DT)) 793 continue; 794 795 if (MergableCIs.size() > 1) { 796 MergableCIsVector.push_back(MergableCIs); 797 LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size() 798 << " parallel regions in block " << BB->getName() 799 << " of function " << BB->getParent()->getName() 800 << "\n";); 801 } 802 803 MergableCIs.clear(); 804 } 805 806 if (!MergableCIsVector.empty()) { 807 Changed = true; 808 809 for (auto &MergableCIs : MergableCIsVector) 810 Merge(MergableCIs, BB); 811 } 812 } 813 814 if (Changed) { 815 // Update RFI info to set it up for later passes. 816 RFI.clearUsesMap(); 817 OMPInfoCache.collectUses(RFI, /* CollectStats */ false); 818 819 // Collect uses for the emitted barrier call. 820 OMPInformationCache::RuntimeFunctionInfo &BarrierRFI = 821 OMPInfoCache.RFIs[OMPRTL___kmpc_barrier]; 822 BarrierRFI.clearUsesMap(); 823 OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false); 824 } 825 826 return Changed; 827 } 828 829 /// Try to delete parallel regions if possible. 830 bool deleteParallelRegions() { 831 const unsigned CallbackCalleeOperand = 2; 832 833 OMPInformationCache::RuntimeFunctionInfo &RFI = 834 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; 835 836 if (!RFI.Declaration) 837 return false; 838 839 bool Changed = false; 840 auto DeleteCallCB = [&](Use &U, Function &) { 841 CallInst *CI = getCallIfRegularCall(U); 842 if (!CI) 843 return false; 844 auto *Fn = dyn_cast<Function>( 845 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts()); 846 if (!Fn) 847 return false; 848 if (!Fn->onlyReadsMemory()) 849 return false; 850 if (!Fn->hasFnAttribute(Attribute::WillReturn)) 851 return false; 852 853 LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in " 854 << CI->getCaller()->getName() << "\n"); 855 856 auto Remark = [&](OptimizationRemark OR) { 857 return OR << "Parallel region in " 858 << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName()) 859 << " deleted"; 860 }; 861 emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion", 862 Remark); 863 864 CGUpdater.removeCallSite(*CI); 865 CI->eraseFromParent(); 866 Changed = true; 867 ++NumOpenMPParallelRegionsDeleted; 868 return true; 869 }; 870 871 RFI.foreachUse(SCC, DeleteCallCB); 872 873 return Changed; 874 } 875 876 /// Try to eliminate runtime calls by reusing existing ones. 877 bool deduplicateRuntimeCalls() { 878 bool Changed = false; 879 880 RuntimeFunction DeduplicableRuntimeCallIDs[] = { 881 OMPRTL_omp_get_num_threads, 882 OMPRTL_omp_in_parallel, 883 OMPRTL_omp_get_cancellation, 884 OMPRTL_omp_get_thread_limit, 885 OMPRTL_omp_get_supported_active_levels, 886 OMPRTL_omp_get_level, 887 OMPRTL_omp_get_ancestor_thread_num, 888 OMPRTL_omp_get_team_size, 889 OMPRTL_omp_get_active_level, 890 OMPRTL_omp_in_final, 891 OMPRTL_omp_get_proc_bind, 892 OMPRTL_omp_get_num_places, 893 OMPRTL_omp_get_num_procs, 894 OMPRTL_omp_get_place_num, 895 OMPRTL_omp_get_partition_num_places, 896 OMPRTL_omp_get_partition_place_nums}; 897 898 // Global-tid is handled separately. 
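    // Calls to __kmpc_global_thread_num are commonly forwarded as arguments,
    // so arguments known to carry the global thread id are collected first
    // and preferred as replacement values over keeping a call around.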
899 SmallSetVector<Value *, 16> GTIdArgs; 900 collectGlobalThreadIdArguments(GTIdArgs); 901 LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size() 902 << " global thread ID arguments\n"); 903 904 for (Function *F : SCC) { 905 for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) 906 Changed |= deduplicateRuntimeCalls( 907 *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); 908 909 // __kmpc_global_thread_num is special as we can replace it with an 910 // argument in enough cases to make it worth trying. 911 Value *GTIdArg = nullptr; 912 for (Argument &Arg : F->args()) 913 if (GTIdArgs.count(&Arg)) { 914 GTIdArg = &Arg; 915 break; 916 } 917 Changed |= deduplicateRuntimeCalls( 918 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg); 919 } 920 921 return Changed; 922 } 923 924 /// Tries to hide the latency of runtime calls that involve host to 925 /// device memory transfers by splitting them into their "issue" and "wait" 926 /// versions. The "issue" is moved upwards as much as possible. The "wait" is 927 /// moved downards as much as possible. The "issue" issues the memory transfer 928 /// asynchronously, returning a handle. The "wait" waits in the returned 929 /// handle for the memory transfer to finish. 930 bool hideMemTransfersLatency() { 931 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper]; 932 bool Changed = false; 933 auto SplitMemTransfers = [&](Use &U, Function &Decl) { 934 auto *RTCall = getCallIfRegularCall(U, &RFI); 935 if (!RTCall) 936 return false; 937 938 OffloadArray OffloadArrays[3]; 939 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays)) 940 return false; 941 942 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays)); 943 944 // TODO: Check if can be moved upwards. 945 bool WasSplit = false; 946 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall); 947 if (WaitMovementPoint) 948 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint); 949 950 Changed |= WasSplit; 951 return WasSplit; 952 }; 953 RFI.foreachUse(SCC, SplitMemTransfers); 954 955 return Changed; 956 } 957 958 void analysisGlobalization() { 959 RuntimeFunction GlobalizationRuntimeIDs[] = { 960 OMPRTL___kmpc_data_sharing_coalesced_push_stack, 961 OMPRTL___kmpc_data_sharing_push_stack}; 962 963 for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { 964 auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; 965 966 auto CheckGlobalization = [&](Use &U, Function &Decl) { 967 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) { 968 auto Remark = [&](OptimizationRemarkAnalysis ORA) { 969 return ORA 970 << "Found thread data sharing on the GPU. " 971 << "Expect degraded performance due to data globalization."; 972 }; 973 emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization", 974 Remark); 975 } 976 977 return false; 978 }; 979 980 RFI.foreachUse(SCC, CheckGlobalization); 981 } 982 return; 983 } 984 985 /// Maps the values stored in the offload arrays passed as arguments to 986 /// \p RuntimeCall into the offload arrays in \p OAs. 987 bool getValuesInOffloadArrays(CallInst &RuntimeCall, 988 MutableArrayRef<OffloadArray> OAs) { 989 assert(OAs.size() == 3 && "Need space for three offload arrays!"); 990 991 // A runtime call that involves memory offloading looks something like: 992 // call void @__tgt_target_data_begin_mapper(arg0, arg1, 993 // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes, 994 // ...) 
    // So, the idea is to access the allocas that allocate space for these
    // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
    // Therefore:
    // i8** %offload_baseptrs.
    Value *BasePtrsArg =
        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
    // i8** %offload_ptrs.
    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
    // i8** %offload_sizes.
    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);

    // Get values stored in **offload_baseptrs.
    auto *V = getUnderlyingObject(BasePtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *BasePtrsArray = cast<AllocaInst>(V);
    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_ptrs.
    V = getUnderlyingObject(PtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *PtrsArray = cast<AllocaInst>(V);
    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_sizes.
    V = getUnderlyingObject(SizesArg);
    // If it's a [constant] global array don't analyze it.
    if (isa<GlobalValue>(V))
      return isa<Constant>(V);
    if (!isa<AllocaInst>(V))
      return false;

    auto *SizesArray = cast<AllocaInst>(V);
    if (!OAs[2].initialize(*SizesArray, RuntimeCall))
      return false;

    return true;
  }

  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
  /// For now this is a way to test that the function getValuesInOffloadArrays
  /// is working properly.
  /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
  void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
    std::string ValuesStr;
    raw_string_ostream Printer(ValuesStr);
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
      BP->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *P : OAs[1].StoredValues) {
      P->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *S : OAs[2].StoredValues) {
      S->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
  }

  /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
  /// moved. Returns nullptr if the movement is not possible, or not worth it.
  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
    //        Make it traverse the CFG.

    Instruction *CurrentI = &RuntimeCall;
    bool IsWorthIt = false;
    while ((CurrentI = CurrentI->getNextNode())) {

      // TODO: Once we detect the regions to be offloaded we should use the
      //       alias analysis manager to check if CurrentI may modify one of
      //       the offloaded regions.
      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
        if (IsWorthIt)
          return CurrentI;

        return nullptr;
      }

      // FIXME: For now, moving the call over anything without side effects
      //        is considered worth it.
1092 IsWorthIt = true; 1093 } 1094 1095 // Return end of BasicBlock. 1096 return RuntimeCall.getParent()->getTerminator(); 1097 } 1098 1099 /// Splits \p RuntimeCall into its "issue" and "wait" counterparts. 1100 bool splitTargetDataBeginRTC(CallInst &RuntimeCall, 1101 Instruction &WaitMovementPoint) { 1102 // Create stack allocated handle (__tgt_async_info) at the beginning of the 1103 // function. Used for storing information of the async transfer, allowing to 1104 // wait on it later. 1105 auto &IRBuilder = OMPInfoCache.OMPBuilder; 1106 auto *F = RuntimeCall.getCaller(); 1107 Instruction *FirstInst = &(F->getEntryBlock().front()); 1108 AllocaInst *Handle = new AllocaInst( 1109 IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); 1110 1111 // Add "issue" runtime call declaration: 1112 // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, 1113 // i8**, i8**, i64*, i64*) 1114 FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction( 1115 M, OMPRTL___tgt_target_data_begin_mapper_issue); 1116 1117 // Change RuntimeCall call site for its asynchronous version. 1118 SmallVector<Value *, 16> Args; 1119 for (auto &Arg : RuntimeCall.args()) 1120 Args.push_back(Arg.get()); 1121 Args.push_back(Handle); 1122 1123 CallInst *IssueCallsite = 1124 CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); 1125 RuntimeCall.eraseFromParent(); 1126 1127 // Add "wait" runtime call declaration: 1128 // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info) 1129 FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction( 1130 M, OMPRTL___tgt_target_data_begin_mapper_wait); 1131 1132 Value *WaitParams[2] = { 1133 IssueCallsite->getArgOperand( 1134 OffloadArray::DeviceIDArgNum), // device_id. 1135 Handle // handle to wait on. 1136 }; 1137 CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); 1138 1139 return true; 1140 } 1141 1142 static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, 1143 bool GlobalOnly, bool &SingleChoice) { 1144 if (CurrentIdent == NextIdent) 1145 return CurrentIdent; 1146 1147 // TODO: Figure out how to actually combine multiple debug locations. For 1148 // now we just keep an existing one if there is a single choice. 1149 if (!GlobalOnly || isa<GlobalValue>(NextIdent)) { 1150 SingleChoice = !CurrentIdent; 1151 return NextIdent; 1152 } 1153 return nullptr; 1154 } 1155 1156 /// Return an `struct ident_t*` value that represents the ones used in the 1157 /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not 1158 /// return a local `struct ident_t*`. For now, if we cannot find a suitable 1159 /// return value we create one from scratch. We also do not yet combine 1160 /// information, e.g., the source locations, see combinedIdentStruct. 1161 Value * 1162 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI, 1163 Function &F, bool GlobalOnly) { 1164 bool SingleChoice = true; 1165 Value *Ident = nullptr; 1166 auto CombineIdentStruct = [&](Use &U, Function &Caller) { 1167 CallInst *CI = getCallIfRegularCall(U, &RFI); 1168 if (!CI || &F != &Caller) 1169 return false; 1170 Ident = combinedIdentStruct(Ident, CI->getArgOperand(0), 1171 /* GlobalOnly */ true, SingleChoice); 1172 return false; 1173 }; 1174 RFI.foreachUse(SCC, CombineIdentStruct); 1175 1176 if (!Ident || !SingleChoice) { 1177 // The IRBuilder uses the insertion block to get to the module, this is 1178 // unfortunate but we work around it for now. 
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
        OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
            &F.getEntryBlock(), F.getEntryBlock().begin()));
      // Create a fallback location if none was found.
      // TODO: Use the debug locations of the calls instead.
      Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
    }
    return Ident;
  }

  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
  /// \p ReplVal if given.
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)
      return false;

    LLVM_DEBUG(
        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value" : "") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    // TODO: Use dominance to find a good position instead.
    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.getNumArgOperands();
      if (NumArgs == 0)
        return true;
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
        return false;
      for (unsigned u = 1; u < NumArgs; ++u)
        if (isa<Instruction>(CB.getArgOperand(u)))
          return false;
      return true;
    };

    if (!ReplVal) {
      for (Use *U : *UV)
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (!CanBeMoved(*CI))
            continue;

          auto Remark = [&](OptimizationRemark OR) {
            auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
            return OR << "OpenMP runtime call "
                      << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
                      << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
          };
          emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);

          CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
          ReplVal = CI;
          break;
        }
      if (!ReplVal)
        return false;
    }

    // If we use a call as a replacement value we need to make sure the ident
    // is valid at the new location. For now we just pick a global one, either
    // existing and used by one of the calls, or created from scratch.
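    // E.g. (illustrative IR), two calls
    //   %tid0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc0)
    //   %tid1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc1)
    // collapse onto a single call hoisted to the entry block whose ident
    // operand is rewritten below to a globally valid location descriptor.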
1245 if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) { 1246 if (CI->getNumArgOperands() > 0 && 1247 CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) { 1248 Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F, 1249 /* GlobalOnly */ true); 1250 CI->setArgOperand(0, Ident); 1251 } 1252 } 1253 1254 bool Changed = false; 1255 auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) { 1256 CallInst *CI = getCallIfRegularCall(U, &RFI); 1257 if (!CI || CI == ReplVal || &F != &Caller) 1258 return false; 1259 assert(CI->getCaller() == &F && "Unexpected call!"); 1260 1261 auto Remark = [&](OptimizationRemark OR) { 1262 return OR << "OpenMP runtime call " 1263 << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated"; 1264 }; 1265 emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark); 1266 1267 CGUpdater.removeCallSite(*CI); 1268 CI->replaceAllUsesWith(ReplVal); 1269 CI->eraseFromParent(); 1270 ++NumOpenMPRuntimeCallsDeduplicated; 1271 Changed = true; 1272 return true; 1273 }; 1274 RFI.foreachUse(SCC, ReplaceAndDeleteCB); 1275 1276 return Changed; 1277 } 1278 1279 /// Collect arguments that represent the global thread id in \p GTIdArgs. 1280 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> >IdArgs) { 1281 // TODO: Below we basically perform a fixpoint iteration with a pessimistic 1282 // initialization. We could define an AbstractAttribute instead and 1283 // run the Attributor here once it can be run as an SCC pass. 1284 1285 // Helper to check the argument \p ArgNo at all call sites of \p F for 1286 // a GTId. 1287 auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) { 1288 if (!F.hasLocalLinkage()) 1289 return false; 1290 for (Use &U : F.uses()) { 1291 if (CallInst *CI = getCallIfRegularCall(U)) { 1292 Value *ArgOp = CI->getArgOperand(ArgNo); 1293 if (CI == &RefCI || GTIdArgs.count(ArgOp) || 1294 getCallIfRegularCall( 1295 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num])) 1296 continue; 1297 } 1298 return false; 1299 } 1300 return true; 1301 }; 1302 1303 // Helper to identify uses of a GTId as GTId arguments. 1304 auto AddUserArgs = [&](Value >Id) { 1305 for (Use &U : GTId.uses()) 1306 if (CallInst *CI = dyn_cast<CallInst>(U.getUser())) 1307 if (CI->isArgOperand(&U)) 1308 if (Function *Callee = CI->getCalledFunction()) 1309 if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI)) 1310 GTIdArgs.insert(Callee->getArg(U.getOperandNo())); 1311 }; 1312 1313 // The argument users of __kmpc_global_thread_num calls are GTIds. 1314 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI = 1315 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]; 1316 1317 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) { 1318 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI)) 1319 AddUserArgs(*CI); 1320 return false; 1321 }); 1322 1323 // Transitively search for more arguments by looking at the users of the 1324 // ones we know already. During the search the GTIdArgs vector is extended 1325 // so we cannot cache the size nor can we use a range based for. 1326 for (unsigned u = 0; u < GTIdArgs.size(); ++u) 1327 AddUserArgs(*GTIdArgs[u]); 1328 } 1329 1330 /// Kernel (=GPU) optimizations and utility functions 1331 /// 1332 ///{{ 1333 1334 /// Check if \p F is a kernel, hence entry point for target offloading. 1335 bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); } 1336 1337 /// Cache to remember the unique kernel for a function. 
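  /// An unset Optional means the query has not been answered yet; a cached
  /// nullptr means no unique kernel could be determined for the function.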
1338 DenseMap<Function *, Optional<Kernel>> UniqueKernelMap; 1339 1340 /// Find the unique kernel that will execute \p F, if any. 1341 Kernel getUniqueKernelFor(Function &F); 1342 1343 /// Find the unique kernel that will execute \p I, if any. 1344 Kernel getUniqueKernelFor(Instruction &I) { 1345 return getUniqueKernelFor(*I.getFunction()); 1346 } 1347 1348 /// Rewrite the device (=GPU) code state machine create in non-SPMD mode in 1349 /// the cases we can avoid taking the address of a function. 1350 bool rewriteDeviceCodeStateMachine(); 1351 1352 /// 1353 ///}} 1354 1355 /// Emit a remark generically 1356 /// 1357 /// This template function can be used to generically emit a remark. The 1358 /// RemarkKind should be one of the following: 1359 /// - OptimizationRemark to indicate a successful optimization attempt 1360 /// - OptimizationRemarkMissed to report a failed optimization attempt 1361 /// - OptimizationRemarkAnalysis to provide additional information about an 1362 /// optimization attempt 1363 /// 1364 /// The remark is built using a callback function provided by the caller that 1365 /// takes a RemarkKind as input and returns a RemarkKind. 1366 template <typename RemarkKind, 1367 typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>> 1368 void emitRemark(Instruction *Inst, StringRef RemarkName, 1369 RemarkCallBack &&RemarkCB) const { 1370 Function *F = Inst->getParent()->getParent(); 1371 auto &ORE = OREGetter(F); 1372 1373 ORE.emit( 1374 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); }); 1375 } 1376 1377 /// Emit a remark on a function. Since only OptimizationRemark is supporting 1378 /// this, it can't be made generic. 1379 void 1380 emitRemarkOnFunction(Function *F, StringRef RemarkName, 1381 function_ref<OptimizationRemark(OptimizationRemark &&)> 1382 &&RemarkCB) const { 1383 auto &ORE = OREGetter(F); 1384 1385 ORE.emit([&]() { 1386 return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F)); 1387 }); 1388 } 1389 1390 /// The underlying module. 1391 Module &M; 1392 1393 /// The SCC we are operating on. 1394 SmallVectorImpl<Function *> &SCC; 1395 1396 /// Callback to update the call graph, the first argument is a removed call, 1397 /// the second an optional replacement call. 1398 CallGraphUpdater &CGUpdater; 1399 1400 /// Callback to get an OptimizationRemarkEmitter from a Function * 1401 OptimizationRemarkGetter OREGetter; 1402 1403 /// OpenMP-specific information cache. Also Used for Attributor runs. 1404 OMPInformationCache &OMPInfoCache; 1405 1406 /// Attributor instance. 1407 Attributor &A; 1408 1409 /// Helper function to run Attributor on SCC. 1410 bool runAttributor() { 1411 if (SCC.empty()) 1412 return false; 1413 1414 registerAAs(); 1415 1416 ChangeStatus Changed = A.run(); 1417 1418 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size() 1419 << " functions, result: " << Changed << ".\n"); 1420 1421 return Changed == ChangeStatus::CHANGED; 1422 } 1423 1424 /// Populate the Attributor with abstract attribute opportunities in the 1425 /// function. 1426 void registerAAs() { 1427 if (SCC.empty()) 1428 return; 1429 1430 // Create CallSite AA for all Getters. 
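    // For every ICV getter runtime function, seed an AAICVTracker at each
    // regular call site so the Attributor can later fold the call to a known
    // ICV value (see AAICVTracker below).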
1431 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) { 1432 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)]; 1433 1434 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter]; 1435 1436 auto CreateAA = [&](Use &U, Function &Caller) { 1437 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI); 1438 if (!CI) 1439 return false; 1440 1441 auto &CB = cast<CallBase>(*CI); 1442 1443 IRPosition CBPos = IRPosition::callsite_function(CB); 1444 A.getOrCreateAAFor<AAICVTracker>(CBPos); 1445 return false; 1446 }; 1447 1448 GetterRFI.foreachUse(SCC, CreateAA); 1449 } 1450 } 1451 }; 1452 1453 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { 1454 if (!OMPInfoCache.ModuleSlice.count(&F)) 1455 return nullptr; 1456 1457 // Use a scope to keep the lifetime of the CachedKernel short. 1458 { 1459 Optional<Kernel> &CachedKernel = UniqueKernelMap[&F]; 1460 if (CachedKernel) 1461 return *CachedKernel; 1462 1463 // TODO: We should use an AA to create an (optimistic and callback 1464 // call-aware) call graph. For now we stick to simple patterns that 1465 // are less powerful, basically the worst fixpoint. 1466 if (isKernel(F)) { 1467 CachedKernel = Kernel(&F); 1468 return *CachedKernel; 1469 } 1470 1471 CachedKernel = nullptr; 1472 if (!F.hasLocalLinkage()) { 1473 1474 // See https://openmp.llvm.org/remarks/OptimizationRemarks.html 1475 auto Remark = [&](OptimizationRemark OR) { 1476 return OR << "[OMP100] Potentially unknown OpenMP target region caller"; 1477 }; 1478 emitRemarkOnFunction(&F, "OMP100", Remark); 1479 1480 return nullptr; 1481 } 1482 } 1483 1484 auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel { 1485 if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) { 1486 // Allow use in equality comparisons. 1487 if (Cmp->isEquality()) 1488 return getUniqueKernelFor(*Cmp); 1489 return nullptr; 1490 } 1491 if (auto *CB = dyn_cast<CallBase>(U.getUser())) { 1492 // Allow direct calls. 1493 if (CB->isCallee(&U)) 1494 return getUniqueKernelFor(*CB); 1495 // Allow the use in __kmpc_kernel_prepare_parallel calls. 1496 if (Function *Callee = CB->getCalledFunction()) 1497 if (Callee->getName() == "__kmpc_kernel_prepare_parallel") 1498 return getUniqueKernelFor(*CB); 1499 return nullptr; 1500 } 1501 // Disallow every other use. 1502 return nullptr; 1503 }; 1504 1505 // TODO: In the future we want to track more than just a unique kernel. 1506 SmallPtrSet<Kernel, 2> PotentialKernels; 1507 OMPInformationCache::foreachUse(F, [&](const Use &U) { 1508 PotentialKernels.insert(GetUniqueKernelForUse(U)); 1509 }); 1510 1511 Kernel K = nullptr; 1512 if (PotentialKernels.size() == 1) 1513 K = *PotentialKernels.begin(); 1514 1515 // Cache the result. 1516 UniqueKernelMap[&F] = K; 1517 1518 return K; 1519 } 1520 1521 bool OpenMPOpt::rewriteDeviceCodeStateMachine() { 1522 OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI = 1523 OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel]; 1524 1525 bool Changed = false; 1526 if (!KernelPrepareParallelRFI) 1527 return Changed; 1528 1529 for (Function *F : SCC) { 1530 1531 // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at 1532 // all. 
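    // Classify every use of F: direct calls are merely counted, uses in
    // comparisons and in __kmpc_kernel_prepare_parallel calls are recorded
    // for rewriting, and anything else marks the use pattern as unknown.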
1533 bool UnknownUse = false; 1534 bool KernelPrepareUse = false; 1535 unsigned NumDirectCalls = 0; 1536 1537 SmallVector<Use *, 2> ToBeReplacedStateMachineUses; 1538 OMPInformationCache::foreachUse(*F, [&](Use &U) { 1539 if (auto *CB = dyn_cast<CallBase>(U.getUser())) 1540 if (CB->isCallee(&U)) { 1541 ++NumDirectCalls; 1542 return; 1543 } 1544 1545 if (isa<ICmpInst>(U.getUser())) { 1546 ToBeReplacedStateMachineUses.push_back(&U); 1547 return; 1548 } 1549 if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall( 1550 *U.getUser(), &KernelPrepareParallelRFI)) { 1551 KernelPrepareUse = true; 1552 ToBeReplacedStateMachineUses.push_back(&U); 1553 return; 1554 } 1555 UnknownUse = true; 1556 }); 1557 1558 // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel 1559 // use. 1560 if (!KernelPrepareUse) 1561 continue; 1562 1563 { 1564 auto Remark = [&](OptimizationRemark OR) { 1565 return OR << "Found a parallel region that is called in a target " 1566 "region but not part of a combined target construct nor " 1567 "nesed inside a target construct without intermediate " 1568 "code. This can lead to excessive register usage for " 1569 "unrelated target regions in the same translation unit " 1570 "due to spurious call edges assumed by ptxas."; 1571 }; 1572 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark); 1573 } 1574 1575 // If this ever hits, we should investigate. 1576 // TODO: Checking the number of uses is not a necessary restriction and 1577 // should be lifted. 1578 if (UnknownUse || NumDirectCalls != 1 || 1579 ToBeReplacedStateMachineUses.size() != 2) { 1580 { 1581 auto Remark = [&](OptimizationRemark OR) { 1582 return OR << "Parallel region is used in " 1583 << (UnknownUse ? "unknown" : "unexpected") 1584 << " ways; will not attempt to rewrite the state machine."; 1585 }; 1586 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark); 1587 } 1588 continue; 1589 } 1590 1591 // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give 1592 // up if the function is not called from a unique kernel. 1593 Kernel K = getUniqueKernelFor(*F); 1594 if (!K) { 1595 { 1596 auto Remark = [&](OptimizationRemark OR) { 1597 return OR << "Parallel region is not known to be called from a " 1598 "unique single target region, maybe the surrounding " 1599 "function has external linkage?; will not attempt to " 1600 "rewrite the state machine use."; 1601 }; 1602 emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl", 1603 Remark); 1604 } 1605 continue; 1606 } 1607 1608 // We now know F is a parallel body function called only from the kernel K. 1609 // We also identified the state machine uses in which we replace the 1610 // function pointer by a new global symbol for identification purposes. This 1611 // ensures only direct calls to the function are left. 1612 1613 { 1614 auto RemarkParalleRegion = [&](OptimizationRemark OR) { 1615 return OR << "Specialize parallel region that is only reached from a " 1616 "single target region to avoid spurious call edges and " 1617 "excessive register usage in other target regions. " 1618 "(parallel region ID: " 1619 << ore::NV("OpenMPParallelRegion", F->getName()) 1620 << ", kernel ID: " 1621 << ore::NV("OpenMPTargetRegion", K->getName()) << ")"; 1622 }; 1623 emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", 1624 RemarkParalleRegion); 1625 auto RemarkKernel = [&](OptimizationRemark OR) { 1626 return OR << "Target region containing the parallel region that is " 1627 "specialized. 
(parallel region ID: " 1628 << ore::NV("OpenMPParallelRegion", F->getName()) 1629 << ", kernel ID: " 1630 << ore::NV("OpenMPTargetRegion", K->getName()) << ")"; 1631 }; 1632 emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel); 1633 } 1634 1635 Module &M = *F->getParent(); 1636 Type *Int8Ty = Type::getInt8Ty(M.getContext()); 1637 1638 auto *ID = new GlobalVariable( 1639 M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage, 1640 UndefValue::get(Int8Ty), F->getName() + ".ID"); 1641 1642 for (Use *U : ToBeReplacedStateMachineUses) 1643 U->set(ConstantExpr::getBitCast(ID, U->get()->getType())); 1644 1645 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine; 1646 1647 Changed = true; 1648 } 1649 1650 return Changed; 1651 } 1652 1653 /// Abstract Attribute for tracking ICV values. 1654 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> { 1655 using Base = StateWrapper<BooleanState, AbstractAttribute>; 1656 AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {} 1657 1658 void initialize(Attributor &A) override { 1659 Function *F = getAnchorScope(); 1660 if (!F || !A.isFunctionIPOAmendable(*F)) 1661 indicatePessimisticFixpoint(); 1662 } 1663 1664 /// Returns true if value is assumed to be tracked. 1665 bool isAssumedTracked() const { return getAssumed(); } 1666 1667 /// Returns true if value is known to be tracked. 1668 bool isKnownTracked() const { return getAssumed(); } 1669 1670 /// Create an abstract attribute biew for the position \p IRP. 1671 static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A); 1672 1673 /// Return the value with which \p I can be replaced for specific \p ICV. 1674 virtual Optional<Value *> getReplacementValue(InternalControlVar ICV, 1675 const Instruction *I, 1676 Attributor &A) const { 1677 return None; 1678 } 1679 1680 /// Return an assumed unique ICV value if a single candidate is found. If 1681 /// there cannot be one, return a nullptr. If it is not clear yet, return the 1682 /// Optional::NoneType. 1683 virtual Optional<Value *> 1684 getUniqueReplacementValue(InternalControlVar ICV) const = 0; 1685 1686 // Currently only nthreads is being tracked. 1687 // this array will only grow with time. 1688 InternalControlVar TrackableICVs[1] = {ICV_nthreads}; 1689 1690 /// See AbstractAttribute::getName() 1691 const std::string getName() const override { return "AAICVTracker"; } 1692 1693 /// See AbstractAttribute::getIdAddr() 1694 const char *getIdAddr() const override { return &ID; } 1695 1696 /// This function should return true if the type of the \p AA is AAICVTracker 1697 static bool classof(const AbstractAttribute *AA) { 1698 return (AA->getIdAddr() == &ID); 1699 } 1700 1701 static const char ID; 1702 }; 1703 1704 struct AAICVTrackerFunction : public AAICVTracker { 1705 AAICVTrackerFunction(const IRPosition &IRP, Attributor &A) 1706 : AAICVTracker(IRP, A) {} 1707 1708 // FIXME: come up with better string. 1709 const std::string getAsStr() const override { return "ICVTrackerFunction"; } 1710 1711 // FIXME: come up with some stats. 1712 void trackStatistics() const override {} 1713 1714 /// We don't manifest anything for this AA. 1715 ChangeStatus manifest(Attributor &A) override { 1716 return ChangeStatus::UNCHANGED; 1717 } 1718 1719 // Map of ICV to their values at specific program point. 
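  // A mapped value of nullptr means the ICV value is unknown from that
  // instruction onwards (e.g., after an indirect call or an unknown callee).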
1720   EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
1721                   InternalControlVar::ICV___last>
1722       ICVReplacementValuesMap;
1723
1724   ChangeStatus updateImpl(Attributor &A) override {
1725     ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
1726
1727     Function *F = getAnchorScope();
1728
1729     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1730
1731     for (InternalControlVar ICV : TrackableICVs) {
1732       auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1733
1734       auto &ValuesMap = ICVReplacementValuesMap[ICV];
1735       auto TrackValues = [&](Use &U, Function &) {
1736         CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
1737         if (!CI)
1738           return false;
1739
1740         // FIXME: handle setters with more than one argument.
1741         // Track new value.
1742         if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
1743           HasChanged = ChangeStatus::CHANGED;
1744
1745         return false;
1746       };
1747
1748       auto CallCheck = [&](Instruction &I) {
1749         Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
1750         if (ReplVal.hasValue() &&
1751             ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
1752           HasChanged = ChangeStatus::CHANGED;
1753
1754         return true;
1755       };
1756
1757       // Track all changes of an ICV.
1758       SetterRFI.foreachUse(TrackValues, F);
1759
1760       A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
1761                                 /* CheckBBLivenessOnly */ true);
1762
1763       // TODO: Figure out a way to avoid adding an entry in
1764       // ICVReplacementValuesMap.
1765       Instruction *Entry = &F->getEntryBlock().front();
1766       if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
1767         ValuesMap.insert(std::make_pair(Entry, nullptr));
1768     }
1769
1770     return HasChanged;
1771   }
1772
1773   /// Helper to check if \p I is a call and get the value for it if it is
1774   /// unique.
1775   Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
1776                                     InternalControlVar &ICV) const {
1777
1778     const auto *CB = dyn_cast<CallBase>(I);
1779     if (!CB || CB->hasFnAttr("no_openmp") ||
1780         CB->hasFnAttr("no_openmp_routines"))
1781       return None;
1782
1783     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1784     auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
1785     auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1786     Function *CalledFunction = CB->getCalledFunction();
1787
1788     // Indirect call, assume ICV changes.
1789     if (CalledFunction == nullptr)
1790       return nullptr;
1791     if (CalledFunction == GetterRFI.Declaration)
1792       return None;
1793     if (CalledFunction == SetterRFI.Declaration) {
1794       if (ICVReplacementValuesMap[ICV].count(I))
1795         return ICVReplacementValuesMap[ICV].lookup(I);
1796
1797       return nullptr;
1798     }
1799
1800     // Since we don't know, assume it changes the ICV.
1801     if (CalledFunction->isDeclaration())
1802       return nullptr;
1803
1804     const auto &ICVTrackingAA =
1805         A.getAAFor<AAICVTracker>(*this, IRPosition::callsite_returned(*CB));
1806
1807     if (ICVTrackingAA.isAssumedTracked())
1808       return ICVTrackingAA.getUniqueReplacementValue(ICV);
1809
1810     // If we don't know, assume it changes.
1811     return nullptr;
1812   }
1813
1814   // We don't check the unique value for a function, so return None.
1815   Optional<Value *>
1816   getUniqueReplacementValue(InternalControlVar ICV) const override {
1817     return None;
1818   }
1819
1820   /// Return the value with which \p I can be replaced for specific \p ICV.
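  /// A sketch of the intended effect (the calls shown are illustrative):
  ///
  ///   omp_set_num_threads(4);      // tracked setter, value 4
  ///   n = omp_get_max_threads();   // getter at \p I, replaceable by 4
  ///
  /// If different paths reach \p I with conflicting values, nullptr is
  /// returned instead.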
1821   Optional<Value *> getReplacementValue(InternalControlVar ICV,
1822                                         const Instruction *I,
1823                                         Attributor &A) const override {
1824     const auto &ValuesMap = ICVReplacementValuesMap[ICV];
1825     if (ValuesMap.count(I))
1826       return ValuesMap.lookup(I);
1827
1828     SmallVector<const Instruction *, 16> Worklist;
1829     SmallPtrSet<const Instruction *, 16> Visited;
1830     Worklist.push_back(I);
1831
1832     Optional<Value *> ReplVal;
1833
1834     while (!Worklist.empty()) {
1835       const Instruction *CurrInst = Worklist.pop_back_val();
1836       if (!Visited.insert(CurrInst).second)
1837         continue;
1838
1839       const BasicBlock *CurrBB = CurrInst->getParent();
1840
1841       // Go up and look for all potential setters/calls that might change the
1842       // ICV.
1843       while ((CurrInst = CurrInst->getPrevNode())) {
1844         if (ValuesMap.count(CurrInst)) {
1845           Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
1846           // Unknown value, track new.
1847           if (!ReplVal.hasValue()) {
1848             ReplVal = NewReplVal;
1849             break;
1850           }
1851
1852           // If we found a new value, we can't know the ICV value anymore.
1853           if (NewReplVal.hasValue())
1854             if (ReplVal != NewReplVal)
1855               return nullptr;
1856
1857           break;
1858         }
1859
1860         Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
1861         if (!NewReplVal.hasValue())
1862           continue;
1863
1864         // Unknown value, track new.
1865         if (!ReplVal.hasValue()) {
1866           ReplVal = NewReplVal;
1867           break;
1868         }
1869
1870         // We found a new value, so we can't know the ICV value
1871         // anymore.
1872         if (ReplVal != NewReplVal)
1873           return nullptr;
1874       }
1875
1876       // If we are in the same BB and we have a value, we are done.
1877       if (CurrBB == I->getParent() && ReplVal.hasValue())
1878         return ReplVal;
1879
1880       // Go through all predecessors and add terminators for analysis.
1881       for (const BasicBlock *Pred : predecessors(CurrBB))
1882         if (const Instruction *Terminator = Pred->getTerminator())
1883           Worklist.push_back(Terminator);
1884     }
1885
1886     return ReplVal;
1887   }
1888 };
1889
1890 struct AAICVTrackerFunctionReturned : AAICVTracker {
1891   AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
1892       : AAICVTracker(IRP, A) {}
1893
1894   // FIXME: come up with better string.
1895   const std::string getAsStr() const override {
1896     return "ICVTrackerFunctionReturned";
1897   }
1898
1899   // FIXME: come up with some stats.
1900   void trackStatistics() const override {}
1901
1902   /// We don't manifest anything for this AA.
1903   ChangeStatus manifest(Attributor &A) override {
1904     return ChangeStatus::UNCHANGED;
1905   }
1906
1907   // Map of ICVs to their assumed unique replacement values.
1908   EnumeratedArray<Optional<Value *>, InternalControlVar,
1909                   InternalControlVar::ICV___last>
1910       ICVReplacementValuesMap;
1911
1912   /// Return the unique replacement value for \p ICV, if known.
1913   Optional<Value *>
1914   getUniqueReplacementValue(InternalControlVar ICV) const override {
1915     return ICVReplacementValuesMap[ICV];
1916   }
1917
1918   ChangeStatus updateImpl(Attributor &A) override {
1919     ChangeStatus Changed = ChangeStatus::UNCHANGED;
1920     const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
1921         *this, IRPosition::function(*getAnchorScope()));
1922
1923     if (!ICVTrackingAA.isAssumedTracked())
1924       return indicatePessimisticFixpoint();
1925
1926     for (InternalControlVar ICV : TrackableICVs) {
1927       Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
1928       Optional<Value *> UniqueICVValue;
1929
1930       auto CheckReturnInst = [&](Instruction &I) {
1931         Optional<Value *> NewReplVal =
1932             ICVTrackingAA.getReplacementValue(ICV, &I, A);
1933
1934         // If we found a second ICV value, there is no unique returned value.
1935         if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
1936           return false;
1937
1938         UniqueICVValue = NewReplVal;
1939
1940         return true;
1941       };
1942
1943       if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
1944                                      /* CheckBBLivenessOnly */ true))
1945         UniqueICVValue = nullptr;
1946
1947       if (UniqueICVValue == ReplVal)
1948         continue;
1949
1950       ReplVal = UniqueICVValue;
1951       Changed = ChangeStatus::CHANGED;
1952     }
1953
1954     return Changed;
1955   }
1956 };
1957
1958 struct AAICVTrackerCallSite : AAICVTracker {
1959   AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
1960       : AAICVTracker(IRP, A) {}
1961
1962   void initialize(Attributor &A) override {
1963     Function *F = getAnchorScope();
1964     if (!F || !A.isFunctionIPOAmendable(*F))
1965       indicatePessimisticFixpoint();
1966
1967     // We only initialize this AA for getters, so we need to know which ICV it
1968     // gets.
1969     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1970     for (InternalControlVar ICV : TrackableICVs) {
1971       auto ICVInfo = OMPInfoCache.ICVs[ICV];
1972       auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
1973       if (Getter.Declaration == getAssociatedFunction()) {
1974         AssociatedICV = ICVInfo.Kind;
1975         return;
1976       }
1977     }
1978
1979     // Unknown ICV.
1980     indicatePessimisticFixpoint();
1981   }
1982
1983   ChangeStatus manifest(Attributor &A) override {
1984     if (!ReplVal.hasValue() || !ReplVal.getValue())
1985       return ChangeStatus::UNCHANGED;
1986
1987     A.changeValueAfterManifest(*getCtxI(), **ReplVal);
1988     A.deleteAfterManifest(*getCtxI());
1989
1990     return ChangeStatus::CHANGED;
1991   }
1992
1993   // FIXME: come up with better string.
1994   const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
1995
1996   // FIXME: come up with some stats.
1997   void trackStatistics() const override {}
1998
1999   InternalControlVar AssociatedICV;
2000   Optional<Value *> ReplVal;
2001
2002   ChangeStatus updateImpl(Attributor &A) override {
2003     const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2004         *this, IRPosition::function(*getAnchorScope()));
2005
2006     // We don't have any information, so we assume it changes the ICV.
2007     if (!ICVTrackingAA.isAssumedTracked())
2008       return indicatePessimisticFixpoint();
2009
2010     Optional<Value *> NewReplVal =
2011         ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
2012
2013     if (ReplVal == NewReplVal)
2014       return ChangeStatus::UNCHANGED;
2015
2016     ReplVal = NewReplVal;
2017     return ChangeStatus::CHANGED;
2018   }
2019
2020   // Return the value with which the associated value can be replaced for the
2021   // specific \p ICV.
2022   Optional<Value *>
2023   getUniqueReplacementValue(InternalControlVar ICV) const override {
2024     return ReplVal;
2025   }
2026 };
2027
2028 struct AAICVTrackerCallSiteReturned : AAICVTracker {
2029   AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2030       : AAICVTracker(IRP, A) {}
2031
2032   // FIXME: come up with better string.
2033   const std::string getAsStr() const override {
2034     return "ICVTrackerCallSiteReturned";
2035   }
2036
2037   // FIXME: come up with some stats.
2038   void trackStatistics() const override {}
2039
2040   /// We don't manifest anything for this AA.
2041   ChangeStatus manifest(Attributor &A) override {
2042     return ChangeStatus::UNCHANGED;
2043   }
2044
2045   // Map of ICVs to their assumed unique replacement values.
2046   EnumeratedArray<Optional<Value *>, InternalControlVar,
2047                   InternalControlVar::ICV___last>
2048       ICVReplacementValuesMap;
2049
2050   /// Return the value with which the associated value can be replaced for the
2051   /// specific \p ICV.
2052   Optional<Value *>
2053   getUniqueReplacementValue(InternalControlVar ICV) const override {
2054     return ICVReplacementValuesMap[ICV];
2055   }
2056
2057   ChangeStatus updateImpl(Attributor &A) override {
2058     ChangeStatus Changed = ChangeStatus::UNCHANGED;
2059     const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2060         *this, IRPosition::returned(*getAssociatedFunction()));
2061
2062     // We don't have any information, so we assume it changes the ICV.
2063     if (!ICVTrackingAA.isAssumedTracked())
2064       return indicatePessimisticFixpoint();
2065
2066     for (InternalControlVar ICV : TrackableICVs) {
2067       Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2068       Optional<Value *> NewReplVal =
2069           ICVTrackingAA.getUniqueReplacementValue(ICV);
2070
2071       if (ReplVal == NewReplVal)
2072         continue;
2073
2074       ReplVal = NewReplVal;
2075       Changed = ChangeStatus::CHANGED;
2076     }
2077     return Changed;
2078   }
2079 };
2080 } // namespace
2081
2082 const char AAICVTracker::ID = 0;
2083
2084 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
2085                                               Attributor &A) {
2086   AAICVTracker *AA = nullptr;
2087   switch (IRP.getPositionKind()) {
2088   case IRPosition::IRP_INVALID:
2089   case IRPosition::IRP_FLOAT:
2090   case IRPosition::IRP_ARGUMENT:
2091   case IRPosition::IRP_CALL_SITE_ARGUMENT:
2092     llvm_unreachable("ICVTracker can only be created for function position!");
2093   case IRPosition::IRP_RETURNED:
2094     AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
2095     break;
2096   case IRPosition::IRP_CALL_SITE_RETURNED:
2097     AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
2098     break;
2099   case IRPosition::IRP_CALL_SITE:
2100     AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
2101     break;
2102   case IRPosition::IRP_FUNCTION:
2103     AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
2104     break;
2105   }
2106
2107   return *AA;
2108 }
2109
2110 PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
2111                                      CGSCCAnalysisManager &AM,
2112                                      LazyCallGraph &CG, CGSCCUpdateResult &UR) {
2113   if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
2114     return PreservedAnalyses::all();
2115
2116   if (DisableOpenMPOptimizations)
2117     return PreservedAnalyses::all();
2118
2119   SmallVector<Function *, 16> SCC;
2120   // If there are kernels in the module, we have to run on all SCCs.
2121   bool SCCIsInteresting = !OMPInModule.getKernels().empty();
2122   for (LazyCallGraph::Node &N : C) {
2123     Function *Fn = &N.getFunction();
2124     SCC.push_back(Fn);
2125
2126     // Do we already know that the SCC contains kernels,
2127     // or that OpenMP functions are called from this SCC?
2128     if (SCCIsInteresting)
2129       continue;
2130     // If not, let's check that.
2131     SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
2132   }
2133
2134   if (!SCCIsInteresting || SCC.empty())
2135     return PreservedAnalyses::all();
2136
2137   FunctionAnalysisManager &FAM =
2138       AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
2139
2140   AnalysisGetter AG(FAM);
2141
2142   auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
2143     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
2144   };
2145
2146   CallGraphUpdater CGUpdater;
2147   CGUpdater.initialize(CG, C, AM, UR);
2148
2149   SetVector<Function *> Functions(SCC.begin(), SCC.end());
2150   BumpPtrAllocator Allocator;
2151   OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
2152                                 /*CGSCC*/ Functions, OMPInModule.getKernels());
2153
2154   Attributor A(Functions, InfoCache, CGUpdater);
2155
2156   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2157   bool Changed = OMPOpt.run();
2158   if (Changed)
2159     return PreservedAnalyses::none();
2160
2161   return PreservedAnalyses::all();
2162 }
2163
2164 namespace {
2165
2166 struct OpenMPOptLegacyPass : public CallGraphSCCPass {
2167   CallGraphUpdater CGUpdater;
2168   OpenMPInModule OMPInModule;
2169   static char ID;
2170
2171   OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
2172     initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
2173   }
2174
2175   void getAnalysisUsage(AnalysisUsage &AU) const override {
2176     CallGraphSCCPass::getAnalysisUsage(AU);
2177   }
2178
2179   bool doInitialization(CallGraph &CG) override {
2180     // Disable the pass if there is no OpenMP (runtime call) in the module.
2181     containsOpenMP(CG.getModule(), OMPInModule);
2182     return false;
2183   }
2184
2185   bool runOnSCC(CallGraphSCC &CGSCC) override {
2186     if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
2187       return false;
2188     if (DisableOpenMPOptimizations || skipSCC(CGSCC))
2189       return false;
2190
2191     SmallVector<Function *, 16> SCC;
2192     // If there are kernels in the module, we have to run on all SCCs.
2193     bool SCCIsInteresting = !OMPInModule.getKernels().empty();
2194     for (CallGraphNode *CGN : CGSCC) {
2195       Function *Fn = CGN->getFunction();
2196       if (!Fn || Fn->isDeclaration())
2197         continue;
2198       SCC.push_back(Fn);
2199
2200       // Do we already know that the SCC contains kernels,
2201       // or that OpenMP functions are called from this SCC?
2202       if (SCCIsInteresting)
2203         continue;
2204       // If not, let's check that.
2205       SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
2206     }
2207
2208     if (!SCCIsInteresting || SCC.empty())
2209       return false;
2210
2211     CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
2212     CGUpdater.initialize(CG, CGSCC);
2213
2214     // Maintain a map of functions to avoid rebuilding the ORE.
2215     DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
2216     auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
2217       std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
2218       if (!ORE)
2219         ORE = std::make_unique<OptimizationRemarkEmitter>(F);
2220       return *ORE;
2221     };
2222
2223     AnalysisGetter AG;
2224     SetVector<Function *> Functions(SCC.begin(), SCC.end());
2225     BumpPtrAllocator Allocator;
2226     OMPInformationCache InfoCache(
2227         *(Functions.back()->getParent()), AG, Allocator,
2228         /*CGSCC*/ Functions, OMPInModule.getKernels());
2229
2230     Attributor A(Functions, InfoCache, CGUpdater);
2231
2232     OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2233     return OMPOpt.run();
2234   }
2235
2236   bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
2237 };
2238
2239 } // end anonymous namespace
2240
2241 void OpenMPInModule::identifyKernels(Module &M) {
2242
2243   NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
2244   if (!MD)
2245     return;
2246
2247   for (auto *Op : MD->operands()) {
2248     if (Op->getNumOperands() < 2)
2249       continue;
2250     MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
2251     if (!KindID || KindID->getString() != "kernel")
2252       continue;
2253
2254     Function *KernelFn =
2255         mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
2256     if (!KernelFn)
2257       continue;
2258
2259     ++NumOpenMPTargetRegionKernels;
2260
2261     Kernels.insert(KernelFn);
2262   }
2263 }
2264
2265 bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
2266   if (OMPInModule.isKnown())
2267     return OMPInModule;
2268
2269   auto RecordFunctionsContainingUsesOf = [&](Function *F) {
2270     for (User *U : F->users())
2271       if (auto *I = dyn_cast<Instruction>(U))
2272         OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
2273   };
2274
2275   // MSVC doesn't like long if-else chains for some reason and instead just
2276   // issues an error. Work around it.
2277   do {
2278 #define OMP_RTL(_Enum, _Name, ...)                                            \
2279   if (Function *F = M.getFunction(_Name)) {                                   \
2280     RecordFunctionsContainingUsesOf(F);                                       \
2281     OMPInModule = true;                                                       \
2282   }
2283 #include "llvm/Frontend/OpenMP/OMPKinds.def"
2284   } while (false);
2285
2286   // Identify kernels once. TODO: We should split the OMPInformationCache into a
2287   // module and an SCC part. The kernel information, among other things, could
2288   // go into the module part.
2289   if (OMPInModule.isKnown() && OMPInModule) {
2290     OMPInModule.identifyKernels(M);
2291     return true;
2292   }
2293
2294   return OMPInModule = false;
2295 }
2296
2297 char OpenMPOptLegacyPass::ID = 0;
2298
2299 INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
2300                       "OpenMP specific optimizations", false, false)
2301 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
2302 INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
2303                     "OpenMP specific optimizations", false, false)
2304
2305 Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
2306
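// Note (illustrative only): OpenMPInModule::identifyKernels above expects
// device kernels to be announced through "nvvm.annotations" metadata of
// roughly the following shape, where the kernel name is merely a placeholder
// produced by the offloading front end:
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @__omp_offloading_<...>_kernel, !"kernel", i32 1}
//
// Operand 0 is the kernel function and operand 1 is the "kernel" tag checked
// in the loop of that function.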