//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// OpenMP specific optimizations:
//
// - Deduplication of runtime calls, e.g., omp_get_thread_num.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/OpenMPOpt.h"

#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"

using namespace llvm;
using namespace omp;

#define DEBUG_TYPE "openmp-opt"

static cl::opt<bool> DisableOpenMPOptimizations(
    "openmp-opt-disable", cl::ZeroOrMore,
    cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> EnableParallelRegionMerging(
    "openmp-opt-enable-merging", cl::ZeroOrMore,
    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                    cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                        cl::init(false), cl::Hidden);

static cl::opt<bool> HideMemoryTransferLatency(
    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"
             " transfers"),
    cl::Hidden, cl::init(false));

STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
          "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
          "Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
          "Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");

#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
#endif

namespace {

struct AAICVTracker;

/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
/// Attributor runs.
struct OMPInformationCache : public InformationCache {
  OMPInformationCache(Module &M, AnalysisGetter &AG,
                      BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
                      SmallPtrSetImpl<Kernel> &Kernels)
      : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
        Kernels(Kernels) {

    OMPBuilder.initialize();
    initializeRuntimeFunctions();
    initializeInternalControlVars();
  }

  /// Generic information that describes an internal control variable.
  struct InternalControlVarInfo {
    /// The kind, as described by the InternalControlVar enum.
    InternalControlVar Kind;

    /// The name of the ICV.
    StringRef Name;

    /// Environment variable associated with this ICV.
    StringRef EnvVarName;

    /// Initial value kind.
    ICVInitValue InitKind;

    /// Initial value.
    ConstantInt *InitValue;

    /// Setter RTL function associated with this ICV.
    RuntimeFunction Setter;

    /// Getter RTL function associated with this ICV.
    RuntimeFunction Getter;

    /// RTL function corresponding to the override clause of this ICV.
    RuntimeFunction Clause;
  };

  /// Generic information that describes a runtime function.
  struct RuntimeFunctionInfo {

    /// The kind, as described by the RuntimeFunction enum.
    RuntimeFunction Kind;

    /// The name of the function.
    StringRef Name;

    /// Flag to indicate a variadic function.
    bool IsVarArg;

    /// The return type of the function.
    Type *ReturnType;

    /// The argument types of the function.
    SmallVector<Type *, 8> ArgumentTypes;

    /// The declaration if available.
    Function *Declaration = nullptr;

    /// Uses of this runtime function per function containing the use.
    using UseVector = SmallVector<Use *, 16>;

    /// Clear the UsesMap for this runtime function.
    void clearUsesMap() { UsesMap.clear(); }

    /// Boolean conversion that is true if the runtime function was found.
    operator bool() const { return Declaration; }

    /// Return the vector of uses in function \p F.
    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
      if (!UV)
        UV = std::make_shared<UseVector>();
      return *UV;
    }

    /// Return the vector of uses in function \p F or `nullptr` if there are
    /// none.
    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();
      return nullptr;
    }

    /// Return how many functions contain uses of this runtime function.
    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    /// Return the number of arguments (or the minimal number for variadic
    /// functions).
    size_t getNumArgs() const { return ArgumentTypes.size(); }

    /// Run the callback \p CB on each use and forget the use if the result is
    /// true. The callback will be fed the function in which the use was
    /// encountered as second argument.
    void foreachUse(SmallVectorImpl<Function *> &SCC,
                    function_ref<bool(Use &, Function &)> CB) {
      for (Function *F : SCC)
        foreachUse(CB, F);
    }

    /// Run the callback \p CB on each use within the function \p F and forget
    /// the use if the result is true.
    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;
      ToBeDeleted.clear();

      unsigned Idx = 0;
      UseVector &UV = getOrCreateUseVector(F);

      for (Use *U : UV) {
        if (CB(*U, *F))
          ToBeDeleted.push_back(Idx);
        ++Idx;
      }

      // Remove the to-be-deleted indices in reverse order as prior
      // modifications will not modify the smaller indices.
      while (!ToBeDeleted.empty()) {
        unsigned Idx = ToBeDeleted.pop_back_val();
        UV[Idx] = UV.back();
        UV.pop_back();
      }
    }

  private:
    /// Map from functions to all uses of this runtime function contained in
    /// them.
    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
  };

  /// An OpenMP-IR-Builder instance
  OpenMPIRBuilder OMPBuilder;

  /// Map from runtime function kind to the runtime function description.
  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  /// Map from ICV kind to the ICV description.
  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVs;

  /// Helper to initialize all internal control variable information for those
  /// defined in OMPKinds.def.
  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
  {                                                                            \
    auto &ICV = ICVs[_Name];                                                   \
    ICV.Setter = RTL;                                                          \
  }
#define ICV_RT_GET(Name, RTL)                                                  \
  {                                                                            \
    auto &ICV = ICVs[Name];                                                    \
    ICV.Getter = RTL;                                                          \
  }
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                          \
  {                                                                            \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.Name = _Name;                                                          \
    ICV.Kind = Enum;                                                           \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      break;                                                                   \
    case ICV_ZERO:                                                             \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      break;                                                                   \
    case ICV_FALSE:                                                            \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
      break;                                                                   \
    case ICV_LAST:                                                             \
      break;                                                                   \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  /// Returns true if the function declaration \p F matches the runtime
  /// function types, that is, return type \p RTFRetType, and argument types
  /// \p RTFArgTypes.
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
                                  SmallVector<Type *, 8> &RTFArgTypes) {
    // TODO: We should output information to the user (under debug output
    //       and via remarks).

    if (!F)
      return false;
    if (F->getReturnType() != RTFRetType)
      return false;
    if (F->arg_size() != RTFArgTypes.size())
      return false;

    auto RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)
        return false;

      ++RTFTyIt;
    }

    return true;
  }

  // Helper to collect all uses of the declaration in the UsesMap.
  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)
      return NumUses;
    OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);

    if (CollectStats) {
      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
    }

    // TODO: We directly convert uses into proper calls and unknown uses.
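    // Illustrative sketch (not part of the original source): a direct call
    // such as
    //   %gtid = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
    // made from a function in the module slice ends up in that function's use
    // vector below, while constant-expression or out-of-slice users are
    // collected under the `nullptr` key. The `@loc` ident is hypothetical.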
    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        if (ModuleSlice.count(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
          ++NumUses;
        }
      } else {
        RFI.getOrCreateUseVector(nullptr).push_back(&U);
        ++NumUses;
      }
    }
    return NumUses;
  }

  // Helper function to recollect uses of all runtime functions.
  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx) {
      auto &RFI = RFIs[static_cast<RuntimeFunction>(Idx)];
      RFI.clearUsesMap();
      collectUses(RFI, /*CollectStats*/ false);
    }
  }

  /// Helper to initialize all runtime function information for those defined
  /// in OMPKinds.def.
  void initializeRuntimeFunctions() {
    Module &M = *((*ModuleSlice.begin())->getParent());

    // Helper macros for handling __VA_ARGS__ in OMP_RTL
#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \
  (void)VarName;

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  (void)VarName##Ty;                                                           \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
  {                                                                            \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.Kind = _Enum;                                                        \
      RFI.Name = _Name;                                                        \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      (void)NumUses;                                                           \
      LLVM_DEBUG({                                                             \
        dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")           \
               << " found\n";                                                  \
        if (RFI.Declaration)                                                   \
          dbgs() << TAG << "-> got " << NumUses << " uses in "                 \
                 << RFI.getNumFunctionsWithUses()                              \
                 << " different functions.\n";                                 \
      });                                                                      \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"

    // TODO: We should attach the attributes defined in OMPKinds.def.
  }

  /// Collection of known kernels (\see Kernel) in the module.
  SmallPtrSetImpl<Kernel> &Kernels;
};

/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
  /// Physical array (in the IR).
  AllocaInst *Array = nullptr;
  /// Mapped values.
  SmallVector<Value *, 8> StoredValues;
  /// Last stores made in the offload array.
  SmallVector<StoreInst *, 8> LastAccesses;

  OffloadArray() = default;

  /// Initializes the OffloadArray with the values stored in \p Array before
  /// instruction \p Before is reached. Returns false if the initialization
  /// fails.
  /// This MUST be used immediately after the construction of the object.
  bool initialize(AllocaInst &Array, Instruction &Before) {
    if (!Array.getAllocatedType()->isArrayTy())
      return false;

    if (!getValues(Array, Before))
      return false;

    this->Array = &Array;
    return true;
  }

  static const unsigned BasePtrsArgNum = 2;
  static const unsigned PtrsArgNum = 3;
  static const unsigned SizesArgNum = 4;

private:
  /// Traverses the BasicBlock where \p Array is, collecting the stores made to
  /// \p Array, leaving StoredValues with the values stored before the
  /// instruction \p Before is reached.
  bool getValues(AllocaInst &Array, Instruction &Before) {
    // Initialize container.
    const uint64_t NumValues =
        Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

    // TODO: This assumes the instruction \p Before is in the same
    //       BasicBlock as Array. Make it general, for any control flow graph.
    BasicBlock *BB = Array.getParent();
    if (BB != Before.getParent())
      return false;

    const DataLayout &DL = Array.getModule()->getDataLayout();
    const unsigned int PointerSize = DL.getPointerSize();

    for (Instruction &I : *BB) {
      if (&I == &Before)
        break;

      if (!isa<StoreInst>(&I))
        continue;

      auto *S = cast<StoreInst>(&I);
      int64_t Offset = -1;
      auto *Dst =
          GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
      if (Dst == &Array) {
        int64_t Idx = Offset / PointerSize;
        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
        LastAccesses[Idx] = S;
      }
    }

    return isFilled();
  }

  /// Returns true if all values in StoredValues and
  /// LastAccesses are not nullptrs.
  bool isFilled() {
    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
        return false;
    }

    return true;
  }
};

struct OpenMPOpt {

  using OptimizationRemarkGetter =
      function_ref<OptimizationRemarkEmitter &(Function *)>;

  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
      : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  /// Check if any remarks are enabled for openmp-opt
  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
  }

  /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
  bool run() {
    if (SCC.empty())
      return false;

    bool Changed = false;

    LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
                      << " functions in a slice with "
                      << OMPInfoCache.ModuleSlice.size() << " functions\n");

    if (PrintICVValues)
      printICVs();
    if (PrintOpenMPKernels)
      printKernels();

    Changed |= rewriteDeviceCodeStateMachine();

    Changed |= runAttributor();

    // Recollect uses, in case Attributor deleted any.
    OMPInfoCache.recollectUses();

    Changed |= deleteParallelRegions();
    if (HideMemoryTransferLatency)
      Changed |= hideMemTransfersLatency();
    if (remarksEnabled())
      analysisGlobalization();
    Changed |= deduplicateRuntimeCalls();
    if (EnableParallelRegionMerging) {
      if (mergeParallelRegions()) {
        deduplicateRuntimeCalls();
        Changed = true;
      }
    }

    return Changed;
  }

  /// Print initial ICV values for testing.
  /// FIXME: This should be done from the Attributor once it is added.
  void printICVs() const {
    InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
                                 ICV_proc_bind};

    for (Function *F : OMPInfoCache.ModuleSlice) {
      for (auto ICV : ICVs) {
        auto ICVInfo = OMPInfoCache.ICVs[ICV];
        auto Remark = [&](OptimizationRemark OR) {
          return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                    << " Value: "
                    << (ICVInfo.InitValue
                            ? ICVInfo.InitValue->getValue().toString(10, true)
                            : "IMPLEMENTATION_DEFINED");
        };

        emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
      }
    }
  }

  /// Print OpenMP GPU kernels for testing.
  void printKernels() const {
    for (Function *F : SCC) {
      if (!OMPInfoCache.Kernels.count(F))
        continue;

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP GPU kernel "
                  << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
      };

      emitRemarkOnFunction(F, "OpenMPGPU", Remark);
    }
  }

  /// Return the call if \p U is a callee use in a regular call. If \p RFI is
  /// given it has to be the callee or a nullptr is returned.
  static CallInst *getCallIfRegularCall(
      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(U.getUser());
    if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
        (!RFI || CI->getCalledFunction() == RFI->Declaration))
      return CI;
    return nullptr;
  }

  /// Return the call if \p V is a regular call. If \p RFI is given it has to
  /// be the callee or a nullptr is returned.
  static CallInst *getCallIfRegularCall(
      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && !CI->hasOperandBundles() &&
        (!RFI || CI->getCalledFunction() == RFI->Declaration))
      return CI;
    return nullptr;
  }

private:
  /// Merge parallel regions when it is safe.
  bool mergeParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;
    const unsigned CallbackFirstArgOperand = 3;
    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;

    // Check if there are any __kmpc_fork_call calls to merge.
    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)
      return false;

    // Check if there are any __kmpc_push_proc_bind calls for explicit
    // affinities.
    OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];

    // Defensively abort if explicit affinities are set.
    // TODO: Track ICV proc_bind to merge when mergable regions have the same
    //       affinity.
    if (ProcBindRFI.Declaration)
      return false;

    bool Changed = false;
    LoopInfo *LI = nullptr;
    DominatorTree *DT = nullptr;

    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;

    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                         BasicBlock &ContinuationIP) {
      BasicBlock *CGStartBB = CodeGenIP.getBlock();
      BasicBlock *CGEndBB =
          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
      assert(StartBB != nullptr && "StartBB should not be null");
      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
      assert(EndBB != nullptr && "EndBB should not be null");
      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
    };

    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                      Value &VPtr, Value *&ReplacementValue) -> InsertPointTy {
      ReplacementValue = &VPtr;
      return CodeGenIP;
    };

    auto FiniCB = [&](InsertPointTy CodeGenIP) {};

    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
    // contained in BB and only separated by instructions that can be
    // redundantly executed in parallel. The block BB is split before the first
    // call (in MergableCIs) and after the last so the entire region we merge
    // into a single parallel region is contained in a single basic block
    // without any other instructions. We use the OpenMPIRBuilder to outline
    // that block and call the resulting function via __kmpc_fork_call.
    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
      // TODO: Change the interface to allow single CIs expanded, e.g., to
      //       include an outer loop.
      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");

      auto Remark = [&](OptimizationRemark OR) {
        OR << "Parallel region at "
           << ore::NV("OpenMPParallelMergeFront",
                      MergableCIs.front()->getDebugLoc())
           << " merged with parallel regions at ";
        for (auto *CI :
             llvm::make_range(MergableCIs.begin() + 1, MergableCIs.end())) {
          OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
          if (CI != MergableCIs.back())
            OR << ", ";
        }
        return OR;
      };

      emitRemark<OptimizationRemark>(MergableCIs.front(),
                                     "OpenMPParallelRegionMerging", Remark);

      Function *OriginalFn = BB->getParent();
      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
                        << " parallel regions in " << OriginalFn->getName()
                        << "\n");

      // Isolate the calls to merge in a separate block.
      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
      BasicBlock *AfterBB =
          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
      StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
                           "omp.par.merged");

      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
      BB->getTerminator()->eraseFromParent();

      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
                                               DL);
      IRBuilder<>::InsertPoint AllocaIP(
          &OriginalFn->getEntryBlock(),
          OriginalFn->getEntryBlock().getFirstInsertionPt());
      // Create the merged parallel region with default proc binding, to
      // avoid overriding binding settings, and without explicit cancellation.
      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel(
          Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
          OMP_PROC_BIND_default, /* IsCancellable */ false);
      BranchInst::Create(AfterBB, AfterIP.getBlock());

      // Perform the actual outlining.
      OMPInfoCache.OMPBuilder.finalize();

      Function *OutlinedFn = MergableCIs.front()->getCaller();

      // Replace the __kmpc_fork_call calls with direct calls to the outlined
      // callbacks.
      SmallVector<Value *, 8> Args;
      for (auto *CI : MergableCIs) {
        Value *Callee =
            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
        FunctionType *FT =
            cast<FunctionType>(Callee->getType()->getPointerElementType());
        Args.clear();
        Args.push_back(OutlinedFn->getArg(0));
        Args.push_back(OutlinedFn->getArg(1));
        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
             U < E; ++U)
          Args.push_back(CI->getArgOperand(U));

        CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
        if (CI->getDebugLoc())
          NewCI->setDebugLoc(CI->getDebugLoc());

        // Forward parameter attributes from the callback to the callee.
        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
             U < E; ++U)
          for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
            NewCI->addParamAttr(
                U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);

        // Emit an explicit barrier to replace the implicit fork-join barrier.
        if (CI != MergableCIs.back()) {
          // TODO: Remove barrier if the merged parallel region includes the
          //       'nowait' clause.
          OMPInfoCache.OMPBuilder.CreateBarrier(
              InsertPointTy(NewCI->getParent(),
                            NewCI->getNextNode()->getIterator()),
              OMPD_parallel);
        }

        auto Remark = [&](OptimizationRemark OR) {
          return OR << "Parallel region at "
                    << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
                    << " merged with "
                    << ore::NV("OpenMPParallelMergeFront",
                               MergableCIs.front()->getDebugLoc());
        };
        if (CI != MergableCIs.front())
          emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
                                         Remark);

        CI->eraseFromParent();
      }

      assert(OutlinedFn != OriginalFn && "Outlining failed");
      CGUpdater.registerOutlinedFunction(*OutlinedFn);
      CGUpdater.reanalyzeFunction(*OriginalFn);

      NumOpenMPParallelRegionsMerged += MergableCIs.size();

      return true;
    };

    // Helper function that identifies sequences of
    // __kmpc_fork_call uses in a basic block.
    auto DetectPRsCB = [&](Use &U, Function &F) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      BB2PRMap[CI->getParent()].insert(CI);

      return false;
    };

    BB2PRMap.clear();
    RFI.foreachUse(SCC, DetectPRsCB);
    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
    // Find mergable parallel regions within a basic block that are
    // safe to merge, that is any in-between instructions can safely
    // execute in parallel after merging.
    // TODO: support merging across basic-blocks.
    for (auto &It : BB2PRMap) {
      auto &CIs = It.getSecond();
      if (CIs.size() < 2)
        continue;

      BasicBlock *BB = It.getFirst();
      SmallVector<CallInst *, 4> MergableCIs;

      // Find the maximal number of parallel region CIs that are safe to merge.
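      // Illustrative sketch (not part of the original source) of what this
      // scan feeds into, assuming two fork calls in the same block:
      //
      //   call void @__kmpc_fork_call(..., @outlined1, ...)
      //   <instructions that are safe to execute redundantly in parallel>
      //   call void @__kmpc_fork_call(..., @outlined2, ...)
      //
      // Merge() above turns this into a single outlined parallel region that
      // calls @outlined1, emits an explicit __kmpc_barrier, executes the
      // in-between instructions redundantly, and then calls @outlined2.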
      for (Instruction &I : *BB) {
        if (CIs.count(&I)) {
          MergableCIs.push_back(cast<CallInst>(&I));
          continue;
        }

        if (isSafeToSpeculativelyExecute(&I, &I, DT))
          continue;

        if (MergableCIs.size() > 1) {
          MergableCIsVector.push_back(MergableCIs);
          LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
                            << " parallel regions in block " << BB->getName()
                            << " of function " << BB->getParent()->getName()
                            << "\n";);
        }

        MergableCIs.clear();
      }

      if (!MergableCIsVector.empty()) {
        Changed = true;

        for (auto &MergableCIs : MergableCIsVector)
          Merge(MergableCIs, BB);
      }
    }

    if (Changed) {
      // Update RFI info to set it up for later passes.
      RFI.clearUsesMap();
      OMPInfoCache.collectUses(RFI, /* CollectStats */ false);

      // Collect uses for the emitted barrier call.
      OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
          OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
      BarrierRFI.clearUsesMap();
      OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
    }

    return Changed;
  }

  /// Try to delete parallel regions if possible.
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)
      return false;

    bool Changed = false;
    auto DeleteCallCB = [&](Use &U, Function &) {
      CallInst *CI = getCallIfRegularCall(U);
      if (!CI)
        return false;
      auto *Fn = dyn_cast<Function>(
          CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
      if (!Fn)
        return false;
      if (!Fn->onlyReadsMemory())
        return false;
      if (!Fn->hasFnAttribute(Attribute::WillReturn))
        return false;

      LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
                        << CI->getCaller()->getName() << "\n");

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Parallel region in "
                  << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
                  << " deleted";
      };
      emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
                                     Remark);

      CGUpdater.removeCallSite(*CI);
      CI->eraseFromParent();
      Changed = true;
      ++NumOpenMPParallelRegionsDeleted;
      return true;
    };

    RFI.foreachUse(SCC, DeleteCallCB);

    return Changed;
  }

  /// Try to eliminate runtime calls by reusing existing ones.
  bool deduplicateRuntimeCalls() {
    bool Changed = false;

    RuntimeFunction DeduplicableRuntimeCallIDs[] = {
        OMPRTL_omp_get_num_threads,
        OMPRTL_omp_in_parallel,
        OMPRTL_omp_get_cancellation,
        OMPRTL_omp_get_thread_limit,
        OMPRTL_omp_get_supported_active_levels,
        OMPRTL_omp_get_level,
        OMPRTL_omp_get_ancestor_thread_num,
        OMPRTL_omp_get_team_size,
        OMPRTL_omp_get_active_level,
        OMPRTL_omp_in_final,
        OMPRTL_omp_get_proc_bind,
        OMPRTL_omp_get_num_places,
        OMPRTL_omp_get_num_procs,
        OMPRTL_omp_get_place_num,
        OMPRTL_omp_get_partition_num_places,
        OMPRTL_omp_get_partition_place_nums};

    // Global-tid is handled separately.
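    // Illustrative sketch (not part of the original source): repeated queries
    // such as
    //   %gtid0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
    //   ...
    //   %gtid1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
    // can be collapsed onto a single call hoisted to the entry block, or onto
    // an incoming thread-id argument if one is identified below.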
    SmallSetVector<Value *, 16> GTIdArgs;
    collectGlobalThreadIdArguments(GTIdArgs);
    LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
                      << " global thread ID arguments\n");

    for (Function *F : SCC) {
      for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);

      // __kmpc_global_thread_num is special as we can replace it with an
      // argument in enough cases to make it worth trying.
      Value *GTIdArg = nullptr;
      for (Argument &Arg : F->args())
        if (GTIdArgs.count(&Arg)) {
          GTIdArg = &Arg;
          break;
        }
      Changed |= deduplicateRuntimeCalls(
          *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
    }

    return Changed;
  }

  /// Tries to hide the latency of runtime calls that involve host to
  /// device memory transfers by splitting them into their "issue" and "wait"
  /// versions. The "issue" is moved upwards as much as possible. The "wait"
  /// is moved downwards as much as possible. The "issue" issues the memory
  /// transfer asynchronously, returning a handle. The "wait" waits on the
  /// returned handle for the memory transfer to finish.
  bool hideMemTransfersLatency() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
    bool Changed = false;
    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
      auto *RTCall = getCallIfRegularCall(U, &RFI);
      if (!RTCall)
        return false;

      OffloadArray OffloadArrays[3];
      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
        return false;

      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));

      // TODO: Check if the issue can be moved upwards.
      bool WasSplit = false;
      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
      if (WaitMovementPoint)
        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);

      Changed |= WasSplit;
      return WasSplit;
    };
    RFI.foreachUse(SCC, SplitMemTransfers);

    return Changed;
  }

  void analysisGlobalization() {
    RuntimeFunction GlobalizationRuntimeIDs[] = {
        OMPRTL___kmpc_data_sharing_coalesced_push_stack,
        OMPRTL___kmpc_data_sharing_push_stack};

    for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
      auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];

      auto CheckGlobalization = [&](Use &U, Function &Decl) {
        if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
          auto Remark = [&](OptimizationRemarkAnalysis ORA) {
            return ORA
                   << "Found thread data sharing on the GPU. "
                   << "Expect degraded performance due to data globalization.";
          };
          emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
                                                 Remark);
        }

        return false;
      };

      RFI.foreachUse(SCC, CheckGlobalization);
    }
  }

  /// Maps the values stored in the offload arrays passed as arguments to
  /// \p RuntimeCall into the offload arrays in \p OAs.
  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
                                MutableArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "Need space for three offload arrays!");

    // A runtime call that involves memory offloading looks something like:
    //   call void @__tgt_target_data_begin_mapper(arg0, arg1,
    //     i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
    //     ...)
    // So, the idea is to access the allocas that allocate space for these
    // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
    // Therefore:
    // i8** %offload_baseptrs.
    Value *BasePtrsArg =
        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
    // i8** %offload_ptrs.
    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
    // i8** %offload_sizes.
    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);

    // Get values stored in **offload_baseptrs.
    auto *V = getUnderlyingObject(BasePtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *BasePtrsArray = cast<AllocaInst>(V);
    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_ptrs.
    V = getUnderlyingObject(PtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *PtrsArray = cast<AllocaInst>(V);
    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_sizes.
    V = getUnderlyingObject(SizesArg);
    // If it's a [constant] global array don't analyze it.
    if (isa<GlobalValue>(V))
      return isa<Constant>(V);
    if (!isa<AllocaInst>(V))
      return false;

    auto *SizesArray = cast<AllocaInst>(V);
    if (!OAs[2].initialize(*SizesArray, RuntimeCall))
      return false;

    return true;
  }

  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
  /// For now this is a way to test that the function getValuesInOffloadArrays
  /// is working properly.
  /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
  void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
    std::string ValuesStr;
    raw_string_ostream Printer(ValuesStr);
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
      BP->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *P : OAs[1].StoredValues) {
      P->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *S : OAs[2].StoredValues) {
      S->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
  }

  /// Returns the instruction where the "wait" counterpart of \p RuntimeCall
  /// can be moved. Returns nullptr if the movement is not possible, or not
  /// worth it.
  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
    //        Make it traverse the CFG.

    Instruction *CurrentI = &RuntimeCall;
    bool IsWorthIt = false;
    while ((CurrentI = CurrentI->getNextNode())) {

      // TODO: Once we detect the regions to be offloaded we should use the
      //       alias analysis manager to check if CurrentI may modify one of
      //       the offloaded regions.
      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
        if (IsWorthIt)
          return CurrentI;

        return nullptr;
      }

      // FIXME: For now, moving the "wait" over anything without side effects
      //        is considered worth it.
      IsWorthIt = true;
    }

    // Return the end of the BasicBlock.
    return RuntimeCall.getParent()->getTerminator();
  }

  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                               Instruction &WaitMovementPoint) {
    // Create a stack allocated handle (__tgt_async_info) at the beginning of
    // the function. It stores information about the async transfer, allowing
    // us to wait on it later.
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
    auto *F = RuntimeCall.getCaller();
    Instruction *FirstInst = &(F->getEntryBlock().front());
    AllocaInst *Handle = new AllocaInst(
        IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);

    // Add the "issue" runtime call declaration:
    //   declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
    //     i8**, i8**, i64*, i64*)
    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    // Change the RuntimeCall call site to its asynchronous version.
    SmallVector<Value *, 8> Args;
    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    CallInst *IssueCallsite =
        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
    RuntimeCall.eraseFromParent();

    // Add the "wait" runtime call declaration:
    //   declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    // Add the call site to WaitDecl.
    const unsigned DeviceIDArgNum = 0;
    Value *WaitParams[2] = {
        IssueCallsite->getArgOperand(DeviceIDArgNum), // device_id.
        Handle                                        // handle to wait on.
    };
    CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);

    return true;
  }

  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    // TODO: Figure out how to actually combine multiple debug locations. For
    //       now we just keep an existing one if there is a single choice.
    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;
      return NextIdent;
    }
    return nullptr;
  }

  /// Return a `struct ident_t*` value that represents the ones used in the
  /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
  /// return a local `struct ident_t*`. For now, if we cannot find a suitable
  /// return value we create one from scratch. We also do not yet combine
  /// information, e.g., the source locations, see combinedIdentStruct.
  Value *
  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
                                 Function &F, bool GlobalOnly) {
    bool SingleChoice = true;
    Value *Ident = nullptr;
    auto CombineIdentStruct = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
        return false;
      Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
                                  /* GlobalOnly */ true, SingleChoice);
      return false;
    };
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      // The IRBuilder uses the insertion block to get to the module; this is
      // unfortunate but we work around it for now.
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
        OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
            &F.getEntryBlock(), F.getEntryBlock().begin()));
      // Create a fallback location if none was found.
      // TODO: Use the debug locations of the calls instead.
      Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
    }
    return Ident;
  }

  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
  /// \p ReplVal if given.
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)
      return false;

    LLVM_DEBUG(
        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    // TODO: Use dominance to find a good position instead.
    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.getNumArgOperands();
      if (NumArgs == 0)
        return true;
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
        return false;
      for (unsigned u = 1; u < NumArgs; ++u)
        if (isa<Instruction>(CB.getArgOperand(u)))
          return false;
      return true;
    };

    if (!ReplVal) {
      for (Use *U : *UV)
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (!CanBeMoved(*CI))
            continue;

          auto Remark = [&](OptimizationRemark OR) {
            auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
            return OR << "OpenMP runtime call "
                      << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
                      << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
          };
          emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);

          CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
          ReplVal = CI;
          break;
        }
      if (!ReplVal)
        return false;
    }

    // If we use a call as a replacement value we need to make sure the ident
    // is valid at the new location. For now we just pick a global one, either
    // existing and used by one of the calls, or created from scratch.
    if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
      if (CI->getNumArgOperands() > 0 &&
          CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
        Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
                                                      /* GlobalOnly */ true);
        CI->setArgOperand(0, Ident);
      }
    }

    bool Changed = false;
    auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)
        return false;
      assert(CI->getCaller() == &F && "Unexpected call!");

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
      };
      emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);

      CGUpdater.removeCallSite(*CI);
      CI->replaceAllUsesWith(ReplVal);
      CI->eraseFromParent();
      ++NumOpenMPRuntimeCallsDeduplicated;
      Changed = true;
      return true;
    };
    RFI.foreachUse(SCC, ReplaceAndDeleteCB);

    return Changed;
  }

  /// Collect arguments that represent the global thread id in \p GTIdArgs.
  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
    // TODO: Below we basically perform a fixpoint iteration with a pessimistic
    //       initialization. We could define an AbstractAttribute instead and
    //       run the Attributor here once it can be run as an SCC pass.

    // Helper to check the argument \p ArgNo at all call sites of \p F for
    // a GTId.
    auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
      if (!F.hasLocalLinkage())
        return false;
      for (Use &U : F.uses()) {
        if (CallInst *CI = getCallIfRegularCall(U)) {
          Value *ArgOp = CI->getArgOperand(ArgNo);
          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
              getCallIfRegularCall(
                  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
            continue;
        }
        return false;
      }
      return true;
    };

    // Helper to identify uses of a GTId as GTId arguments.
    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
        if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
          if (CI->isArgOperand(&U))
            if (Function *Callee = CI->getCalledFunction())
              if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
                GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
    };

    // The argument users of __kmpc_global_thread_num calls are GTIds.
    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
        AddUserArgs(*CI);
      return false;
    });

    // Transitively search for more arguments by looking at the users of the
    // ones we know already. During the search the GTIdArgs vector is extended
    // so we cannot cache the size nor can we use a range based for.
    for (unsigned u = 0; u < GTIdArgs.size(); ++u)
      AddUserArgs(*GTIdArgs[u]);
  }

  /// Kernel (=GPU) optimizations and utility functions
  ///
  ///{{

  /// Check if \p F is a kernel, hence entry point for target offloading.
  bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }

  /// Cache to remember the unique kernel for a function.
  DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;

  /// Find the unique kernel that will execute \p F, if any.
  Kernel getUniqueKernelFor(Function &F);

  /// Find the unique kernel that will execute \p I, if any.
  Kernel getUniqueKernelFor(Instruction &I) {
    return getUniqueKernelFor(*I.getFunction());
  }

  /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
  /// the cases where we can avoid taking the address of a function.
  bool rewriteDeviceCodeStateMachine();

  ///
  ///}}

  /// Emit a remark generically.
  ///
  /// This template function can be used to generically emit a remark. The
  /// RemarkKind should be one of the following:
  ///   - OptimizationRemark to indicate a successful optimization attempt
  ///   - OptimizationRemarkMissed to report a failed optimization attempt
  ///   - OptimizationRemarkAnalysis to provide additional information about an
  ///     optimization attempt
  ///
  /// The remark is built using a callback function provided by the caller that
  /// takes a RemarkKind as input and returns a RemarkKind.
  template <typename RemarkKind,
            typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
  void emitRemark(Instruction *Inst, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    Function *F = Inst->getParent()->getParent();
    auto &ORE = OREGetter(F);

    ORE.emit(
        [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
  }

  /// Emit a remark on a function. Since only OptimizationRemark supports this,
  /// it can't be made generic.
  void
  emitRemarkOnFunction(Function *F, StringRef RemarkName,
                       function_ref<OptimizationRemark(OptimizationRemark &&)>
                           &&RemarkCB) const {
    auto &ORE = OREGetter(F);

    ORE.emit([&]() {
      return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
    });
  }

  /// The underlying module.
  Module &M;

  /// The SCC we are operating on.
  SmallVectorImpl<Function *> &SCC;

  /// Callback to update the call graph, the first argument is a removed call,
  /// the second an optional replacement call.
  CallGraphUpdater &CGUpdater;

  /// Callback to get an OptimizationRemarkEmitter from a Function *.
  OptimizationRemarkGetter OREGetter;

  /// OpenMP-specific information cache. Also used for Attributor runs.
  OMPInformationCache &OMPInfoCache;

  /// Attributor instance.
  Attributor &A;

  /// Helper function to run the Attributor on the SCC.
  bool runAttributor() {
    if (SCC.empty())
      return false;

    registerAAs();

    ChangeStatus Changed = A.run();

    LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
                      << " functions, result: " << Changed << ".\n");

    return Changed == ChangeStatus::CHANGED;
  }

  /// Populate the Attributor with abstract attribute opportunities in the
  /// function.
  void registerAAs() {
    if (SCC.empty())
      return;

    // Create CallSite AA for all Getters.
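    // Illustrative note (not part of the original source): for an ICV getter
    // call site such as a call to omp_get_max_threads (the nthreads ICV), an
    // AAICVTracker is seeded at the call-site-function position below so the
    // value written by a dominating setter, e.g. omp_set_num_threads(N), can
    // be tracked up to that point by the Attributor.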
    for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
      auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];

      auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

      auto CreateAA = [&](Use &U, Function &Caller) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
        if (!CI)
          return false;

        auto &CB = cast<CallBase>(*CI);

        IRPosition CBPos = IRPosition::callsite_function(CB);
        A.getOrCreateAAFor<AAICVTracker>(CBPos);
        return false;
      };

      GetterRFI.foreachUse(SCC, CreateAA);
    }
  }
};

Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
  if (!OMPInfoCache.ModuleSlice.count(&F))
    return nullptr;

  // Use a scope to keep the lifetime of the CachedKernel short.
  {
    Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    if (CachedKernel)
      return *CachedKernel;

    // TODO: We should use an AA to create an (optimistic and callback
    //       call-aware) call graph. For now we stick to simple patterns that
    //       are less powerful, basically the worst fixpoint.
    if (isKernel(F)) {
      CachedKernel = Kernel(&F);
      return *CachedKernel;
    }

    CachedKernel = nullptr;
    if (!F.hasLocalLinkage())
      return nullptr;
  }

  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
    if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
      // Allow use in equality comparisons.
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);
      return nullptr;
    }
    if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
      // Allow direct calls.
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);
      // Allow the use in __kmpc_kernel_prepare_parallel calls.
      if (Function *Callee = CB->getCalledFunction())
        if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
          return getUniqueKernelFor(*CB);
      return nullptr;
    }
    // Disallow every other use.
    return nullptr;
  };

  // TODO: In the future we want to track more than just a unique kernel.
  SmallPtrSet<Kernel, 2> PotentialKernels;
  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));
  });

  Kernel K = nullptr;
  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  // Cache the result.
  UniqueKernelMap[&F] = K;

  return K;
}

bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];

  bool Changed = false;
  if (!KernelPrepareParallelRFI)
    return Changed;

  for (Function *F : SCC) {

    // Check if the function is used in a __kmpc_kernel_prepare_parallel call
    // at all.
    bool UnknownUse = false;
    bool KernelPrepareUse = false;
    unsigned NumDirectCalls = 0;

    SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
    OMPInformationCache::foreachUse(*F, [&](Use &U) {
      if (auto *CB = dyn_cast<CallBase>(U.getUser()))
        if (CB->isCallee(&U)) {
          ++NumDirectCalls;
          return;
        }

      if (isa<ICmpInst>(U.getUser())) {
        ToBeReplacedStateMachineUses.push_back(&U);
        return;
      }
      if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
                                   *U.getUser(), &KernelPrepareParallelRFI)) {
        KernelPrepareUse = true;
        ToBeReplacedStateMachineUses.push_back(&U);
        return;
      }
      UnknownUse = true;
    });

    // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
    // use.
    if (!KernelPrepareUse)
      continue;

    {
      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Found a parallel region that is called in a target "
                     "region but not part of a combined target construct nor "
                     "nested inside a target construct without intermediate "
                     "code. This can lead to excessive register usage for "
                     "unrelated target regions in the same translation unit "
                     "due to spurious call edges assumed by ptxas.";
      };
      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
    }

    // If this ever hits, we should investigate.
    // TODO: Checking the number of uses is not a necessary restriction and
    //       should be lifted.
    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() != 2) {
      {
        auto Remark = [&](OptimizationRemark OR) {
          return OR << "Parallel region is used in "
                    << (UnknownUse ? "unknown" : "unexpected")
                    << " ways; will not attempt to rewrite the state machine.";
        };
        emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
      }
      continue;
    }

    // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
    // up if the function is not called from a unique kernel.
    Kernel K = getUniqueKernelFor(*F);
    if (!K) {
      {
        auto Remark = [&](OptimizationRemark OR) {
          return OR << "Parallel region is not known to be called from a "
                       "unique single target region, maybe the surrounding "
                       "function has external linkage?; will not attempt to "
                       "rewrite the state machine use.";
        };
        emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
                             Remark);
      }
      continue;
    }

    // We now know F is a parallel body function called only from the kernel K.
    // We also identified the state machine uses in which we replace the
    // function pointer by a new global symbol for identification purposes.
    // This ensures only direct calls to the function are left.

    {
      auto RemarkParallelRegion = [&](OptimizationRemark OR) {
        return OR << "Specialize parallel region that is only reached from a "
                     "single target region to avoid spurious call edges and "
                     "excessive register usage in other target regions. "
                     "(parallel region ID: "
                  << ore::NV("OpenMPParallelRegion", F->getName())
                  << ", kernel ID: "
                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
      };
      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
                           RemarkParallelRegion);
      auto RemarkKernel = [&](OptimizationRemark OR) {
        return OR << "Target region containing the parallel region that is "
                     "specialized. (parallel region ID: "
                  << ore::NV("OpenMPParallelRegion", F->getName())
                  << ", kernel ID: "
                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
      };
      emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
    }

    Module &M = *F->getParent();
    Type *Int8Ty = Type::getInt8Ty(M.getContext());

    auto *ID = new GlobalVariable(
        M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
        UndefValue::get(Int8Ty), F->getName() + ".ID");

    for (Use *U : ToBeReplacedStateMachineUses)
      U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;

    Changed = true;
  }

  return Changed;
}

/// Abstract Attribute for tracking ICV values.
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();
  }

  /// Returns true if the value is assumed to be tracked.
  bool isAssumedTracked() const { return getAssumed(); }

  /// Returns true if the value is known to be tracked.
  bool isKnownTracked() const { return getAssumed(); }

  /// Create an abstract attribute view for the position \p IRP.
  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

  /// Return the value with which \p I can be replaced for the specific \p ICV.
  virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                                const Instruction *I,
                                                Attributor &A) const {
    return None;
  }

  /// Return an assumed unique ICV value if a single candidate is found. If
  /// there cannot be one, return a nullptr. If it is not clear yet, return the
  /// Optional::NoneType.
  virtual Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const = 0;

  // Currently only nthreads is being tracked.
  // This array will only grow with time.
  InternalControlVar TrackableICVs[1] = {ICV_nthreads};

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAICVTracker"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is AAICVTracker
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

struct AAICVTrackerFunction : public AAICVTracker {
  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with a better string.
  const std::string getAsStr() const override { return "ICVTrackerFunction"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program points.

  // Map of ICV to their values at specific program point.
  EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus HasChanged = ChangeStatus::UNCHANGED;

    Function *F = getAnchorScope();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    for (InternalControlVar ICV : TrackableICVs) {
      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      auto &ValuesMap = ICVReplacementValuesMap[ICV];
      auto TrackValues = [&](Use &U, Function &) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
        if (!CI)
          return false;

        // FIXME: handle setters with more than one argument.
        // Track new value.
        if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
          HasChanged = ChangeStatus::CHANGED;

        return false;
      };

      auto CallCheck = [&](Instruction &I) {
        Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
        if (ReplVal.hasValue() &&
            ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
          HasChanged = ChangeStatus::CHANGED;

        return true;
      };

      // Track all changes of an ICV.
      SetterRFI.foreachUse(TrackValues, F);

      A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
                                /* CheckBBLivenessOnly */ true);

      // TODO: Figure out a way to avoid adding an entry in
      //       ICVReplacementValuesMap.
      Instruction *Entry = &F->getEntryBlock().front();
      if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
        ValuesMap.insert(std::make_pair(Entry, nullptr));
    }

    return HasChanged;
  }

  /// Helper to check if \p I is a call and get the value for it if it is
  /// unique.
  Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
                                    InternalControlVar &ICV) const {

    const auto *CB = dyn_cast<CallBase>(I);
    if (!CB)
      return None;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    // Indirect call, assume ICV changes.
    if (CalledFunction == nullptr)
      return nullptr;
    if (CalledFunction == GetterRFI.Declaration)
      return None;
    if (CalledFunction == SetterRFI.Declaration) {
      if (ICVReplacementValuesMap[ICV].count(I))
        return ICVReplacementValuesMap[ICV].lookup(I);

      return nullptr;
    }

    // Since we don't know, assume it changes the ICV.
    if (CalledFunction->isDeclaration())
      return nullptr;

    const auto &ICVTrackingAA =
        A.getAAFor<AAICVTracker>(*this, IRPosition::callsite_returned(*CB));

    if (ICVTrackingAA.isAssumedTracked())
      return ICVTrackingAA.getUniqueReplacementValue(ICV);

    // If we don't know, assume it changes.
    return nullptr;
  }

  // We don't check unique value for a function, so return None.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return None;
  }
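
  // Note on the return convention used above and below: `None` means the
  // instruction does not affect the ICV, `nullptr` means the ICV value becomes
  // unknown, and any other value is the new, assumed ICV value.
  // getReplacementValue() below walks backwards from the query instruction,
  // first within its basic block and then through the terminators of all
  // predecessors, and only reports a value if every path agrees on it.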

  /// Return the value with which \p I can be replaced for specific \p ICV.
  Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                        const Instruction *I,
                                        Attributor &A) const override {
    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    SmallVector<const Instruction *, 16> Worklist;
    SmallPtrSet<const Instruction *, 16> Visited;
    Worklist.push_back(I);

    Optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      const Instruction *CurrInst = Worklist.pop_back_val();
      if (!Visited.insert(CurrInst).second)
        continue;

      const BasicBlock *CurrBB = CurrInst->getParent();

      // Go up and look for all potential setters/calls that might change the
      // ICV.
      while ((CurrInst = CurrInst->getPrevNode())) {
        if (ValuesMap.count(CurrInst)) {
          Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          // Unknown value, track new.
          if (!ReplVal.hasValue()) {
            ReplVal = NewReplVal;
            break;
          }

          // If we found a new value, we can't know the ICV value anymore.
          if (NewReplVal.hasValue())
            if (ReplVal != NewReplVal)
              return nullptr;

          break;
        }

        Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
        if (!NewReplVal.hasValue())
          continue;

        // Unknown value, track new.
        if (!ReplVal.hasValue()) {
          ReplVal = NewReplVal;
          break;
        }

        // We found a new value, we can't know the ICV value anymore.
        if (ReplVal != NewReplVal)
          return nullptr;
      }

      // If we are in the same BB and we have a value, we are done.
      if (CurrBB == I->getParent() && ReplVal.hasValue())
        return ReplVal;

      // Go through all predecessors and add terminators for analysis.
      for (const BasicBlock *Pred : predecessors(CurrBB))
        if (const Instruction *Terminator = Pred->getTerminator())
          Worklist.push_back(Terminator);
    }

    return ReplVal;
  }
};

struct AAICVTrackerFunctionReturned : AAICVTracker {
  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerFunctionReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to the value assumed at the function's return points.
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;
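
  // The two members below summarize, for each trackable ICV, the value the
  // ICV is assumed to have when this function returns: updateImpl() asks the
  // function-level tracker at every return instruction and keeps a value only
  // if all returns agree (nullptr records "unknown"), while
  // getUniqueReplacementValue() exposes that summary to call-site trackers.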

  /// Return the unique ICV value assumed at all return points, if any.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()));

    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> UniqueICVValue;

      auto CheckReturnInst = [&](Instruction &I) {
        Optional<Value *> NewReplVal =
            ICVTrackingAA.getReplacementValue(ICV, &I, A);

        // If we found a second ICV value there is no unique returned value.
        if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
          return false;

        UniqueICVValue = NewReplVal;

        return true;
      };

      if (!A.checkForAllInstructions(CheckReturnInst, *this,
                                     {Instruction::Ret},
                                     /* CheckBBLivenessOnly */ true))
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)
        continue;

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;
    }

    return Changed;
  }
};

struct AAICVTrackerCallSite : AAICVTracker {
  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();

    // We only initialize this AA for getters, so we need to know which ICV it
    // gets.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    for (InternalControlVar ICV : TrackableICVs) {
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;
        return;
      }
    }

    // Unknown ICV.
    indicatePessimisticFixpoint();
  }
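
  // manifest() below rewrites a getter call whose value is known: all uses of
  // the call are redirected to the tracked value and the call itself is
  // deleted. As an illustrative source-level example (not taken from this
  // file), assuming the usual nthreads setter/getter pair, in
  //
  //   omp_set_num_threads(4);
  //   int NT = omp_get_max_threads();
  //
  // the uses of NT can be replaced by the constant 4 and the getter call
  // removed, provided no intervening call may change the nthreads ICV.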

  ChangeStatus manifest(Attributor &A) override {
    if (!ReplVal.hasValue() || !ReplVal.getValue())
      return ChangeStatus::UNCHANGED;

    A.changeValueAfterManifest(*getCtxI(), **ReplVal);
    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;
  }

  // FIXME: come up with better string.
  const std::string getAsStr() const override { return "ICVTrackerCallSite"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  InternalControlVar AssociatedICV;
  Optional<Value *> ReplVal;

  ChangeStatus updateImpl(Attributor &A) override {
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()));

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    Optional<Value *> NewReplVal =
        ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;
  }

  /// Return the value with which the associated value can be replaced for the
  /// specific \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ReplVal;
  }
};

struct AAICVTrackerCallSiteReturned : AAICVTracker {
  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerCallSiteReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program point.
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the value with which the associated value can be replaced for the
  /// specific \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::returned(*getAssociatedFunction()));

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> NewReplVal =
          ICVTrackingAA.getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)
        continue;

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;
    }
    return Changed;
  }
};
} // namespace

const char AAICVTracker::ID = 0;

AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  AAICVTracker *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    llvm_unreachable("ICVTracker can only be created for function position!");
  case IRPosition::IRP_RETURNED:
    AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
    break;
  case IRPosition::IRP_CALL_SITE_RETURNED:
    AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
    break;
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
    break;
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
    break;
  }

  return *AA;
}

PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
                                     CGSCCAnalysisManager &AM,
                                     LazyCallGraph &CG, CGSCCUpdateResult &UR) {
  if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
    return PreservedAnalyses::all();

  if (DisableOpenMPOptimizations)
    return PreservedAnalyses::all();

  SmallVector<Function *, 16> SCC;
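
  // An SCC is only processed if it is "interesting": either the module
  // contains OpenMP kernels, in which case all SCCs are visited, or at least
  // one function in the SCC contains OpenMP runtime calls. The latter check
  // is expected to be cheap because containsOpenMP(), called above, records
  // once per module which functions contain such calls (see
  // RecordFunctionsContainingUsesOf further down in this file).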
  // If there are kernels in the module, we have to run on all SCCs.
  bool SCCIsInteresting = !OMPInModule.getKernels().empty();
  for (LazyCallGraph::Node &N : C) {
    Function *Fn = &N.getFunction();
    SCC.push_back(Fn);

    // Do we already know that the SCC contains kernels,
    // or that OpenMP functions are called from this SCC?
    if (SCCIsInteresting)
      continue;
    // If not, let's check that.
    SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
  }

  if (!SCCIsInteresting || SCC.empty())
    return PreservedAnalyses::all();

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();

  AnalysisGetter AG(FAM);

  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };

  CallGraphUpdater CGUpdater;
  CGUpdater.initialize(CG, C, AM, UR);

  SetVector<Function *> Functions(SCC.begin(), SCC.end());
  BumpPtrAllocator Allocator;
  OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
                                /*CGSCC*/ Functions, OMPInModule.getKernels());

  Attributor A(Functions, InfoCache, CGUpdater);

  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
  bool Changed = OMPOpt.run();
  if (Changed)
    return PreservedAnalyses::none();

  return PreservedAnalyses::all();
}

namespace {

struct OpenMPOptLegacyPass : public CallGraphSCCPass {
  CallGraphUpdater CGUpdater;
  OpenMPInModule OMPInModule;
  static char ID;

  OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
    initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  bool doInitialization(CallGraph &CG) override {
    // Disable the pass if there is no OpenMP (runtime call) in the module.
    containsOpenMP(CG.getModule(), OMPInModule);
    return false;
  }

  bool runOnSCC(CallGraphSCC &CGSCC) override {
    if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
      return false;
    if (DisableOpenMPOptimizations || skipSCC(CGSCC))
      return false;

    SmallVector<Function *, 16> SCC;
    // If there are kernels in the module, we have to run on all SCCs.
    bool SCCIsInteresting = !OMPInModule.getKernels().empty();
    for (CallGraphNode *CGN : CGSCC) {
      Function *Fn = CGN->getFunction();
      if (!Fn || Fn->isDeclaration())
        continue;
      SCC.push_back(Fn);

      // Do we already know that the SCC contains kernels,
      // or that OpenMP functions are called from this SCC?
      if (SCCIsInteresting)
        continue;
      // If not, let's check that.
      SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
    }

    if (!SCCIsInteresting || SCC.empty())
      return false;

    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
    CGUpdater.initialize(CG, CGSCC);
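
    // Unlike the new pass manager path above, which obtains an
    // OptimizationRemarkEmitter through the FunctionAnalysisManager, the
    // legacy pass has no per-function analysis manager available here, so
    // remark emitters are built lazily and cached per function in OREMap
    // below.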

    // Maintain a map of functions to avoid rebuilding the ORE.
    DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
    auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
      std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
      if (!ORE)
        ORE = std::make_unique<OptimizationRemarkEmitter>(F);
      return *ORE;
    };

    AnalysisGetter AG;
    SetVector<Function *> Functions(SCC.begin(), SCC.end());
    BumpPtrAllocator Allocator;
    OMPInformationCache InfoCache(
        *(Functions.back()->getParent()), AG, Allocator,
        /*CGSCC*/ Functions, OMPInModule.getKernels());

    Attributor A(Functions, InfoCache, CGUpdater);

    OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
    return OMPOpt.run();
  }

  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
};

} // end anonymous namespace

void OpenMPInModule::identifyKernels(Module &M) {

  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
  if (!MD)
    return;

  for (auto *Op : MD->operands()) {
    if (Op->getNumOperands() < 2)
      continue;
    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
    if (!KindID || KindID->getString() != "kernel")
      continue;

    Function *KernelFn =
        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
    if (!KernelFn)
      continue;

    ++NumOpenMPTargetRegionKernels;

    Kernels.insert(KernelFn);
  }
}

bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
  if (OMPInModule.isKnown())
    return OMPInModule;

  auto RecordFunctionsContainingUsesOf = [&](Function *F) {
    for (User *U : F->users())
      if (auto *I = dyn_cast<Instruction>(U))
        OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
  };

  // MSVC doesn't like long if-else chains for some reason and instead just
  // issues an error. Work around it.
  do {
#define OMP_RTL(_Enum, _Name, ...)                                            \
  if (Function *F = M.getFunction(_Name)) {                                   \
    RecordFunctionsContainingUsesOf(F);                                       \
    OMPInModule = true;                                                       \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  } while (false);

  // Identify kernels once. TODO: We should split the OMPInformationCache into
  // a module and an SCC part. The kernel information, among other things,
  // could go into the module part.
  if (OMPInModule.isKnown() && OMPInModule) {
    OMPInModule.identifyKernels(M);
    return true;
  }

  return OMPInModule = false;
}

char OpenMPOptLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
                      "OpenMP specific optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
                    "OpenMP specific optimizations", false, false)

Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }