1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/Analysis/TargetLibraryInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/InstVisitor.h" 32 #include "llvm/IR/Instruction.h" 33 #include "llvm/IR/Instructions.h" 34 #include "llvm/IR/LLVMContext.h" 35 #include "llvm/IR/PassManager.h" 36 #include "llvm/IR/Type.h" 37 #include "llvm/InitializePasses.h" 38 #include "llvm/Pass.h" 39 #include "llvm/PassRegistry.h" 40 #include "llvm/ProfileData/InstrProf.h" 41 #define INSTR_PROF_VALUE_PROF_MEMOP_API 42 #include "llvm/ProfileData/InstrProfData.inc" 43 #include "llvm/Support/Casting.h" 44 #include "llvm/Support/CommandLine.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/MathExtras.h" 48 #include "llvm/Transforms/Instrumentation.h" 49 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 50 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 51 #include <cassert> 52 #include <cstdint> 53 #include <vector> 54 55 using namespace llvm; 56 57 #define DEBUG_TYPE "pgo-memop-opt" 58 59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 61 62 // The minimum call count to optimize memory intrinsic calls. 63 static cl::opt<unsigned> 64 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 65 cl::init(1000), 66 cl::desc("The minimum count to optimize memory " 67 "intrinsic calls")); 68 69 // Command line option to disable memory intrinsic optimization. The default is 70 // false. This is for debug purpose. 71 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 72 cl::Hidden, cl::desc("Disable optimize")); 73 74 // The percent threshold to optimize memory intrinsic calls. 75 static cl::opt<unsigned> 76 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 77 cl::Hidden, cl::ZeroOrMore, 78 cl::desc("The percentage threshold for the " 79 "memory intrinsic calls optimization")); 80 81 // Maximum number of versions for optimizing memory intrinsic call. 82 static cl::opt<unsigned> 83 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 84 cl::ZeroOrMore, 85 cl::desc("The max version for the optimized memory " 86 " intrinsic calls")); 87 88 // Scale the counts from the annotation using the BB count value. 89 static cl::opt<bool> 90 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 91 cl::desc("Scale the memop size counts using the basic " 92 " block count value")); 93 94 cl::opt<bool> 95 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 96 cl::Hidden, 97 cl::desc("Size-specialize memcmp and bcmp calls")); 98 99 static cl::opt<unsigned> 100 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 101 cl::desc("Optimize the memop size <= this value")); 102 103 namespace { 104 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 105 public: 106 static char ID; 107 108 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 109 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 110 } 111 112 StringRef getPassName() const override { return "PGOMemOPSize"; } 113 114 private: 115 bool runOnFunction(Function &F) override; 116 void getAnalysisUsage(AnalysisUsage &AU) const override { 117 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 118 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 119 AU.addPreserved<GlobalsAAWrapperPass>(); 120 AU.addPreserved<DominatorTreeWrapperPass>(); 121 AU.addRequired<TargetLibraryInfoWrapperPass>(); 122 } 123 }; 124 } // end anonymous namespace 125 126 char PGOMemOPSizeOptLegacyPass::ID = 0; 127 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 128 "Optimize memory intrinsic using its size value profile", 129 false, false) 130 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 131 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 132 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 133 "Optimize memory intrinsic using its size value profile", 134 false, false) 135 136 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 137 return new PGOMemOPSizeOptLegacyPass(); 138 } 139 140 namespace { 141 142 static const char *getMIName(const MemIntrinsic *MI) { 143 switch (MI->getIntrinsicID()) { 144 case Intrinsic::memcpy: 145 return "memcpy"; 146 case Intrinsic::memmove: 147 return "memmove"; 148 case Intrinsic::memset: 149 return "memset"; 150 default: 151 return "unknown"; 152 } 153 } 154 155 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 156 struct MemOp { 157 Instruction *I; 158 MemOp(MemIntrinsic *MI) : I(MI) {} 159 MemOp(CallInst *CI) : I(CI) {} 160 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 161 CallInst *asCI() { return cast<CallInst>(I); } 162 MemOp clone() { 163 if (auto MI = asMI()) 164 return MemOp(cast<MemIntrinsic>(MI->clone())); 165 return MemOp(cast<CallInst>(asCI()->clone())); 166 } 167 Value *getLength() { 168 if (auto MI = asMI()) 169 return MI->getLength(); 170 return asCI()->getArgOperand(2); 171 } 172 void setLength(Value *Length) { 173 if (auto MI = asMI()) 174 return MI->setLength(Length); 175 asCI()->setArgOperand(2, Length); 176 } 177 StringRef getFuncName() { 178 if (auto MI = asMI()) 179 return MI->getCalledFunction()->getName(); 180 return asCI()->getCalledFunction()->getName(); 181 } 182 bool isMemmove() { 183 if (auto MI = asMI()) 184 if (MI->getIntrinsicID() == Intrinsic::memmove) 185 return true; 186 return false; 187 } 188 bool isMemcmp(TargetLibraryInfo &TLI) { 189 LibFunc Func; 190 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 191 Func == LibFunc_memcmp) { 192 return true; 193 } 194 return false; 195 } 196 bool isBcmp(TargetLibraryInfo &TLI) { 197 LibFunc Func; 198 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 199 Func == LibFunc_bcmp) { 200 return true; 201 } 202 return false; 203 } 204 const char *getName(TargetLibraryInfo &TLI) { 205 if (auto MI = asMI()) 206 return getMIName(MI); 207 LibFunc Func; 208 if (TLI.getLibFunc(*asCI(), Func)) { 209 if (Func == LibFunc_memcmp) 210 return "memcmp"; 211 if (Func == LibFunc_bcmp) 212 return "bcmp"; 213 } 214 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 215 return nullptr; 216 } 217 }; 218 219 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 220 public: 221 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 222 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 223 TargetLibraryInfo &TLI) 224 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 225 ValueDataArray = 226 std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS); 227 } 228 bool isChanged() const { return Changed; } 229 void perform() { 230 WorkList.clear(); 231 visit(Func); 232 233 for (auto &MO : WorkList) { 234 ++NumOfPGOMemOPAnnotate; 235 if (perform(MO)) { 236 Changed = true; 237 ++NumOfPGOMemOPOpt; 238 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 239 << "is Transformed.\n"); 240 } 241 } 242 } 243 244 void visitMemIntrinsic(MemIntrinsic &MI) { 245 Value *Length = MI.getLength(); 246 // Not perform on constant length calls. 247 if (isa<ConstantInt>(Length)) 248 return; 249 WorkList.push_back(MemOp(&MI)); 250 } 251 252 void visitCallInst(CallInst &CI) { 253 LibFunc Func; 254 if (TLI.getLibFunc(CI, Func) && 255 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 256 !isa<ConstantInt>(CI.getArgOperand(2))) { 257 WorkList.push_back(MemOp(&CI)); 258 } 259 } 260 261 private: 262 Function &Func; 263 BlockFrequencyInfo &BFI; 264 OptimizationRemarkEmitter &ORE; 265 DominatorTree *DT; 266 TargetLibraryInfo &TLI; 267 bool Changed; 268 std::vector<MemOp> WorkList; 269 // The space to read the profile annotation. 270 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 271 bool perform(MemOp MO); 272 }; 273 274 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 275 assert(Count <= TotalCount); 276 if (Count < MemOPCountThreshold) 277 return false; 278 if (Count < TotalCount * MemOPPercentThreshold / 100) 279 return false; 280 return true; 281 } 282 283 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 284 uint64_t Denom) { 285 if (!MemOPScaleCount) 286 return Count; 287 bool Overflowed; 288 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 289 return ScaleCount / Denom; 290 } 291 292 bool MemOPSizeOpt::perform(MemOp MO) { 293 assert(MO.I); 294 if (MO.isMemmove()) 295 return false; 296 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 297 return false; 298 299 uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; 300 uint64_t TotalCount; 301 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, 302 ValueDataArray.get(), NumVals, TotalCount)) 303 return false; 304 305 uint64_t ActualCount = TotalCount; 306 uint64_t SavedTotalCount = TotalCount; 307 if (MemOPScaleCount) { 308 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 309 if (!BBEdgeCount) 310 return false; 311 ActualCount = *BBEdgeCount; 312 } 313 314 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 315 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 316 << ActualCount << "\n"); 317 LLVM_DEBUG( 318 for (auto &VD 319 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 320 321 if (ActualCount < MemOPCountThreshold) 322 return false; 323 // Skip if the total value profiled count is 0, in which case we can't 324 // scale up the counts properly (and there is no profitable transformation). 325 if (TotalCount == 0) 326 return false; 327 328 TotalCount = ActualCount; 329 if (MemOPScaleCount) 330 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 331 << " denominator = " << SavedTotalCount << "\n"); 332 333 // Keeping track of the count of the default case: 334 uint64_t RemainCount = TotalCount; 335 uint64_t SavedRemainCount = SavedTotalCount; 336 SmallVector<uint64_t, 16> SizeIds; 337 SmallVector<uint64_t, 16> CaseCounts; 338 uint64_t MaxCount = 0; 339 unsigned Version = 0; 340 int64_t LastV = -1; 341 // Default case is in the front -- save the slot here. 342 CaseCounts.push_back(0); 343 SmallVector<InstrProfValueData, 24> RemainingVDs; 344 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { 345 auto &VD = *I; 346 int64_t V = VD.Value; 347 uint64_t C = VD.Count; 348 if (MemOPScaleCount) 349 C = getScaledCount(C, ActualCount, SavedTotalCount); 350 351 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { 352 RemainingVDs.push_back(VD); 353 continue; 354 } 355 356 // ValueCounts are sorted on the count. Break at the first un-profitable 357 // value. 358 if (!isProfitable(C, RemainCount)) { 359 RemainingVDs.insert(RemainingVDs.end(), I, E); 360 break; 361 } 362 363 if (V == LastV) { 364 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName() 365 << ": Two consecutive, identical values in MemOp value" 366 "counts.\n"); 367 return false; 368 } 369 370 LastV = V; 371 372 SizeIds.push_back(V); 373 CaseCounts.push_back(C); 374 if (C > MaxCount) 375 MaxCount = C; 376 377 assert(RemainCount >= C); 378 RemainCount -= C; 379 assert(SavedRemainCount >= VD.Count); 380 SavedRemainCount -= VD.Count; 381 382 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { 383 RemainingVDs.insert(RemainingVDs.end(), I + 1, E); 384 break; 385 } 386 } 387 388 if (Version == 0) 389 return false; 390 391 CaseCounts[0] = RemainCount; 392 if (RemainCount > MaxCount) 393 MaxCount = RemainCount; 394 395 uint64_t SumForOpt = TotalCount - RemainCount; 396 397 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 398 << " Versions (covering " << SumForOpt << " out of " 399 << TotalCount << ")\n"); 400 401 // mem_op(..., size) 402 // ==> 403 // switch (size) { 404 // case s1: 405 // mem_op(..., s1); 406 // goto merge_bb; 407 // case s2: 408 // mem_op(..., s2); 409 // goto merge_bb; 410 // ... 411 // default: 412 // mem_op(..., size); 413 // goto merge_bb; 414 // } 415 // merge_bb: 416 417 BasicBlock *BB = MO.I->getParent(); 418 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 419 LLVM_DEBUG(dbgs() << *BB << "\n"); 420 auto OrigBBFreq = BFI.getBlockFreq(BB); 421 422 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 423 BasicBlock::iterator It(*MO.I); 424 ++It; 425 assert(It != DefaultBB->end()); 426 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 427 MergeBB->setName("MemOP.Merge"); 428 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 429 DefaultBB->setName("MemOP.Default"); 430 431 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 432 auto &Ctx = Func.getContext(); 433 IRBuilder<> IRB(BB); 434 BB->getTerminator()->eraseFromParent(); 435 Value *SizeVar = MO.getLength(); 436 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 437 Type *MemOpTy = MO.I->getType(); 438 PHINode *PHI = nullptr; 439 if (!MemOpTy->isVoidTy()) { 440 // Insert a phi for the return values at the merge block. 441 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 442 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 443 MO.I->replaceAllUsesWith(PHI); 444 PHI->addIncoming(MO.I, DefaultBB); 445 } 446 447 // Clear the value profile data. 448 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 449 // If all promoted, we don't need the MD.prof metadata. 450 if (SavedRemainCount > 0 || Version != NumVals) { 451 // Otherwise we need update with the un-promoted records back. 452 ArrayRef<InstrProfValueData> RemVDs(RemainingVDs); 453 annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount, 454 IPVK_MemOPSize, NumVals); 455 } 456 457 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 458 459 std::vector<DominatorTree::UpdateType> Updates; 460 if (DT) 461 Updates.reserve(2 * SizeIds.size()); 462 463 for (uint64_t SizeId : SizeIds) { 464 BasicBlock *CaseBB = BasicBlock::Create( 465 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 466 MemOp NewMO = MO.clone(); 467 // Fix the argument. 468 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 469 assert(SizeType && "Expected integer type size argument."); 470 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 471 NewMO.setLength(CaseSizeId); 472 CaseBB->getInstList().push_back(NewMO.I); 473 IRBuilder<> IRBCase(CaseBB); 474 IRBCase.CreateBr(MergeBB); 475 SI->addCase(CaseSizeId, CaseBB); 476 if (!MemOpTy->isVoidTy()) 477 PHI->addIncoming(NewMO.I, CaseBB); 478 if (DT) { 479 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 480 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 481 } 482 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 483 } 484 DTU.applyUpdates(Updates); 485 Updates.clear(); 486 487 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 488 489 LLVM_DEBUG(dbgs() << *BB << "\n"); 490 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 491 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 492 493 ORE.emit([&]() { 494 using namespace ore; 495 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 496 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 497 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 498 << " for " << NV("Versions", Version) << " versions"; 499 }); 500 501 return true; 502 } 503 } // namespace 504 505 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 506 OptimizationRemarkEmitter &ORE, 507 DominatorTree *DT, TargetLibraryInfo &TLI) { 508 if (DisableMemOPOPT) 509 return false; 510 511 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 512 return false; 513 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 514 MemOPSizeOpt.perform(); 515 return MemOPSizeOpt.isChanged(); 516 } 517 518 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 519 BlockFrequencyInfo &BFI = 520 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 521 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 522 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 523 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 524 TargetLibraryInfo &TLI = 525 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 526 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 527 } 528 529 namespace llvm { 530 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 531 532 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 533 FunctionAnalysisManager &FAM) { 534 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 535 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 536 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 537 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 538 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 539 if (!Changed) 540 return PreservedAnalyses::all(); 541 auto PA = PreservedAnalyses(); 542 PA.preserve<DominatorTreeAnalysis>(); 543 return PA; 544 } 545 } // namespace llvm 546