1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/Analysis/TargetLibraryInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/InstVisitor.h" 32 #include "llvm/IR/InstrTypes.h" 33 #include "llvm/IR/Instruction.h" 34 #include "llvm/IR/Instructions.h" 35 #include "llvm/IR/LLVMContext.h" 36 #include "llvm/IR/PassManager.h" 37 #include "llvm/IR/Type.h" 38 #include "llvm/InitializePasses.h" 39 #include "llvm/Pass.h" 40 #include "llvm/PassRegistry.h" 41 #include "llvm/ProfileData/InstrProf.h" 42 #define INSTR_PROF_VALUE_PROF_MEMOP_API 43 #include "llvm/ProfileData/InstrProfData.inc" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/ErrorHandling.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Support/WithColor.h" 50 #include "llvm/Transforms/Instrumentation.h" 51 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 52 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 53 #include <cassert> 54 #include <cstdint> 55 #include <vector> 56 57 using namespace llvm; 58 59 #define DEBUG_TYPE "pgo-memop-opt" 60 61 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 62 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 63 64 // The minimum call count to optimize memory intrinsic calls. 65 static cl::opt<unsigned> 66 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 67 cl::init(1000), 68 cl::desc("The minimum count to optimize memory " 69 "intrinsic calls")); 70 71 // Command line option to disable memory intrinsic optimization. The default is 72 // false. This is for debug purpose. 73 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 74 cl::Hidden, cl::desc("Disable optimize")); 75 76 // The percent threshold to optimize memory intrinsic calls. 77 static cl::opt<unsigned> 78 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 79 cl::Hidden, cl::ZeroOrMore, 80 cl::desc("The percentage threshold for the " 81 "memory intrinsic calls optimization")); 82 83 // Maximum number of versions for optimizing memory intrinsic call. 84 static cl::opt<unsigned> 85 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 86 cl::ZeroOrMore, 87 cl::desc("The max version for the optimized memory " 88 " intrinsic calls")); 89 90 // Scale the counts from the annotation using the BB count value. 91 static cl::opt<bool> 92 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 93 cl::desc("Scale the memop size counts using the basic " 94 " block count value")); 95 96 cl::opt<bool> 97 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 98 cl::Hidden, 99 cl::desc("Size-specialize memcmp and bcmp calls")); 100 101 static cl::opt<unsigned> 102 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 103 cl::desc("Optimize the memop size <= this value")); 104 105 namespace { 106 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 107 public: 108 static char ID; 109 110 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 111 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 112 } 113 114 StringRef getPassName() const override { return "PGOMemOPSize"; } 115 116 private: 117 bool runOnFunction(Function &F) override; 118 void getAnalysisUsage(AnalysisUsage &AU) const override { 119 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 120 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 121 AU.addPreserved<GlobalsAAWrapperPass>(); 122 AU.addPreserved<DominatorTreeWrapperPass>(); 123 AU.addRequired<TargetLibraryInfoWrapperPass>(); 124 } 125 }; 126 } // end anonymous namespace 127 128 char PGOMemOPSizeOptLegacyPass::ID = 0; 129 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 130 "Optimize memory intrinsic using its size value profile", 131 false, false) 132 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 133 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 134 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 135 "Optimize memory intrinsic using its size value profile", 136 false, false) 137 138 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 139 return new PGOMemOPSizeOptLegacyPass(); 140 } 141 142 namespace { 143 144 static const char *getMIName(const MemIntrinsic *MI) { 145 switch (MI->getIntrinsicID()) { 146 case Intrinsic::memcpy: 147 return "memcpy"; 148 case Intrinsic::memmove: 149 return "memmove"; 150 case Intrinsic::memset: 151 return "memset"; 152 default: 153 return "unknown"; 154 } 155 } 156 157 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 158 struct MemOp { 159 Instruction *I; 160 MemOp(MemIntrinsic *MI) : I(MI) {} 161 MemOp(CallInst *CI) : I(CI) {} 162 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 163 CallInst *asCI() { return cast<CallInst>(I); } 164 MemOp clone() { 165 if (auto MI = asMI()) 166 return MemOp(cast<MemIntrinsic>(MI->clone())); 167 return MemOp(cast<CallInst>(asCI()->clone())); 168 } 169 Value *getLength() { 170 if (auto MI = asMI()) 171 return MI->getLength(); 172 return asCI()->getArgOperand(2); 173 } 174 void setLength(Value *Length) { 175 if (auto MI = asMI()) 176 return MI->setLength(Length); 177 asCI()->setArgOperand(2, Length); 178 } 179 StringRef getFuncName() { 180 if (auto MI = asMI()) 181 return MI->getCalledFunction()->getName(); 182 return asCI()->getCalledFunction()->getName(); 183 } 184 bool isMemmove() { 185 if (auto MI = asMI()) 186 if (MI->getIntrinsicID() == Intrinsic::memmove) 187 return true; 188 return false; 189 } 190 bool isMemcmp(TargetLibraryInfo &TLI) { 191 LibFunc Func; 192 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 193 Func == LibFunc_memcmp) { 194 return true; 195 } 196 return false; 197 } 198 bool isBcmp(TargetLibraryInfo &TLI) { 199 LibFunc Func; 200 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 201 Func == LibFunc_bcmp) { 202 return true; 203 } 204 return false; 205 } 206 const char *getName(TargetLibraryInfo &TLI) { 207 if (auto MI = asMI()) 208 return getMIName(MI); 209 LibFunc Func; 210 if (TLI.getLibFunc(*asCI(), Func)) { 211 if (Func == LibFunc_memcmp) 212 return "memcmp"; 213 if (Func == LibFunc_bcmp) 214 return "bcmp"; 215 } 216 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 217 return nullptr; 218 } 219 }; 220 221 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 222 public: 223 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 224 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 225 TargetLibraryInfo &TLI) 226 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 227 ValueDataArray = 228 std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); 229 } 230 bool isChanged() const { return Changed; } 231 void perform() { 232 WorkList.clear(); 233 visit(Func); 234 235 for (auto &MO : WorkList) { 236 ++NumOfPGOMemOPAnnotate; 237 if (perform(MO)) { 238 Changed = true; 239 ++NumOfPGOMemOPOpt; 240 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 241 << "is Transformed.\n"); 242 } 243 } 244 } 245 246 void visitMemIntrinsic(MemIntrinsic &MI) { 247 Value *Length = MI.getLength(); 248 // Not perform on constant length calls. 249 if (isa<ConstantInt>(Length)) 250 return; 251 WorkList.push_back(MemOp(&MI)); 252 } 253 254 void visitCallInst(CallInst &CI) { 255 LibFunc Func; 256 if (TLI.getLibFunc(CI, Func) && 257 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 258 !isa<ConstantInt>(CI.getArgOperand(2))) { 259 WorkList.push_back(MemOp(&CI)); 260 } 261 } 262 263 private: 264 Function &Func; 265 BlockFrequencyInfo &BFI; 266 OptimizationRemarkEmitter &ORE; 267 DominatorTree *DT; 268 TargetLibraryInfo &TLI; 269 bool Changed; 270 std::vector<MemOp> WorkList; 271 // The space to read the profile annotation. 272 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 273 bool perform(MemOp MO); 274 }; 275 276 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 277 assert(Count <= TotalCount); 278 if (Count < MemOPCountThreshold) 279 return false; 280 if (Count < TotalCount * MemOPPercentThreshold / 100) 281 return false; 282 return true; 283 } 284 285 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 286 uint64_t Denom) { 287 if (!MemOPScaleCount) 288 return Count; 289 bool Overflowed; 290 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 291 return ScaleCount / Denom; 292 } 293 294 bool MemOPSizeOpt::perform(MemOp MO) { 295 assert(MO.I); 296 if (MO.isMemmove()) 297 return false; 298 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 299 return false; 300 301 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; 302 uint64_t TotalCount; 303 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions, 304 ValueDataArray.get(), NumVals, TotalCount)) 305 return false; 306 307 uint64_t ActualCount = TotalCount; 308 uint64_t SavedTotalCount = TotalCount; 309 if (MemOPScaleCount) { 310 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 311 if (!BBEdgeCount) 312 return false; 313 ActualCount = *BBEdgeCount; 314 } 315 316 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 317 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 318 << ActualCount << "\n"); 319 LLVM_DEBUG( 320 for (auto &VD 321 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 322 323 if (ActualCount < MemOPCountThreshold) 324 return false; 325 // Skip if the total value profiled count is 0, in which case we can't 326 // scale up the counts properly (and there is no profitable transformation). 327 if (TotalCount == 0) 328 return false; 329 330 TotalCount = ActualCount; 331 if (MemOPScaleCount) 332 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 333 << " denominator = " << SavedTotalCount << "\n"); 334 335 // Keeping track of the count of the default case: 336 uint64_t RemainCount = TotalCount; 337 uint64_t SavedRemainCount = SavedTotalCount; 338 SmallVector<uint64_t, 16> SizeIds; 339 SmallVector<uint64_t, 16> CaseCounts; 340 uint64_t MaxCount = 0; 341 unsigned Version = 0; 342 int64_t LastV = -1; 343 // Default case is in the front -- save the slot here. 344 CaseCounts.push_back(0); 345 for (auto &VD : VDs) { 346 int64_t V = VD.Value; 347 uint64_t C = VD.Count; 348 if (MemOPScaleCount) 349 C = getScaledCount(C, ActualCount, SavedTotalCount); 350 351 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) 352 continue; 353 354 // ValueCounts are sorted on the count. Break at the first un-profitable 355 // value. 356 if (!isProfitable(C, RemainCount)) 357 break; 358 359 if (V == LastV) { 360 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName() 361 << ": Two consecutive, identical values in MemOp value" 362 "counts.\n"); 363 return false; 364 } 365 366 LastV = V; 367 368 SizeIds.push_back(V); 369 CaseCounts.push_back(C); 370 if (C > MaxCount) 371 MaxCount = C; 372 373 assert(RemainCount >= C); 374 RemainCount -= C; 375 assert(SavedRemainCount >= VD.Count); 376 SavedRemainCount -= VD.Count; 377 378 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) 379 break; 380 } 381 382 if (Version == 0) 383 return false; 384 385 CaseCounts[0] = RemainCount; 386 if (RemainCount > MaxCount) 387 MaxCount = RemainCount; 388 389 uint64_t SumForOpt = TotalCount - RemainCount; 390 391 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 392 << " Versions (covering " << SumForOpt << " out of " 393 << TotalCount << ")\n"); 394 395 // mem_op(..., size) 396 // ==> 397 // switch (size) { 398 // case s1: 399 // mem_op(..., s1); 400 // goto merge_bb; 401 // case s2: 402 // mem_op(..., s2); 403 // goto merge_bb; 404 // ... 405 // default: 406 // mem_op(..., size); 407 // goto merge_bb; 408 // } 409 // merge_bb: 410 411 BasicBlock *BB = MO.I->getParent(); 412 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 413 LLVM_DEBUG(dbgs() << *BB << "\n"); 414 auto OrigBBFreq = BFI.getBlockFreq(BB); 415 416 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 417 BasicBlock::iterator It(*MO.I); 418 ++It; 419 assert(It != DefaultBB->end()); 420 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 421 MergeBB->setName("MemOP.Merge"); 422 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 423 DefaultBB->setName("MemOP.Default"); 424 425 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 426 auto &Ctx = Func.getContext(); 427 IRBuilder<> IRB(BB); 428 BB->getTerminator()->eraseFromParent(); 429 Value *SizeVar = MO.getLength(); 430 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 431 Type *MemOpTy = MO.I->getType(); 432 PHINode *PHI = nullptr; 433 if (!MemOpTy->isVoidTy()) { 434 // Insert a phi for the return values at the merge block. 435 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 436 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 437 MO.I->replaceAllUsesWith(PHI); 438 PHI->addIncoming(MO.I, DefaultBB); 439 } 440 441 // Clear the value profile data. 442 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 443 // If all promoted, we don't need the MD.prof metadata. 444 if (SavedRemainCount > 0 || Version != NumVals) 445 // Otherwise we need update with the un-promoted records back. 446 annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version), 447 SavedRemainCount, IPVK_MemOPSize, NumVals); 448 449 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 450 451 std::vector<DominatorTree::UpdateType> Updates; 452 if (DT) 453 Updates.reserve(2 * SizeIds.size()); 454 455 for (uint64_t SizeId : SizeIds) { 456 BasicBlock *CaseBB = BasicBlock::Create( 457 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 458 MemOp NewMO = MO.clone(); 459 // Fix the argument. 460 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 461 assert(SizeType && "Expected integer type size argument."); 462 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 463 NewMO.setLength(CaseSizeId); 464 CaseBB->getInstList().push_back(NewMO.I); 465 IRBuilder<> IRBCase(CaseBB); 466 IRBCase.CreateBr(MergeBB); 467 SI->addCase(CaseSizeId, CaseBB); 468 if (!MemOpTy->isVoidTy()) 469 PHI->addIncoming(NewMO.I, CaseBB); 470 if (DT) { 471 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 472 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 473 } 474 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 475 } 476 DTU.applyUpdates(Updates); 477 Updates.clear(); 478 479 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 480 481 LLVM_DEBUG(dbgs() << *BB << "\n"); 482 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 483 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 484 485 ORE.emit([&]() { 486 using namespace ore; 487 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 488 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 489 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 490 << " for " << NV("Versions", Version) << " versions"; 491 }); 492 493 return true; 494 } 495 } // namespace 496 497 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 498 OptimizationRemarkEmitter &ORE, 499 DominatorTree *DT, TargetLibraryInfo &TLI) { 500 if (DisableMemOPOPT) 501 return false; 502 503 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 504 return false; 505 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 506 MemOPSizeOpt.perform(); 507 return MemOPSizeOpt.isChanged(); 508 } 509 510 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 511 BlockFrequencyInfo &BFI = 512 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 513 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 514 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 515 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 516 TargetLibraryInfo &TLI = 517 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 518 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 519 } 520 521 namespace llvm { 522 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 523 524 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 525 FunctionAnalysisManager &FAM) { 526 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 527 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 528 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 529 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 530 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 531 if (!Changed) 532 return PreservedAnalyses::all(); 533 auto PA = PreservedAnalyses(); 534 PA.preserve<GlobalsAA>(); 535 PA.preserve<DominatorTreeAnalysis>(); 536 return PA; 537 } 538 } // namespace llvm 539