1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 25 #include "llvm/Analysis/TargetLibraryInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/InstVisitor.h" 32 #include "llvm/IR/Instruction.h" 33 #include "llvm/IR/Instructions.h" 34 #include "llvm/IR/LLVMContext.h" 35 #include "llvm/IR/PassManager.h" 36 #include "llvm/IR/Type.h" 37 #include "llvm/InitializePasses.h" 38 #include "llvm/Pass.h" 39 #include "llvm/PassRegistry.h" 40 #include "llvm/ProfileData/InstrProf.h" 41 #define INSTR_PROF_VALUE_PROF_MEMOP_API 42 #include "llvm/ProfileData/InstrProfData.inc" 43 #include "llvm/Support/Casting.h" 44 #include "llvm/Support/CommandLine.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/MathExtras.h" 48 #include "llvm/Transforms/Instrumentation.h" 49 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 50 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 51 #include <cassert> 52 #include <cstdint> 53 #include <vector> 54 55 using namespace llvm; 56 57 #define DEBUG_TYPE "pgo-memop-opt" 58 59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 61 62 // The minimum call count to optimize memory intrinsic calls. 63 static cl::opt<unsigned> 64 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 65 cl::init(1000), 66 cl::desc("The minimum count to optimize memory " 67 "intrinsic calls")); 68 69 // Command line option to disable memory intrinsic optimization. The default is 70 // false. This is for debug purpose. 71 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 72 cl::Hidden, cl::desc("Disable optimize")); 73 74 // The percent threshold to optimize memory intrinsic calls. 75 static cl::opt<unsigned> 76 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 77 cl::Hidden, cl::ZeroOrMore, 78 cl::desc("The percentage threshold for the " 79 "memory intrinsic calls optimization")); 80 81 // Maximum number of versions for optimizing memory intrinsic call. 82 static cl::opt<unsigned> 83 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 84 cl::ZeroOrMore, 85 cl::desc("The max version for the optimized memory " 86 " intrinsic calls")); 87 88 // Scale the counts from the annotation using the BB count value. 89 static cl::opt<bool> 90 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 91 cl::desc("Scale the memop size counts using the basic " 92 " block count value")); 93 94 cl::opt<bool> 95 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 96 cl::Hidden, 97 cl::desc("Size-specialize memcmp and bcmp calls")); 98 99 static cl::opt<unsigned> 100 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 101 cl::desc("Optimize the memop size <= this value")); 102 103 namespace { 104 105 static const char *getMIName(const MemIntrinsic *MI) { 106 switch (MI->getIntrinsicID()) { 107 case Intrinsic::memcpy: 108 return "memcpy"; 109 case Intrinsic::memmove: 110 return "memmove"; 111 case Intrinsic::memset: 112 return "memset"; 113 default: 114 return "unknown"; 115 } 116 } 117 118 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 119 struct MemOp { 120 Instruction *I; 121 MemOp(MemIntrinsic *MI) : I(MI) {} 122 MemOp(CallInst *CI) : I(CI) {} 123 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 124 CallInst *asCI() { return cast<CallInst>(I); } 125 MemOp clone() { 126 if (auto MI = asMI()) 127 return MemOp(cast<MemIntrinsic>(MI->clone())); 128 return MemOp(cast<CallInst>(asCI()->clone())); 129 } 130 Value *getLength() { 131 if (auto MI = asMI()) 132 return MI->getLength(); 133 return asCI()->getArgOperand(2); 134 } 135 void setLength(Value *Length) { 136 if (auto MI = asMI()) 137 return MI->setLength(Length); 138 asCI()->setArgOperand(2, Length); 139 } 140 StringRef getFuncName() { 141 if (auto MI = asMI()) 142 return MI->getCalledFunction()->getName(); 143 return asCI()->getCalledFunction()->getName(); 144 } 145 bool isMemmove() { 146 if (auto MI = asMI()) 147 if (MI->getIntrinsicID() == Intrinsic::memmove) 148 return true; 149 return false; 150 } 151 bool isMemcmp(TargetLibraryInfo &TLI) { 152 LibFunc Func; 153 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 154 Func == LibFunc_memcmp) { 155 return true; 156 } 157 return false; 158 } 159 bool isBcmp(TargetLibraryInfo &TLI) { 160 LibFunc Func; 161 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 162 Func == LibFunc_bcmp) { 163 return true; 164 } 165 return false; 166 } 167 const char *getName(TargetLibraryInfo &TLI) { 168 if (auto MI = asMI()) 169 return getMIName(MI); 170 LibFunc Func; 171 if (TLI.getLibFunc(*asCI(), Func)) { 172 if (Func == LibFunc_memcmp) 173 return "memcmp"; 174 if (Func == LibFunc_bcmp) 175 return "bcmp"; 176 } 177 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 178 return nullptr; 179 } 180 }; 181 182 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 183 public: 184 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 185 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 186 TargetLibraryInfo &TLI) 187 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 188 ValueDataArray = 189 std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS); 190 } 191 bool isChanged() const { return Changed; } 192 void perform() { 193 WorkList.clear(); 194 visit(Func); 195 196 for (auto &MO : WorkList) { 197 ++NumOfPGOMemOPAnnotate; 198 if (perform(MO)) { 199 Changed = true; 200 ++NumOfPGOMemOPOpt; 201 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 202 << "is Transformed.\n"); 203 } 204 } 205 } 206 207 void visitMemIntrinsic(MemIntrinsic &MI) { 208 Value *Length = MI.getLength(); 209 // Not perform on constant length calls. 210 if (isa<ConstantInt>(Length)) 211 return; 212 WorkList.push_back(MemOp(&MI)); 213 } 214 215 void visitCallInst(CallInst &CI) { 216 LibFunc Func; 217 if (TLI.getLibFunc(CI, Func) && 218 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 219 !isa<ConstantInt>(CI.getArgOperand(2))) { 220 WorkList.push_back(MemOp(&CI)); 221 } 222 } 223 224 private: 225 Function &Func; 226 BlockFrequencyInfo &BFI; 227 OptimizationRemarkEmitter &ORE; 228 DominatorTree *DT; 229 TargetLibraryInfo &TLI; 230 bool Changed; 231 std::vector<MemOp> WorkList; 232 // The space to read the profile annotation. 233 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 234 bool perform(MemOp MO); 235 }; 236 237 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 238 assert(Count <= TotalCount); 239 if (Count < MemOPCountThreshold) 240 return false; 241 if (Count < TotalCount * MemOPPercentThreshold / 100) 242 return false; 243 return true; 244 } 245 246 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 247 uint64_t Denom) { 248 if (!MemOPScaleCount) 249 return Count; 250 bool Overflowed; 251 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 252 return ScaleCount / Denom; 253 } 254 255 bool MemOPSizeOpt::perform(MemOp MO) { 256 assert(MO.I); 257 if (MO.isMemmove()) 258 return false; 259 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 260 return false; 261 262 uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; 263 uint64_t TotalCount; 264 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, 265 ValueDataArray.get(), NumVals, TotalCount)) 266 return false; 267 268 uint64_t ActualCount = TotalCount; 269 uint64_t SavedTotalCount = TotalCount; 270 if (MemOPScaleCount) { 271 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 272 if (!BBEdgeCount) 273 return false; 274 ActualCount = *BBEdgeCount; 275 } 276 277 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 278 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 279 << ActualCount << "\n"); 280 LLVM_DEBUG( 281 for (auto &VD 282 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 283 284 if (ActualCount < MemOPCountThreshold) 285 return false; 286 // Skip if the total value profiled count is 0, in which case we can't 287 // scale up the counts properly (and there is no profitable transformation). 288 if (TotalCount == 0) 289 return false; 290 291 TotalCount = ActualCount; 292 if (MemOPScaleCount) 293 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 294 << " denominator = " << SavedTotalCount << "\n"); 295 296 // Keeping track of the count of the default case: 297 uint64_t RemainCount = TotalCount; 298 uint64_t SavedRemainCount = SavedTotalCount; 299 SmallVector<uint64_t, 16> SizeIds; 300 SmallVector<uint64_t, 16> CaseCounts; 301 uint64_t MaxCount = 0; 302 unsigned Version = 0; 303 int64_t LastV = -1; 304 // Default case is in the front -- save the slot here. 305 CaseCounts.push_back(0); 306 SmallVector<InstrProfValueData, 24> RemainingVDs; 307 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { 308 auto &VD = *I; 309 int64_t V = VD.Value; 310 uint64_t C = VD.Count; 311 if (MemOPScaleCount) 312 C = getScaledCount(C, ActualCount, SavedTotalCount); 313 314 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { 315 RemainingVDs.push_back(VD); 316 continue; 317 } 318 319 // ValueCounts are sorted on the count. Break at the first un-profitable 320 // value. 321 if (!isProfitable(C, RemainCount)) { 322 RemainingVDs.insert(RemainingVDs.end(), I, E); 323 break; 324 } 325 326 if (V == LastV) { 327 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName() 328 << ": Two consecutive, identical values in MemOp value" 329 "counts.\n"); 330 return false; 331 } 332 333 LastV = V; 334 335 SizeIds.push_back(V); 336 CaseCounts.push_back(C); 337 if (C > MaxCount) 338 MaxCount = C; 339 340 assert(RemainCount >= C); 341 RemainCount -= C; 342 assert(SavedRemainCount >= VD.Count); 343 SavedRemainCount -= VD.Count; 344 345 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { 346 RemainingVDs.insert(RemainingVDs.end(), I + 1, E); 347 break; 348 } 349 } 350 351 if (Version == 0) 352 return false; 353 354 CaseCounts[0] = RemainCount; 355 if (RemainCount > MaxCount) 356 MaxCount = RemainCount; 357 358 uint64_t SumForOpt = TotalCount - RemainCount; 359 360 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 361 << " Versions (covering " << SumForOpt << " out of " 362 << TotalCount << ")\n"); 363 364 // mem_op(..., size) 365 // ==> 366 // switch (size) { 367 // case s1: 368 // mem_op(..., s1); 369 // goto merge_bb; 370 // case s2: 371 // mem_op(..., s2); 372 // goto merge_bb; 373 // ... 374 // default: 375 // mem_op(..., size); 376 // goto merge_bb; 377 // } 378 // merge_bb: 379 380 BasicBlock *BB = MO.I->getParent(); 381 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 382 LLVM_DEBUG(dbgs() << *BB << "\n"); 383 auto OrigBBFreq = BFI.getBlockFreq(BB); 384 385 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 386 BasicBlock::iterator It(*MO.I); 387 ++It; 388 assert(It != DefaultBB->end()); 389 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 390 MergeBB->setName("MemOP.Merge"); 391 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 392 DefaultBB->setName("MemOP.Default"); 393 394 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 395 auto &Ctx = Func.getContext(); 396 IRBuilder<> IRB(BB); 397 BB->getTerminator()->eraseFromParent(); 398 Value *SizeVar = MO.getLength(); 399 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 400 Type *MemOpTy = MO.I->getType(); 401 PHINode *PHI = nullptr; 402 if (!MemOpTy->isVoidTy()) { 403 // Insert a phi for the return values at the merge block. 404 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 405 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 406 MO.I->replaceAllUsesWith(PHI); 407 PHI->addIncoming(MO.I, DefaultBB); 408 } 409 410 // Clear the value profile data. 411 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 412 // If all promoted, we don't need the MD.prof metadata. 413 if (SavedRemainCount > 0 || Version != NumVals) { 414 // Otherwise we need update with the un-promoted records back. 415 ArrayRef<InstrProfValueData> RemVDs(RemainingVDs); 416 annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount, 417 IPVK_MemOPSize, NumVals); 418 } 419 420 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 421 422 std::vector<DominatorTree::UpdateType> Updates; 423 if (DT) 424 Updates.reserve(2 * SizeIds.size()); 425 426 for (uint64_t SizeId : SizeIds) { 427 BasicBlock *CaseBB = BasicBlock::Create( 428 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 429 MemOp NewMO = MO.clone(); 430 // Fix the argument. 431 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 432 assert(SizeType && "Expected integer type size argument."); 433 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 434 NewMO.setLength(CaseSizeId); 435 CaseBB->getInstList().push_back(NewMO.I); 436 IRBuilder<> IRBCase(CaseBB); 437 IRBCase.CreateBr(MergeBB); 438 SI->addCase(CaseSizeId, CaseBB); 439 if (!MemOpTy->isVoidTy()) 440 PHI->addIncoming(NewMO.I, CaseBB); 441 if (DT) { 442 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 443 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 444 } 445 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 446 } 447 DTU.applyUpdates(Updates); 448 Updates.clear(); 449 450 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 451 452 LLVM_DEBUG(dbgs() << *BB << "\n"); 453 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 454 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 455 456 ORE.emit([&]() { 457 using namespace ore; 458 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 459 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 460 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 461 << " for " << NV("Versions", Version) << " versions"; 462 }); 463 464 return true; 465 } 466 } // namespace 467 468 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 469 OptimizationRemarkEmitter &ORE, 470 DominatorTree *DT, TargetLibraryInfo &TLI) { 471 if (DisableMemOPOPT) 472 return false; 473 474 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 475 return false; 476 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 477 MemOPSizeOpt.perform(); 478 return MemOPSizeOpt.isChanged(); 479 } 480 481 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 482 FunctionAnalysisManager &FAM) { 483 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 484 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 485 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 486 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 487 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 488 if (!Changed) 489 return PreservedAnalyses::all(); 490 auto PA = PreservedAnalyses(); 491 PA.preserve<DominatorTreeAnalysis>(); 492 return PA; 493 } 494