1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the transformation that optimizes memory intrinsics 10 // such as memcpy using the size value profile. When memory intrinsic size 11 // value profile metadata is available, a single memory intrinsic is expanded 12 // to a sequence of guarded specialized versions that are called with the 13 // hottest size(s), for later expansion into more optimal inline sequences. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/ADT/Twine.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/DomTreeUpdater.h" 23 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 24 #include "llvm/Analysis/TargetLibraryInfo.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/Dominators.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/InstVisitor.h" 31 #include "llvm/IR/Instruction.h" 32 #include "llvm/IR/Instructions.h" 33 #include "llvm/IR/LLVMContext.h" 34 #include "llvm/IR/PassManager.h" 35 #include "llvm/IR/Type.h" 36 #include "llvm/ProfileData/InstrProf.h" 37 #define INSTR_PROF_VALUE_PROF_MEMOP_API 38 #include "llvm/ProfileData/InstrProfData.inc" 39 #include "llvm/Support/Casting.h" 40 #include "llvm/Support/CommandLine.h" 41 #include "llvm/Support/Debug.h" 42 #include "llvm/Support/ErrorHandling.h" 43 #include "llvm/Support/MathExtras.h" 44 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 45 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 46 #include <cassert> 47 #include <cstdint> 48 #include <vector> 49 50 using namespace llvm; 51 52 #define DEBUG_TYPE "pgo-memop-opt" 53 54 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 55 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 56 57 // The minimum call count to optimize memory intrinsic calls. 58 static cl::opt<unsigned> 59 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 60 cl::init(1000), 61 cl::desc("The minimum count to optimize memory " 62 "intrinsic calls")); 63 64 // Command line option to disable memory intrinsic optimization. The default is 65 // false. This is for debug purpose. 66 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 67 cl::Hidden, cl::desc("Disable optimize")); 68 69 // The percent threshold to optimize memory intrinsic calls. 70 static cl::opt<unsigned> 71 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 72 cl::Hidden, cl::ZeroOrMore, 73 cl::desc("The percentage threshold for the " 74 "memory intrinsic calls optimization")); 75 76 // Maximum number of versions for optimizing memory intrinsic call. 77 static cl::opt<unsigned> 78 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 79 cl::ZeroOrMore, 80 cl::desc("The max version for the optimized memory " 81 " intrinsic calls")); 82 83 // Scale the counts from the annotation using the BB count value. 84 static cl::opt<bool> 85 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 86 cl::desc("Scale the memop size counts using the basic " 87 " block count value")); 88 89 cl::opt<bool> 90 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), 91 cl::Hidden, 92 cl::desc("Size-specialize memcmp and bcmp calls")); 93 94 static cl::opt<unsigned> 95 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), 96 cl::desc("Optimize the memop size <= this value")); 97 98 namespace { 99 100 static const char *getMIName(const MemIntrinsic *MI) { 101 switch (MI->getIntrinsicID()) { 102 case Intrinsic::memcpy: 103 return "memcpy"; 104 case Intrinsic::memmove: 105 return "memmove"; 106 case Intrinsic::memset: 107 return "memset"; 108 default: 109 return "unknown"; 110 } 111 } 112 113 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). 114 struct MemOp { 115 Instruction *I; 116 MemOp(MemIntrinsic *MI) : I(MI) {} 117 MemOp(CallInst *CI) : I(CI) {} 118 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); } 119 CallInst *asCI() { return cast<CallInst>(I); } 120 MemOp clone() { 121 if (auto MI = asMI()) 122 return MemOp(cast<MemIntrinsic>(MI->clone())); 123 return MemOp(cast<CallInst>(asCI()->clone())); 124 } 125 Value *getLength() { 126 if (auto MI = asMI()) 127 return MI->getLength(); 128 return asCI()->getArgOperand(2); 129 } 130 void setLength(Value *Length) { 131 if (auto MI = asMI()) 132 return MI->setLength(Length); 133 asCI()->setArgOperand(2, Length); 134 } 135 StringRef getFuncName() { 136 if (auto MI = asMI()) 137 return MI->getCalledFunction()->getName(); 138 return asCI()->getCalledFunction()->getName(); 139 } 140 bool isMemmove() { 141 if (auto MI = asMI()) 142 if (MI->getIntrinsicID() == Intrinsic::memmove) 143 return true; 144 return false; 145 } 146 bool isMemcmp(TargetLibraryInfo &TLI) { 147 LibFunc Func; 148 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 149 Func == LibFunc_memcmp) { 150 return true; 151 } 152 return false; 153 } 154 bool isBcmp(TargetLibraryInfo &TLI) { 155 LibFunc Func; 156 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && 157 Func == LibFunc_bcmp) { 158 return true; 159 } 160 return false; 161 } 162 const char *getName(TargetLibraryInfo &TLI) { 163 if (auto MI = asMI()) 164 return getMIName(MI); 165 LibFunc Func; 166 if (TLI.getLibFunc(*asCI(), Func)) { 167 if (Func == LibFunc_memcmp) 168 return "memcmp"; 169 if (Func == LibFunc_bcmp) 170 return "bcmp"; 171 } 172 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); 173 return nullptr; 174 } 175 }; 176 177 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 178 public: 179 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 180 OptimizationRemarkEmitter &ORE, DominatorTree *DT, 181 TargetLibraryInfo &TLI) 182 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { 183 ValueDataArray = 184 std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS); 185 } 186 bool isChanged() const { return Changed; } 187 void perform() { 188 WorkList.clear(); 189 visit(Func); 190 191 for (auto &MO : WorkList) { 192 ++NumOfPGOMemOPAnnotate; 193 if (perform(MO)) { 194 Changed = true; 195 ++NumOfPGOMemOPOpt; 196 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() 197 << "is Transformed.\n"); 198 } 199 } 200 } 201 202 void visitMemIntrinsic(MemIntrinsic &MI) { 203 Value *Length = MI.getLength(); 204 // Not perform on constant length calls. 205 if (isa<ConstantInt>(Length)) 206 return; 207 WorkList.push_back(MemOp(&MI)); 208 } 209 210 void visitCallInst(CallInst &CI) { 211 LibFunc Func; 212 if (TLI.getLibFunc(CI, Func) && 213 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && 214 !isa<ConstantInt>(CI.getArgOperand(2))) { 215 WorkList.push_back(MemOp(&CI)); 216 } 217 } 218 219 private: 220 Function &Func; 221 BlockFrequencyInfo &BFI; 222 OptimizationRemarkEmitter &ORE; 223 DominatorTree *DT; 224 TargetLibraryInfo &TLI; 225 bool Changed; 226 std::vector<MemOp> WorkList; 227 // The space to read the profile annotation. 228 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 229 bool perform(MemOp MO); 230 }; 231 232 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 233 assert(Count <= TotalCount); 234 if (Count < MemOPCountThreshold) 235 return false; 236 if (Count < TotalCount * MemOPPercentThreshold / 100) 237 return false; 238 return true; 239 } 240 241 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 242 uint64_t Denom) { 243 if (!MemOPScaleCount) 244 return Count; 245 bool Overflowed; 246 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 247 return ScaleCount / Denom; 248 } 249 250 bool MemOPSizeOpt::perform(MemOp MO) { 251 assert(MO.I); 252 if (MO.isMemmove()) 253 return false; 254 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) 255 return false; 256 257 uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; 258 uint64_t TotalCount; 259 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, 260 ValueDataArray.get(), NumVals, TotalCount)) 261 return false; 262 263 uint64_t ActualCount = TotalCount; 264 uint64_t SavedTotalCount = TotalCount; 265 if (MemOPScaleCount) { 266 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); 267 if (!BBEdgeCount) 268 return false; 269 ActualCount = *BBEdgeCount; 270 } 271 272 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 273 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 274 << ActualCount << "\n"); 275 LLVM_DEBUG( 276 for (auto &VD 277 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 278 279 if (ActualCount < MemOPCountThreshold) 280 return false; 281 // Skip if the total value profiled count is 0, in which case we can't 282 // scale up the counts properly (and there is no profitable transformation). 283 if (TotalCount == 0) 284 return false; 285 286 TotalCount = ActualCount; 287 if (MemOPScaleCount) 288 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 289 << " denominator = " << SavedTotalCount << "\n"); 290 291 // Keeping track of the count of the default case: 292 uint64_t RemainCount = TotalCount; 293 uint64_t SavedRemainCount = SavedTotalCount; 294 SmallVector<uint64_t, 16> SizeIds; 295 SmallVector<uint64_t, 16> CaseCounts; 296 uint64_t MaxCount = 0; 297 unsigned Version = 0; 298 int64_t LastV = -1; 299 // Default case is in the front -- save the slot here. 300 CaseCounts.push_back(0); 301 SmallVector<InstrProfValueData, 24> RemainingVDs; 302 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { 303 auto &VD = *I; 304 int64_t V = VD.Value; 305 uint64_t C = VD.Count; 306 if (MemOPScaleCount) 307 C = getScaledCount(C, ActualCount, SavedTotalCount); 308 309 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { 310 RemainingVDs.push_back(VD); 311 continue; 312 } 313 314 // ValueCounts are sorted on the count. Break at the first un-profitable 315 // value. 316 if (!isProfitable(C, RemainCount)) { 317 RemainingVDs.insert(RemainingVDs.end(), I, E); 318 break; 319 } 320 321 if (V == LastV) { 322 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName() 323 << ": Two consecutive, identical values in MemOp value" 324 "counts.\n"); 325 return false; 326 } 327 328 LastV = V; 329 330 SizeIds.push_back(V); 331 CaseCounts.push_back(C); 332 if (C > MaxCount) 333 MaxCount = C; 334 335 assert(RemainCount >= C); 336 RemainCount -= C; 337 assert(SavedRemainCount >= VD.Count); 338 SavedRemainCount -= VD.Count; 339 340 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { 341 RemainingVDs.insert(RemainingVDs.end(), I + 1, E); 342 break; 343 } 344 } 345 346 if (Version == 0) 347 return false; 348 349 CaseCounts[0] = RemainCount; 350 if (RemainCount > MaxCount) 351 MaxCount = RemainCount; 352 353 uint64_t SumForOpt = TotalCount - RemainCount; 354 355 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 356 << " Versions (covering " << SumForOpt << " out of " 357 << TotalCount << ")\n"); 358 359 // mem_op(..., size) 360 // ==> 361 // switch (size) { 362 // case s1: 363 // mem_op(..., s1); 364 // goto merge_bb; 365 // case s2: 366 // mem_op(..., s2); 367 // goto merge_bb; 368 // ... 369 // default: 370 // mem_op(..., size); 371 // goto merge_bb; 372 // } 373 // merge_bb: 374 375 BasicBlock *BB = MO.I->getParent(); 376 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 377 LLVM_DEBUG(dbgs() << *BB << "\n"); 378 auto OrigBBFreq = BFI.getBlockFreq(BB); 379 380 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); 381 BasicBlock::iterator It(*MO.I); 382 ++It; 383 assert(It != DefaultBB->end()); 384 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 385 MergeBB->setName("MemOP.Merge"); 386 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 387 DefaultBB->setName("MemOP.Default"); 388 389 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 390 auto &Ctx = Func.getContext(); 391 IRBuilder<> IRB(BB); 392 BB->getTerminator()->eraseFromParent(); 393 Value *SizeVar = MO.getLength(); 394 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 395 Type *MemOpTy = MO.I->getType(); 396 PHINode *PHI = nullptr; 397 if (!MemOpTy->isVoidTy()) { 398 // Insert a phi for the return values at the merge block. 399 IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); 400 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); 401 MO.I->replaceAllUsesWith(PHI); 402 PHI->addIncoming(MO.I, DefaultBB); 403 } 404 405 // Clear the value profile data. 406 MO.I->setMetadata(LLVMContext::MD_prof, nullptr); 407 // If all promoted, we don't need the MD.prof metadata. 408 if (SavedRemainCount > 0 || Version != NumVals) { 409 // Otherwise we need update with the un-promoted records back. 410 ArrayRef<InstrProfValueData> RemVDs(RemainingVDs); 411 annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount, 412 IPVK_MemOPSize, NumVals); 413 } 414 415 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 416 417 std::vector<DominatorTree::UpdateType> Updates; 418 if (DT) 419 Updates.reserve(2 * SizeIds.size()); 420 421 for (uint64_t SizeId : SizeIds) { 422 BasicBlock *CaseBB = BasicBlock::Create( 423 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 424 MemOp NewMO = MO.clone(); 425 // Fix the argument. 426 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType()); 427 assert(SizeType && "Expected integer type size argument."); 428 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 429 NewMO.setLength(CaseSizeId); 430 CaseBB->getInstList().push_back(NewMO.I); 431 IRBuilder<> IRBCase(CaseBB); 432 IRBCase.CreateBr(MergeBB); 433 SI->addCase(CaseSizeId, CaseBB); 434 if (!MemOpTy->isVoidTy()) 435 PHI->addIncoming(NewMO.I, CaseBB); 436 if (DT) { 437 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 438 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 439 } 440 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 441 } 442 DTU.applyUpdates(Updates); 443 Updates.clear(); 444 445 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 446 447 LLVM_DEBUG(dbgs() << *BB << "\n"); 448 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 449 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 450 451 ORE.emit([&]() { 452 using namespace ore; 453 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) 454 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " 455 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) 456 << " for " << NV("Versions", Version) << " versions"; 457 }); 458 459 return true; 460 } 461 } // namespace 462 463 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 464 OptimizationRemarkEmitter &ORE, 465 DominatorTree *DT, TargetLibraryInfo &TLI) { 466 if (DisableMemOPOPT) 467 return false; 468 469 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 470 return false; 471 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); 472 MemOPSizeOpt.perform(); 473 return MemOPSizeOpt.isChanged(); 474 } 475 476 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 477 FunctionAnalysisManager &FAM) { 478 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 479 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 480 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 481 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); 482 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); 483 if (!Changed) 484 return PreservedAnalyses::all(); 485 auto PA = PreservedAnalyses(); 486 PA.preserve<DominatorTreeAnalysis>(); 487 return PA; 488 } 489