1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the transformation that optimizes memory intrinsics 11 // such as memcpy using the size value profile. When memory intrinsic size 12 // value profile metadata is available, a single memory intrinsic is expanded 13 // to a sequence of guarded specialized versions that are called with the 14 // hottest size(s), for later expansion into more optimal inline sequences. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/ADT/Twine.h" 22 #include "llvm/Analysis/BlockFrequencyInfo.h" 23 #include "llvm/Analysis/GlobalsModRef.h" 24 #include "llvm/IR/BasicBlock.h" 25 #include "llvm/IR/CallSite.h" 26 #include "llvm/IR/DerivedTypes.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/InstrTypes.h" 31 #include "llvm/IR/Instruction.h" 32 #include "llvm/IR/Instructions.h" 33 #include "llvm/IR/InstVisitor.h" 34 #include "llvm/IR/LLVMContext.h" 35 #include "llvm/IR/PassManager.h" 36 #include "llvm/IR/Type.h" 37 #include "llvm/Pass.h" 38 #include "llvm/PassRegistry.h" 39 #include "llvm/PassSupport.h" 40 #include "llvm/ProfileData/InstrProf.h" 41 #include "llvm/Support/Casting.h" 42 #include "llvm/Support/CommandLine.h" 43 #include "llvm/Support/Debug.h" 44 #include "llvm/Support/ErrorHandling.h" 45 #include "llvm/Support/MathExtras.h" 46 #include "llvm/Transforms/Instrumentation.h" 47 #include "llvm/Transforms/PGOInstrumentation.h" 48 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 49 #include <cassert> 50 #include <cstdint> 51 #include <vector> 52 53 using namespace llvm; 54 55 #define DEBUG_TYPE "pgo-memop-opt" 56 57 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 58 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 59 60 // The minimum call count to optimize memory intrinsic calls. 61 static cl::opt<unsigned> 62 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 63 cl::init(1000), 64 cl::desc("The minimum count to optimize memory " 65 "intrinsic calls")); 66 67 // Command line option to disable memory intrinsic optimization. The default is 68 // false. This is for debug purpose. 69 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 70 cl::Hidden, cl::desc("Disable optimize")); 71 72 // The percent threshold to optimize memory intrinsic calls. 73 static cl::opt<unsigned> 74 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 75 cl::Hidden, cl::ZeroOrMore, 76 cl::desc("The percentage threshold for the " 77 "memory intrinsic calls optimization")); 78 79 // Maximum number of versions for optimizing memory intrinsic call. 80 static cl::opt<unsigned> 81 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 82 cl::ZeroOrMore, 83 cl::desc("The max version for the optimized memory " 84 " intrinsic calls")); 85 86 // Scale the counts from the annotation using the BB count value. 87 static cl::opt<bool> 88 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 89 cl::desc("Scale the memop size counts using the basic " 90 " block count value")); 91 92 // This option sets the rangge of precise profile memop sizes. 93 extern cl::opt<std::string> MemOPSizeRange; 94 95 // This option sets the value that groups large memop sizes 96 extern cl::opt<unsigned> MemOPSizeLarge; 97 98 namespace { 99 class PGOMemOPSizeOptLegacyPass : public FunctionPass { 100 public: 101 static char ID; 102 103 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 104 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 105 } 106 107 StringRef getPassName() const override { return "PGOMemOPSize"; } 108 109 private: 110 bool runOnFunction(Function &F) override; 111 void getAnalysisUsage(AnalysisUsage &AU) const override { 112 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 113 AU.addPreserved<GlobalsAAWrapperPass>(); 114 } 115 }; 116 } // end anonymous namespace 117 118 char PGOMemOPSizeOptLegacyPass::ID = 0; 119 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 120 "Optimize memory intrinsic using its size value profile", 121 false, false) 122 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 123 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 124 "Optimize memory intrinsic using its size value profile", 125 false, false) 126 127 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 128 return new PGOMemOPSizeOptLegacyPass(); 129 } 130 131 namespace { 132 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 133 public: 134 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI) 135 : Func(Func), BFI(BFI), Changed(false) { 136 ValueDataArray = 137 llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); 138 // Get the MemOPSize range information from option MemOPSizeRange, 139 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, 140 PreciseRangeLast); 141 } 142 bool isChanged() const { return Changed; } 143 void perform() { 144 WorkList.clear(); 145 visit(Func); 146 147 for (auto &MI : WorkList) { 148 ++NumOfPGOMemOPAnnotate; 149 if (perform(MI)) { 150 Changed = true; 151 ++NumOfPGOMemOPOpt; 152 DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName() 153 << "is Transformed.\n"); 154 } 155 } 156 } 157 158 void visitMemIntrinsic(MemIntrinsic &MI) { 159 Value *Length = MI.getLength(); 160 // Not perform on constant length calls. 161 if (dyn_cast<ConstantInt>(Length)) 162 return; 163 WorkList.push_back(&MI); 164 } 165 166 private: 167 Function &Func; 168 BlockFrequencyInfo &BFI; 169 bool Changed; 170 std::vector<MemIntrinsic *> WorkList; 171 // Start of the previse range. 172 int64_t PreciseRangeStart; 173 // Last value of the previse range. 174 int64_t PreciseRangeLast; 175 // The space to read the profile annotation. 176 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 177 bool perform(MemIntrinsic *MI); 178 179 // This kind shows which group the value falls in. For PreciseValue, we have 180 // the profile count for that value. LargeGroup groups the values that are in 181 // range [LargeValue, +inf). NonLargeGroup groups the rest of values. 182 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; 183 184 MemOPSizeKind getMemOPSizeKind(int64_t Value) const { 185 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) 186 return LargeGroup; 187 if (Value == PreciseRangeLast + 1) 188 return NonLargeGroup; 189 return PreciseValue; 190 } 191 }; 192 193 static const char *getMIName(const MemIntrinsic *MI) { 194 switch (MI->getIntrinsicID()) { 195 case Intrinsic::memcpy: 196 return "memcpy"; 197 case Intrinsic::memmove: 198 return "memmove"; 199 case Intrinsic::memset: 200 return "memset"; 201 default: 202 return "unknown"; 203 } 204 } 205 206 static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 207 assert(Count <= TotalCount); 208 if (Count < MemOPCountThreshold) 209 return false; 210 if (Count < TotalCount * MemOPPercentThreshold / 100) 211 return false; 212 return true; 213 } 214 215 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 216 uint64_t Denom) { 217 if (!MemOPScaleCount) 218 return Count; 219 bool Overflowed; 220 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 221 return ScaleCount / Denom; 222 } 223 224 bool MemOPSizeOpt::perform(MemIntrinsic *MI) { 225 assert(MI); 226 if (MI->getIntrinsicID() == Intrinsic::memmove) 227 return false; 228 229 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; 230 uint64_t TotalCount; 231 if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, 232 ValueDataArray.get(), NumVals, TotalCount)) 233 return false; 234 235 uint64_t ActualCount = TotalCount; 236 uint64_t SavedTotalCount = TotalCount; 237 if (MemOPScaleCount) { 238 auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); 239 if (!BBEdgeCount) 240 return false; 241 ActualCount = *BBEdgeCount; 242 } 243 244 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 245 DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount 246 << "\n"); 247 DEBUG( 248 for (auto &VD 249 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 250 251 if (ActualCount < MemOPCountThreshold) 252 return false; 253 // Skip if the total value profiled count is 0, in which case we can't 254 // scale up the counts properly (and there is no profitable transformation). 255 if (TotalCount == 0) 256 return false; 257 258 TotalCount = ActualCount; 259 if (MemOPScaleCount) 260 DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 261 << " denominator = " << SavedTotalCount << "\n"); 262 263 // Keeping track of the count of the default case: 264 uint64_t RemainCount = TotalCount; 265 uint64_t SavedRemainCount = SavedTotalCount; 266 SmallVector<uint64_t, 16> SizeIds; 267 SmallVector<uint64_t, 16> CaseCounts; 268 uint64_t MaxCount = 0; 269 unsigned Version = 0; 270 // Default case is in the front -- save the slot here. 271 CaseCounts.push_back(0); 272 for (auto &VD : VDs) { 273 int64_t V = VD.Value; 274 uint64_t C = VD.Count; 275 if (MemOPScaleCount) 276 C = getScaledCount(C, ActualCount, SavedTotalCount); 277 278 // Only care precise value here. 279 if (getMemOPSizeKind(V) != PreciseValue) 280 continue; 281 282 // ValueCounts are sorted on the count. Break at the first un-profitable 283 // value. 284 if (!isProfitable(C, RemainCount)) 285 break; 286 287 SizeIds.push_back(V); 288 CaseCounts.push_back(C); 289 if (C > MaxCount) 290 MaxCount = C; 291 292 assert(RemainCount >= C); 293 RemainCount -= C; 294 assert(SavedRemainCount >= VD.Count); 295 SavedRemainCount -= VD.Count; 296 297 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) 298 break; 299 } 300 301 if (Version == 0) 302 return false; 303 304 CaseCounts[0] = RemainCount; 305 if (RemainCount > MaxCount) 306 MaxCount = RemainCount; 307 308 uint64_t SumForOpt = TotalCount - RemainCount; 309 310 DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 311 << " Versions (covering " << SumForOpt << " out of " 312 << TotalCount << ")\n"); 313 314 // mem_op(..., size) 315 // ==> 316 // switch (size) { 317 // case s1: 318 // mem_op(..., s1); 319 // goto merge_bb; 320 // case s2: 321 // mem_op(..., s2); 322 // goto merge_bb; 323 // ... 324 // default: 325 // mem_op(..., size); 326 // goto merge_bb; 327 // } 328 // merge_bb: 329 330 BasicBlock *BB = MI->getParent(); 331 DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 332 DEBUG(dbgs() << *BB << "\n"); 333 auto OrigBBFreq = BFI.getBlockFreq(BB); 334 335 BasicBlock *DefaultBB = SplitBlock(BB, MI); 336 BasicBlock::iterator It(*MI); 337 ++It; 338 assert(It != DefaultBB->end()); 339 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It)); 340 MergeBB->setName("MemOP.Merge"); 341 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 342 DefaultBB->setName("MemOP.Default"); 343 344 auto &Ctx = Func.getContext(); 345 IRBuilder<> IRB(BB); 346 BB->getTerminator()->eraseFromParent(); 347 Value *SizeVar = MI->getLength(); 348 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 349 350 // Clear the value profile data. 351 MI->setMetadata(LLVMContext::MD_prof, nullptr); 352 // If all promoted, we don't need the MD.prof metadata. 353 if (SavedRemainCount > 0 || Version != NumVals) 354 // Otherwise we need update with the un-promoted records back. 355 annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), 356 SavedRemainCount, IPVK_MemOPSize, NumVals); 357 358 DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 359 360 for (uint64_t SizeId : SizeIds) { 361 ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId); 362 BasicBlock *CaseBB = BasicBlock::Create( 363 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 364 Instruction *NewInst = MI->clone(); 365 // Fix the argument. 366 dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId); 367 CaseBB->getInstList().push_back(NewInst); 368 IRBuilder<> IRBCase(CaseBB); 369 IRBCase.CreateBr(MergeBB); 370 SI->addCase(CaseSizeId, CaseBB); 371 DEBUG(dbgs() << *CaseBB << "\n"); 372 } 373 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 374 375 DEBUG(dbgs() << *BB << "\n"); 376 DEBUG(dbgs() << *DefaultBB << "\n"); 377 DEBUG(dbgs() << *MergeBB << "\n"); 378 379 emitOptimizationRemark(Func.getContext(), "memop-opt", Func, 380 MI->getDebugLoc(), 381 Twine("optimize ") + getMIName(MI) + " with count " + 382 Twine(SumForOpt) + " out of " + Twine(TotalCount) + 383 " for " + Twine(Version) + " versions"); 384 385 return true; 386 } 387 } // namespace 388 389 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) { 390 if (DisableMemOPOPT) 391 return false; 392 393 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 394 return false; 395 MemOPSizeOpt MemOPSizeOpt(F, BFI); 396 MemOPSizeOpt.perform(); 397 return MemOPSizeOpt.isChanged(); 398 } 399 400 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 401 BlockFrequencyInfo &BFI = 402 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 403 return PGOMemOPSizeOptImpl(F, BFI); 404 } 405 406 namespace llvm { 407 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 408 409 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 410 FunctionAnalysisManager &FAM) { 411 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 412 bool Changed = PGOMemOPSizeOptImpl(F, BFI); 413 if (!Changed) 414 return PreservedAnalyses::all(); 415 auto PA = PreservedAnalyses(); 416 PA.preserve<GlobalsAA>(); 417 return PA; 418 } 419 } // namespace llvm 420