1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
14 //
15 //===----------------------------------------------------------------------===//
16
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/TargetLibraryInfo.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Dominators.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/InstVisitor.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/LLVMContext.h"
34 #include "llvm/IR/PassManager.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/ProfileData/InstrProf.h"
37 #define INSTR_PROF_VALUE_PROF_MEMOP_API
38 #include "llvm/ProfileData/InstrProfData.inc"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/MathExtras.h"
44 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
45 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
46 #include <cassert>
47 #include <cstdint>
48 #include <vector>
49
50 using namespace llvm;
51
52 #define DEBUG_TYPE "pgo-memop-opt"
53
54 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
55 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
56
57 // The minimum call count to optimize memory intrinsic calls.
58 static cl::opt<unsigned>
59 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000),
60 cl::desc("The minimum count to optimize memory "
61 "intrinsic calls"));
62
63 // Command line option to disable memory intrinsic optimization. The default is
64 // false. This is for debug purpose.
65 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
66 cl::Hidden, cl::desc("Disable optimize"));
67
68 // The percent threshold to optimize memory intrinsic calls.
69 static cl::opt<unsigned>
70 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
71 cl::Hidden,
72 cl::desc("The percentage threshold for the "
73 "memory intrinsic calls optimization"));
74
75 // Maximum number of versions for optimizing memory intrinsic call.
76 static cl::opt<unsigned>
77 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
78 cl::desc("The max version for the optimized memory "
79 " intrinsic calls"));
80
81 // Scale the counts from the annotation using the BB count value.
82 static cl::opt<bool>
83 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
84 cl::desc("Scale the memop size counts using the basic "
85 " block count value"));
86
87 cl::opt<bool>
88 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
89 cl::Hidden,
90 cl::desc("Size-specialize memcmp and bcmp calls"));
91
92 static cl::opt<unsigned>
93 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
94 cl::desc("Optimize the memop size <= this value"));
95
96 namespace {
97
getMIName(const MemIntrinsic * MI)98 static const char *getMIName(const MemIntrinsic *MI) {
99 switch (MI->getIntrinsicID()) {
100 case Intrinsic::memcpy:
101 return "memcpy";
102 case Intrinsic::memmove:
103 return "memmove";
104 case Intrinsic::memset:
105 return "memset";
106 default:
107 return "unknown";
108 }
109 }
110
111 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
112 struct MemOp {
113 Instruction *I;
MemOp__anon419b87b30111::MemOp114 MemOp(MemIntrinsic *MI) : I(MI) {}
MemOp__anon419b87b30111::MemOp115 MemOp(CallInst *CI) : I(CI) {}
asMI__anon419b87b30111::MemOp116 MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
asCI__anon419b87b30111::MemOp117 CallInst *asCI() { return cast<CallInst>(I); }
clone__anon419b87b30111::MemOp118 MemOp clone() {
119 if (auto MI = asMI())
120 return MemOp(cast<MemIntrinsic>(MI->clone()));
121 return MemOp(cast<CallInst>(asCI()->clone()));
122 }
getLength__anon419b87b30111::MemOp123 Value *getLength() {
124 if (auto MI = asMI())
125 return MI->getLength();
126 return asCI()->getArgOperand(2);
127 }
setLength__anon419b87b30111::MemOp128 void setLength(Value *Length) {
129 if (auto MI = asMI())
130 return MI->setLength(Length);
131 asCI()->setArgOperand(2, Length);
132 }
getFuncName__anon419b87b30111::MemOp133 StringRef getFuncName() {
134 if (auto MI = asMI())
135 return MI->getCalledFunction()->getName();
136 return asCI()->getCalledFunction()->getName();
137 }
isMemmove__anon419b87b30111::MemOp138 bool isMemmove() {
139 if (auto MI = asMI())
140 if (MI->getIntrinsicID() == Intrinsic::memmove)
141 return true;
142 return false;
143 }
isMemcmp__anon419b87b30111::MemOp144 bool isMemcmp(TargetLibraryInfo &TLI) {
145 LibFunc Func;
146 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
147 Func == LibFunc_memcmp) {
148 return true;
149 }
150 return false;
151 }
isBcmp__anon419b87b30111::MemOp152 bool isBcmp(TargetLibraryInfo &TLI) {
153 LibFunc Func;
154 if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
155 Func == LibFunc_bcmp) {
156 return true;
157 }
158 return false;
159 }
getName__anon419b87b30111::MemOp160 const char *getName(TargetLibraryInfo &TLI) {
161 if (auto MI = asMI())
162 return getMIName(MI);
163 LibFunc Func;
164 if (TLI.getLibFunc(*asCI(), Func)) {
165 if (Func == LibFunc_memcmp)
166 return "memcmp";
167 if (Func == LibFunc_bcmp)
168 return "bcmp";
169 }
170 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
171 return nullptr;
172 }
173 };
174
175 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
176 public:
MemOPSizeOpt(Function & Func,BlockFrequencyInfo & BFI,OptimizationRemarkEmitter & ORE,DominatorTree * DT,TargetLibraryInfo & TLI)177 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
178 OptimizationRemarkEmitter &ORE, DominatorTree *DT,
179 TargetLibraryInfo &TLI)
180 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
181 ValueDataArray =
182 std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
183 }
isChanged() const184 bool isChanged() const { return Changed; }
perform()185 void perform() {
186 WorkList.clear();
187 visit(Func);
188
189 for (auto &MO : WorkList) {
190 ++NumOfPGOMemOPAnnotate;
191 if (perform(MO)) {
192 Changed = true;
193 ++NumOfPGOMemOPOpt;
194 LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
195 << "is Transformed.\n");
196 }
197 }
198 }
199
visitMemIntrinsic(MemIntrinsic & MI)200 void visitMemIntrinsic(MemIntrinsic &MI) {
201 Value *Length = MI.getLength();
202 // Not perform on constant length calls.
203 if (isa<ConstantInt>(Length))
204 return;
205 WorkList.push_back(MemOp(&MI));
206 }
207
visitCallInst(CallInst & CI)208 void visitCallInst(CallInst &CI) {
209 LibFunc Func;
210 if (TLI.getLibFunc(CI, Func) &&
211 (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
212 !isa<ConstantInt>(CI.getArgOperand(2))) {
213 WorkList.push_back(MemOp(&CI));
214 }
215 }
216
217 private:
218 Function &Func;
219 BlockFrequencyInfo &BFI;
220 OptimizationRemarkEmitter &ORE;
221 DominatorTree *DT;
222 TargetLibraryInfo &TLI;
223 bool Changed;
224 std::vector<MemOp> WorkList;
225 // The space to read the profile annotation.
226 std::unique_ptr<InstrProfValueData[]> ValueDataArray;
227 bool perform(MemOp MO);
228 };
229
isProfitable(uint64_t Count,uint64_t TotalCount)230 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
231 assert(Count <= TotalCount);
232 if (Count < MemOPCountThreshold)
233 return false;
234 if (Count < TotalCount * MemOPPercentThreshold / 100)
235 return false;
236 return true;
237 }
238
getScaledCount(uint64_t Count,uint64_t Num,uint64_t Denom)239 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
240 uint64_t Denom) {
241 if (!MemOPScaleCount)
242 return Count;
243 bool Overflowed;
244 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
245 return ScaleCount / Denom;
246 }
247
perform(MemOp MO)248 bool MemOPSizeOpt::perform(MemOp MO) {
249 assert(MO.I);
250 if (MO.isMemmove())
251 return false;
252 if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
253 return false;
254
255 uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
256 uint64_t TotalCount;
257 if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
258 ValueDataArray.get(), NumVals, TotalCount))
259 return false;
260
261 uint64_t ActualCount = TotalCount;
262 uint64_t SavedTotalCount = TotalCount;
263 if (MemOPScaleCount) {
264 auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
265 if (!BBEdgeCount)
266 return false;
267 ActualCount = *BBEdgeCount;
268 }
269
270 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
271 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
272 << ActualCount << "\n");
273 LLVM_DEBUG(
274 for (auto &VD
275 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
276
277 if (ActualCount < MemOPCountThreshold)
278 return false;
279 // Skip if the total value profiled count is 0, in which case we can't
280 // scale up the counts properly (and there is no profitable transformation).
281 if (TotalCount == 0)
282 return false;
283
284 TotalCount = ActualCount;
285 if (MemOPScaleCount)
286 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
287 << " denominator = " << SavedTotalCount << "\n");
288
289 // Keeping track of the count of the default case:
290 uint64_t RemainCount = TotalCount;
291 uint64_t SavedRemainCount = SavedTotalCount;
292 SmallVector<uint64_t, 16> SizeIds;
293 SmallVector<uint64_t, 16> CaseCounts;
294 uint64_t MaxCount = 0;
295 unsigned Version = 0;
296 int64_t LastV = -1;
297 // Default case is in the front -- save the slot here.
298 CaseCounts.push_back(0);
299 SmallVector<InstrProfValueData, 24> RemainingVDs;
300 for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
301 auto &VD = *I;
302 int64_t V = VD.Value;
303 uint64_t C = VD.Count;
304 if (MemOPScaleCount)
305 C = getScaledCount(C, ActualCount, SavedTotalCount);
306
307 if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
308 RemainingVDs.push_back(VD);
309 continue;
310 }
311
312 // ValueCounts are sorted on the count. Break at the first un-profitable
313 // value.
314 if (!isProfitable(C, RemainCount)) {
315 RemainingVDs.insert(RemainingVDs.end(), I, E);
316 break;
317 }
318
319 if (V == LastV) {
320 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName()
321 << ": Two consecutive, identical values in MemOp value"
322 "counts.\n");
323 return false;
324 }
325
326 LastV = V;
327
328 SizeIds.push_back(V);
329 CaseCounts.push_back(C);
330 if (C > MaxCount)
331 MaxCount = C;
332
333 assert(RemainCount >= C);
334 RemainCount -= C;
335 assert(SavedRemainCount >= VD.Count);
336 SavedRemainCount -= VD.Count;
337
338 if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
339 RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
340 break;
341 }
342 }
343
344 if (Version == 0)
345 return false;
346
347 CaseCounts[0] = RemainCount;
348 if (RemainCount > MaxCount)
349 MaxCount = RemainCount;
350
351 uint64_t SumForOpt = TotalCount - RemainCount;
352
353 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
354 << " Versions (covering " << SumForOpt << " out of "
355 << TotalCount << ")\n");
356
357 // mem_op(..., size)
358 // ==>
359 // switch (size) {
360 // case s1:
361 // mem_op(..., s1);
362 // goto merge_bb;
363 // case s2:
364 // mem_op(..., s2);
365 // goto merge_bb;
366 // ...
367 // default:
368 // mem_op(..., size);
369 // goto merge_bb;
370 // }
371 // merge_bb:
372
373 BasicBlock *BB = MO.I->getParent();
374 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
375 LLVM_DEBUG(dbgs() << *BB << "\n");
376 auto OrigBBFreq = BFI.getBlockFreq(BB);
377
378 BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
379 BasicBlock::iterator It(*MO.I);
380 ++It;
381 assert(It != DefaultBB->end());
382 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
383 MergeBB->setName("MemOP.Merge");
384 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
385 DefaultBB->setName("MemOP.Default");
386
387 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
388 auto &Ctx = Func.getContext();
389 IRBuilder<> IRB(BB);
390 BB->getTerminator()->eraseFromParent();
391 Value *SizeVar = MO.getLength();
392 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
393 Type *MemOpTy = MO.I->getType();
394 PHINode *PHI = nullptr;
395 if (!MemOpTy->isVoidTy()) {
396 // Insert a phi for the return values at the merge block.
397 IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
398 PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
399 MO.I->replaceAllUsesWith(PHI);
400 PHI->addIncoming(MO.I, DefaultBB);
401 }
402
403 // Clear the value profile data.
404 MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
405 // If all promoted, we don't need the MD.prof metadata.
406 if (SavedRemainCount > 0 || Version != NumVals) {
407 // Otherwise we need update with the un-promoted records back.
408 ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
409 annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
410 IPVK_MemOPSize, NumVals);
411 }
412
413 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
414
415 std::vector<DominatorTree::UpdateType> Updates;
416 if (DT)
417 Updates.reserve(2 * SizeIds.size());
418
419 for (uint64_t SizeId : SizeIds) {
420 BasicBlock *CaseBB = BasicBlock::Create(
421 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
422 MemOp NewMO = MO.clone();
423 // Fix the argument.
424 auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
425 assert(SizeType && "Expected integer type size argument.");
426 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
427 NewMO.setLength(CaseSizeId);
428 CaseBB->getInstList().push_back(NewMO.I);
429 IRBuilder<> IRBCase(CaseBB);
430 IRBCase.CreateBr(MergeBB);
431 SI->addCase(CaseSizeId, CaseBB);
432 if (!MemOpTy->isVoidTy())
433 PHI->addIncoming(NewMO.I, CaseBB);
434 if (DT) {
435 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
436 Updates.push_back({DominatorTree::Insert, BB, CaseBB});
437 }
438 LLVM_DEBUG(dbgs() << *CaseBB << "\n");
439 }
440 DTU.applyUpdates(Updates);
441 Updates.clear();
442
443 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
444
445 LLVM_DEBUG(dbgs() << *BB << "\n");
446 LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
447 LLVM_DEBUG(dbgs() << *MergeBB << "\n");
448
449 ORE.emit([&]() {
450 using namespace ore;
451 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
452 << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
453 << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
454 << " for " << NV("Versions", Version) << " versions";
455 });
456
457 return true;
458 }
459 } // namespace
460
PGOMemOPSizeOptImpl(Function & F,BlockFrequencyInfo & BFI,OptimizationRemarkEmitter & ORE,DominatorTree * DT,TargetLibraryInfo & TLI)461 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
462 OptimizationRemarkEmitter &ORE,
463 DominatorTree *DT, TargetLibraryInfo &TLI) {
464 if (DisableMemOPOPT)
465 return false;
466
467 if (F.hasFnAttribute(Attribute::OptimizeForSize))
468 return false;
469 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
470 MemOPSizeOpt.perform();
471 return MemOPSizeOpt.isChanged();
472 }
473
run(Function & F,FunctionAnalysisManager & FAM)474 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
475 FunctionAnalysisManager &FAM) {
476 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
477 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
478 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
479 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
480 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
481 if (!Changed)
482 return PreservedAnalyses::all();
483 auto PA = PreservedAnalyses();
484 PA.preserve<DominatorTreeAnalysis>();
485 return PA;
486 }
487