1 //===-- IndirectCallPromotion.cpp - Optimizations based on value profiling ===//
2 //
3 //                      The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the transformation that promotes indirect calls to
11 // conditional direct calls when the indirect-call value profile metadata is
12 // available.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/ADT/Twine.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/GlobalsModRef.h"
22 #include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
23 #include "llvm/Analysis/IndirectCallSiteVisitor.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CallSite.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/InstrTypes.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/LLVMContext.h"
34 #include "llvm/IR/MDBuilder.h"
35 #include "llvm/IR/PassManager.h"
36 #include "llvm/IR/Type.h"
37 #include "llvm/Pass.h"
38 #include "llvm/PassRegistry.h"
39 #include "llvm/PassSupport.h"
40 #include "llvm/ProfileData/InstrProf.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/MathExtras.h"
46 #include "llvm/Transforms/Instrumentation.h"
47 #include "llvm/Transforms/PGOInstrumentation.h"
48 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
49 #include <cassert>
50 #include <cstdint>
51 #include <vector>
52 
53 using namespace llvm;
54 
55 #define DEBUG_TYPE "pgo-icall-prom"
56 
57 STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
58 STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
61 
62 // Command line option to disable indirect-call promotion with the default as
63 // false. This is for debug purpose.
64 static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
65                                 cl::desc("Disable indirect call promotion"));
66 
67 // Set the cutoff value for the promotion. If the value is other than 0, we
68 // stop the transformation once the total number of promotions equals the cutoff
69 // value.
70 // For debug use only.
71 static cl::opt<unsigned>
72     ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
73               cl::desc("Max number of promotions for this compilation"));
74 
75 // If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped.
76 // For debug use only.
77 static cl::opt<unsigned>
78     ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
79               cl::desc("Skip Callsite up to this number for this compilation"));
80 
81 // Set if the pass is called in LTO optimization. The difference for LTO mode
82 // is the pass won't prefix the source module name to the internal linkage
83 // symbols.
84 static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
85                                 cl::desc("Run indirect-call promotion in LTO "
86                                          "mode"));
87 
88 // Set if the pass is called in SamplePGO mode. The difference for SamplePGO
89 // mode is it will add prof metadatato the created direct call.
90 static cl::opt<bool>
91     ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
92                      cl::desc("Run indirect-call promotion in SamplePGO mode"));
93 
94 // If the option is set to true, only call instructions will be considered for
95 // transformation -- invoke instructions will be ignored.
96 static cl::opt<bool>
97     ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
98                 cl::desc("Run indirect-call promotion for call instructions "
99                          "only"));
100 
101 // If the option is set to true, only invoke instructions will be considered for
102 // transformation -- call instructions will be ignored.
103 static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
104                                    cl::Hidden,
105                                    cl::desc("Run indirect-call promotion for "
106                                             "invoke instruction only"));
107 
108 // Dump the function level IR if the transformation happened in this
109 // function. For debug use only.
110 static cl::opt<bool>
111     ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
112                  cl::desc("Dump IR after transformation happens"));
113 
114 // The minimum call count to optimize memory intrinsic calls.
115 static cl::opt<unsigned>
116     MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
117                         cl::init(1000),
118                         cl::desc("The minimum count to optimize memory "
119                                  "intrinsic calls"));
120 
121 // Command line option to disable memory intrinsic optimization. The default is
122 // false. This is for debug purpose.
123 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
124                                      cl::Hidden, cl::desc("Disable optimize"));
125 
126 // The percent threshold to optimize memory intrinsic calls.
127 static cl::opt<unsigned>
128     MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
129                           cl::Hidden, cl::ZeroOrMore,
130                           cl::desc("The percentage threshold for the "
131                                    "memory intrinsic calls optimization"));
132 
133 // Maximum number of versions for optimizing memory intrinsic call.
134 static cl::opt<unsigned>
135     MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
136                     cl::ZeroOrMore,
137                     cl::desc("The max version for the optimized memory "
138                              " intrinsic calls"));
139 
140 // Scale the counts from the annotation using the BB count value.
141 static cl::opt<bool>
142     MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
143                     cl::desc("Scale the memop size counts using the basic "
144                              " block count value"));
145 
146 // This option sets the rangge of precise profile memop sizes.
147 extern cl::opt<std::string> MemOPSizeRange;
148 
149 // This option sets the value that groups large memop sizes
150 extern cl::opt<unsigned> MemOPSizeLarge;
151 
152 namespace {
153 class PGOIndirectCallPromotionLegacyPass : public ModulePass {
154 public:
155   static char ID;
156 
157   PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
158       : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
159     initializePGOIndirectCallPromotionLegacyPassPass(
160         *PassRegistry::getPassRegistry());
161   }
162 
163   StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
164 
165 private:
166   bool runOnModule(Module &M) override;
167 
168   // If this pass is called in LTO. We need to special handling the PGOFuncName
169   // for the static variables due to LTO's internalization.
170   bool InLTO;
171 
172   // If this pass is called in SamplePGO. We need to add the prof metadata to
173   // the promoted direct call.
174   bool SamplePGO;
175 };
176 
177 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
178 public:
179   static char ID;
180 
181   PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
182     initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
183   }
184 
185   StringRef getPassName() const override { return "PGOMemOPSize"; }
186 
187 private:
188   bool runOnFunction(Function &F) override;
189   void getAnalysisUsage(AnalysisUsage &AU) const override {
190     AU.addRequired<BlockFrequencyInfoWrapperPass>();
191     AU.addPreserved<GlobalsAAWrapperPass>();
192   }
193 };
194 } // end anonymous namespace
195 
196 char PGOIndirectCallPromotionLegacyPass::ID = 0;
197 INITIALIZE_PASS(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
198                 "Use PGO instrumentation profile to promote indirect calls to "
199                 "direct calls.",
200                 false, false)
201 
202 ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
203                                                            bool SamplePGO) {
204   return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
205 }
206 
207 char PGOMemOPSizeOptLegacyPass::ID = 0;
208 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
209                       "Optimize memory intrinsic using its size value profile",
210                       false, false)
211 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
212 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
213                     "Optimize memory intrinsic using its size value profile",
214                     false, false)
215 
216 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
217   return new PGOMemOPSizeOptLegacyPass();
218 }
219 
220 namespace {
221 // The class for main data structure to promote indirect calls to conditional
222 // direct calls.
223 class ICallPromotionFunc {
224 private:
225   Function &F;
226   Module *M;
227 
228   // Symtab that maps indirect call profile values to function names and
229   // defines.
230   InstrProfSymtab *Symtab;
231 
232   bool SamplePGO;
233 
234   // Test if we can legally promote this direct-call of Target.
235   bool isPromotionLegal(Instruction *Inst, uint64_t Target, Function *&F,
236                         const char **Reason = nullptr);
237 
238   // A struct that records the direct target and it's call count.
239   struct PromotionCandidate {
240     Function *TargetFunction;
241     uint64_t Count;
242     PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
243   };
244 
245   // Check if the indirect-call call site should be promoted. Return the number
246   // of promotions. Inst is the candidate indirect call, ValueDataRef
247   // contains the array of value profile data for profiled targets,
248   // TotalCount is the total profiled count of call executions, and
249   // NumCandidates is the number of candidate entries in ValueDataRef.
250   std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
251       Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
252       uint64_t TotalCount, uint32_t NumCandidates);
253 
254   // Promote a list of targets for one indirect-call callsite. Return
255   // the number of promotions.
256   uint32_t tryToPromote(Instruction *Inst,
257                         const std::vector<PromotionCandidate> &Candidates,
258                         uint64_t &TotalCount);
259 
260   // Noncopyable
261   ICallPromotionFunc(const ICallPromotionFunc &other) = delete;
262   ICallPromotionFunc &operator=(const ICallPromotionFunc &other) = delete;
263 
264 public:
265   ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
266                      bool SamplePGO)
267       : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO) {}
268 
269   bool processFunction();
270 };
271 } // end anonymous namespace
272 
273 bool llvm::isLegalToPromote(Instruction *Inst, Function *F,
274                             const char **Reason) {
275   // Check the return type.
276   Type *CallRetType = Inst->getType();
277   if (!CallRetType->isVoidTy()) {
278     Type *FuncRetType = F->getReturnType();
279     if (FuncRetType != CallRetType &&
280         !CastInst::isBitCastable(FuncRetType, CallRetType)) {
281       if (Reason)
282         *Reason = "Return type mismatch";
283       return false;
284     }
285   }
286 
287   // Check if the arguments are compatible with the parameters
288   FunctionType *DirectCalleeType = F->getFunctionType();
289   unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
290   CallSite CS(Inst);
291   unsigned ArgNum = CS.arg_size();
292 
293   if (ParamNum != ArgNum && !DirectCalleeType->isVarArg()) {
294     if (Reason)
295       *Reason = "The number of arguments mismatch";
296     return false;
297   }
298 
299   for (unsigned I = 0; I < ParamNum; ++I) {
300     Type *PTy = DirectCalleeType->getFunctionParamType(I);
301     Type *ATy = CS.getArgument(I)->getType();
302     if (PTy == ATy)
303       continue;
304     if (!CastInst::castIsValid(Instruction::BitCast, CS.getArgument(I), PTy)) {
305       if (Reason)
306         *Reason = "Argument type mismatch";
307       return false;
308     }
309   }
310 
311   DEBUG(dbgs() << " #" << NumOfPGOICallPromotion << " Promote the icall to "
312                << F->getName() << "\n");
313   return true;
314 }
315 
316 bool ICallPromotionFunc::isPromotionLegal(Instruction *Inst, uint64_t Target,
317                                           Function *&TargetFunction,
318                                           const char **Reason) {
319   TargetFunction = Symtab->getFunction(Target);
320   if (TargetFunction == nullptr) {
321     *Reason = "Cannot find the target";
322     return false;
323   }
324   return isLegalToPromote(Inst, TargetFunction, Reason);
325 }
326 
327 // Indirect-call promotion heuristic. The direct targets are sorted based on
328 // the count. Stop at the first target that is not promoted.
329 std::vector<ICallPromotionFunc::PromotionCandidate>
330 ICallPromotionFunc::getPromotionCandidatesForCallSite(
331     Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
332     uint64_t TotalCount, uint32_t NumCandidates) {
333   std::vector<PromotionCandidate> Ret;
334 
335   DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
336                << " Num_targets: " << ValueDataRef.size()
337                << " Num_candidates: " << NumCandidates << "\n");
338   NumOfPGOICallsites++;
339   if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
340     DEBUG(dbgs() << " Skip: User options.\n");
341     return Ret;
342   }
343 
344   for (uint32_t I = 0; I < NumCandidates; I++) {
345     uint64_t Count = ValueDataRef[I].Count;
346     assert(Count <= TotalCount);
347     uint64_t Target = ValueDataRef[I].Value;
348     DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
349                  << "  Target_func: " << Target << "\n");
350 
351     if (ICPInvokeOnly && dyn_cast<CallInst>(Inst)) {
352       DEBUG(dbgs() << " Not promote: User options.\n");
353       break;
354     }
355     if (ICPCallOnly && dyn_cast<InvokeInst>(Inst)) {
356       DEBUG(dbgs() << " Not promote: User option.\n");
357       break;
358     }
359     if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
360       DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
361       break;
362     }
363     Function *TargetFunction = nullptr;
364     const char *Reason = nullptr;
365     if (!isPromotionLegal(Inst, Target, TargetFunction, &Reason)) {
366       StringRef TargetFuncName = Symtab->getFuncName(Target);
367       DEBUG(dbgs() << " Not promote: " << Reason << "\n");
368       emitOptimizationRemarkMissed(
369           F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
370           Twine("Cannot promote indirect call to ") +
371               (TargetFuncName.empty() ? Twine(Target) : Twine(TargetFuncName)) +
372               Twine(" with count of ") + Twine(Count) + ": " + Reason);
373       break;
374     }
375     Ret.push_back(PromotionCandidate(TargetFunction, Count));
376     TotalCount -= Count;
377   }
378   return Ret;
379 }
380 
381 // Create a diamond structure for If_Then_Else. Also update the profile
382 // count. Do the fix-up for the invoke instruction.
383 static void createIfThenElse(Instruction *Inst, Function *DirectCallee,
384                              uint64_t Count, uint64_t TotalCount,
385                              BasicBlock **DirectCallBB,
386                              BasicBlock **IndirectCallBB,
387                              BasicBlock **MergeBB) {
388   CallSite CS(Inst);
389   Value *OrigCallee = CS.getCalledValue();
390 
391   IRBuilder<> BBBuilder(Inst);
392   LLVMContext &Ctx = Inst->getContext();
393   Value *BCI1 =
394       BBBuilder.CreateBitCast(OrigCallee, Type::getInt8PtrTy(Ctx), "");
395   Value *BCI2 =
396       BBBuilder.CreateBitCast(DirectCallee, Type::getInt8PtrTy(Ctx), "");
397   Value *PtrCmp = BBBuilder.CreateICmpEQ(BCI1, BCI2, "");
398 
399   uint64_t ElseCount = TotalCount - Count;
400   uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
401   uint64_t Scale = calculateCountScale(MaxCount);
402   MDBuilder MDB(Inst->getContext());
403   MDNode *BranchWeights = MDB.createBranchWeights(
404       scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
405   TerminatorInst *ThenTerm, *ElseTerm;
406   SplitBlockAndInsertIfThenElse(PtrCmp, Inst, &ThenTerm, &ElseTerm,
407                                 BranchWeights);
408   *DirectCallBB = ThenTerm->getParent();
409   (*DirectCallBB)->setName("if.true.direct_targ");
410   *IndirectCallBB = ElseTerm->getParent();
411   (*IndirectCallBB)->setName("if.false.orig_indirect");
412   *MergeBB = Inst->getParent();
413   (*MergeBB)->setName("if.end.icp");
414 
415   // Special handing of Invoke instructions.
416   InvokeInst *II = dyn_cast<InvokeInst>(Inst);
417   if (!II)
418     return;
419 
420   // We don't need branch instructions for invoke.
421   ThenTerm->eraseFromParent();
422   ElseTerm->eraseFromParent();
423 
424   // Add jump from Merge BB to the NormalDest. This is needed for the newly
425   // created direct invoke stmt -- as its NormalDst will be fixed up to MergeBB.
426   BranchInst::Create(II->getNormalDest(), *MergeBB);
427 }
428 
429 // Find the PHI in BB that have the CallResult as the operand.
430 static bool getCallRetPHINode(BasicBlock *BB, Instruction *Inst) {
431   BasicBlock *From = Inst->getParent();
432   for (auto &I : *BB) {
433     PHINode *PHI = dyn_cast<PHINode>(&I);
434     if (!PHI)
435       continue;
436     int IX = PHI->getBasicBlockIndex(From);
437     if (IX == -1)
438       continue;
439     Value *V = PHI->getIncomingValue(IX);
440     if (dyn_cast<Instruction>(V) == Inst)
441       return true;
442   }
443   return false;
444 }
445 
446 // This method fixes up PHI nodes in BB where BB is the UnwindDest of an
447 // invoke instruction. In BB, there may be PHIs with incoming block being
448 // OrigBB (the MergeBB after if-then-else splitting). After moving the invoke
449 // instructions to its own BB, OrigBB is no longer the predecessor block of BB.
450 // Instead two new predecessors are added: IndirectCallBB and DirectCallBB,
451 // so the PHI node's incoming BBs need to be fixed up accordingly.
452 static void fixupPHINodeForUnwind(Instruction *Inst, BasicBlock *BB,
453                                   BasicBlock *OrigBB,
454                                   BasicBlock *IndirectCallBB,
455                                   BasicBlock *DirectCallBB) {
456   for (auto &I : *BB) {
457     PHINode *PHI = dyn_cast<PHINode>(&I);
458     if (!PHI)
459       continue;
460     int IX = PHI->getBasicBlockIndex(OrigBB);
461     if (IX == -1)
462       continue;
463     Value *V = PHI->getIncomingValue(IX);
464     PHI->addIncoming(V, IndirectCallBB);
465     PHI->setIncomingBlock(IX, DirectCallBB);
466   }
467 }
468 
469 // This method fixes up PHI nodes in BB where BB is the NormalDest of an
470 // invoke instruction. In BB, there may be PHIs with incoming block being
471 // OrigBB (the MergeBB after if-then-else splitting). After moving the invoke
472 // instructions to its own BB, a new incoming edge will be added to the original
473 // NormalDstBB from the IndirectCallBB.
474 static void fixupPHINodeForNormalDest(Instruction *Inst, BasicBlock *BB,
475                                       BasicBlock *OrigBB,
476                                       BasicBlock *IndirectCallBB,
477                                       Instruction *NewInst) {
478   for (auto &I : *BB) {
479     PHINode *PHI = dyn_cast<PHINode>(&I);
480     if (!PHI)
481       continue;
482     int IX = PHI->getBasicBlockIndex(OrigBB);
483     if (IX == -1)
484       continue;
485     Value *V = PHI->getIncomingValue(IX);
486     if (dyn_cast<Instruction>(V) == Inst) {
487       PHI->setIncomingBlock(IX, IndirectCallBB);
488       PHI->addIncoming(NewInst, OrigBB);
489       continue;
490     }
491     PHI->addIncoming(V, IndirectCallBB);
492   }
493 }
494 
495 // Add a bitcast instruction to the direct-call return value if needed.
496 static Instruction *insertCallRetCast(const Instruction *Inst,
497                                       Instruction *DirectCallInst,
498                                       Function *DirectCallee) {
499   if (Inst->getType()->isVoidTy())
500     return DirectCallInst;
501 
502   Type *CallRetType = Inst->getType();
503   Type *FuncRetType = DirectCallee->getReturnType();
504   if (FuncRetType == CallRetType)
505     return DirectCallInst;
506 
507   BasicBlock *InsertionBB;
508   if (CallInst *CI = dyn_cast<CallInst>(DirectCallInst))
509     InsertionBB = CI->getParent();
510   else
511     InsertionBB = (dyn_cast<InvokeInst>(DirectCallInst))->getNormalDest();
512 
513   return (new BitCastInst(DirectCallInst, CallRetType, "",
514                           InsertionBB->getTerminator()));
515 }
516 
517 // Create a DirectCall instruction in the DirectCallBB.
518 // Parameter Inst is the indirect-call (invoke) instruction.
519 // DirectCallee is the decl of the direct-call (invoke) target.
520 // DirecallBB is the BB that the direct-call (invoke) instruction is inserted.
521 // MergeBB is the bottom BB of the if-then-else-diamond after the
522 // transformation. For invoke instruction, the edges from DirectCallBB and
523 // IndirectCallBB to MergeBB are removed before this call (during
524 // createIfThenElse).
525 static Instruction *createDirectCallInst(const Instruction *Inst,
526                                          Function *DirectCallee,
527                                          BasicBlock *DirectCallBB,
528                                          BasicBlock *MergeBB) {
529   Instruction *NewInst = Inst->clone();
530   if (CallInst *CI = dyn_cast<CallInst>(NewInst)) {
531     CI->setCalledFunction(DirectCallee);
532     CI->mutateFunctionType(DirectCallee->getFunctionType());
533   } else {
534     // Must be an invoke instruction. Direct invoke's normal destination is
535     // fixed up to MergeBB. MergeBB is the place where return cast is inserted.
536     // Also since IndirectCallBB does not have an edge to MergeBB, there is no
537     // need to insert new PHIs into MergeBB.
538     InvokeInst *II = dyn_cast<InvokeInst>(NewInst);
539     assert(II);
540     II->setCalledFunction(DirectCallee);
541     II->mutateFunctionType(DirectCallee->getFunctionType());
542     II->setNormalDest(MergeBB);
543   }
544 
545   DirectCallBB->getInstList().insert(DirectCallBB->getFirstInsertionPt(),
546                                      NewInst);
547 
548   // Clear the value profile data.
549   NewInst->setMetadata(LLVMContext::MD_prof, nullptr);
550   CallSite NewCS(NewInst);
551   FunctionType *DirectCalleeType = DirectCallee->getFunctionType();
552   unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
553   for (unsigned I = 0; I < ParamNum; ++I) {
554     Type *ATy = NewCS.getArgument(I)->getType();
555     Type *PTy = DirectCalleeType->getParamType(I);
556     if (ATy != PTy) {
557       BitCastInst *BI = new BitCastInst(NewCS.getArgument(I), PTy, "", NewInst);
558       NewCS.setArgument(I, BI);
559     }
560   }
561 
562   return insertCallRetCast(Inst, NewInst, DirectCallee);
563 }
564 
565 // Create a PHI to unify the return values of calls.
566 static void insertCallRetPHI(Instruction *Inst, Instruction *CallResult,
567                              Function *DirectCallee) {
568   if (Inst->getType()->isVoidTy())
569     return;
570 
571   BasicBlock *RetValBB = CallResult->getParent();
572 
573   BasicBlock *PHIBB;
574   if (InvokeInst *II = dyn_cast<InvokeInst>(CallResult))
575     RetValBB = II->getNormalDest();
576 
577   PHIBB = RetValBB->getSingleSuccessor();
578   if (getCallRetPHINode(PHIBB, Inst))
579     return;
580 
581   PHINode *CallRetPHI = PHINode::Create(Inst->getType(), 0);
582   PHIBB->getInstList().push_front(CallRetPHI);
583   Inst->replaceAllUsesWith(CallRetPHI);
584   CallRetPHI->addIncoming(Inst, Inst->getParent());
585   CallRetPHI->addIncoming(CallResult, RetValBB);
586 }
587 
588 // This function does the actual indirect-call promotion transformation:
589 // For an indirect-call like:
590 //     Ret = (*Foo)(Args);
591 // It transforms to:
592 //     if (Foo == DirectCallee)
593 //        Ret1 = DirectCallee(Args);
594 //     else
595 //        Ret2 = (*Foo)(Args);
596 //     Ret = phi(Ret1, Ret2);
597 // It adds type casts for the args do not match the parameters and the return
598 // value. Branch weights metadata also updated.
599 // If \p AttachProfToDirectCall is true, a prof metadata is attached to the
600 // new direct call to contain \p Count. This is used by SamplePGO inliner to
601 // check callsite hotness.
602 // Returns the promoted direct call instruction.
603 Instruction *llvm::promoteIndirectCall(Instruction *Inst,
604                                        Function *DirectCallee, uint64_t Count,
605                                        uint64_t TotalCount,
606                                        bool AttachProfToDirectCall) {
607   assert(DirectCallee != nullptr);
608   BasicBlock *BB = Inst->getParent();
609   // Just to suppress the non-debug build warning.
610   (void)BB;
611   DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
612   DEBUG(dbgs() << *BB << "\n");
613 
614   BasicBlock *DirectCallBB, *IndirectCallBB, *MergeBB;
615   createIfThenElse(Inst, DirectCallee, Count, TotalCount, &DirectCallBB,
616                    &IndirectCallBB, &MergeBB);
617 
618   Instruction *NewInst =
619       createDirectCallInst(Inst, DirectCallee, DirectCallBB, MergeBB);
620 
621   if (AttachProfToDirectCall) {
622     SmallVector<uint32_t, 1> Weights;
623     Weights.push_back(Count);
624     MDBuilder MDB(NewInst->getContext());
625     dyn_cast<Instruction>(NewInst->stripPointerCasts())
626         ->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
627   }
628 
629   // Move Inst from MergeBB to IndirectCallBB.
630   Inst->removeFromParent();
631   IndirectCallBB->getInstList().insert(IndirectCallBB->getFirstInsertionPt(),
632                                        Inst);
633 
634   if (InvokeInst *II = dyn_cast<InvokeInst>(Inst)) {
635     // At this point, the original indirect invoke instruction has the original
636     // UnwindDest and NormalDest. For the direct invoke instruction, the
637     // NormalDest points to MergeBB, and MergeBB jumps to the original
638     // NormalDest. MergeBB might have a new bitcast instruction for the return
639     // value. The PHIs are with the original NormalDest. Since we now have two
640     // incoming edges to NormalDest and UnwindDest, we have to do some fixups.
641     //
642     // UnwindDest will not use the return value. So pass nullptr here.
643     fixupPHINodeForUnwind(Inst, II->getUnwindDest(), MergeBB, IndirectCallBB,
644                           DirectCallBB);
645     // We don't need to update the operand from NormalDest for DirectCallBB.
646     // Pass nullptr here.
647     fixupPHINodeForNormalDest(Inst, II->getNormalDest(), MergeBB,
648                               IndirectCallBB, NewInst);
649   }
650 
651   insertCallRetPHI(Inst, NewInst, DirectCallee);
652 
653   DEBUG(dbgs() << "\n== Basic Blocks After ==\n");
654   DEBUG(dbgs() << *BB << *DirectCallBB << *IndirectCallBB << *MergeBB << "\n");
655 
656   emitOptimizationRemark(
657       BB->getContext(), "pgo-icall-prom", *BB->getParent(), Inst->getDebugLoc(),
658       Twine("Promote indirect call to ") + DirectCallee->getName() +
659           " with count " + Twine(Count) + " out of " + Twine(TotalCount));
660   return NewInst;
661 }
662 
663 // Promote indirect-call to conditional direct-call for one callsite.
664 uint32_t ICallPromotionFunc::tryToPromote(
665     Instruction *Inst, const std::vector<PromotionCandidate> &Candidates,
666     uint64_t &TotalCount) {
667   uint32_t NumPromoted = 0;
668 
669   for (auto &C : Candidates) {
670     uint64_t Count = C.Count;
671     promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount, SamplePGO);
672     assert(TotalCount >= Count);
673     TotalCount -= Count;
674     NumOfPGOICallPromotion++;
675     NumPromoted++;
676   }
677   return NumPromoted;
678 }
679 
680 // Traverse all the indirect-call callsite and get the value profile
681 // annotation to perform indirect-call promotion.
682 bool ICallPromotionFunc::processFunction() {
683   bool Changed = false;
684   ICallPromotionAnalysis ICallAnalysis;
685   for (auto &I : findIndirectCallSites(F)) {
686     uint32_t NumVals, NumCandidates;
687     uint64_t TotalCount;
688     auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
689         I, NumVals, TotalCount, NumCandidates);
690     if (!NumCandidates)
691       continue;
692     auto PromotionCandidates = getPromotionCandidatesForCallSite(
693         I, ICallProfDataRef, TotalCount, NumCandidates);
694     uint32_t NumPromoted = tryToPromote(I, PromotionCandidates, TotalCount);
695     if (NumPromoted == 0)
696       continue;
697 
698     Changed = true;
699     // Adjust the MD.prof metadata. First delete the old one.
700     I->setMetadata(LLVMContext::MD_prof, nullptr);
701     // If all promoted, we don't need the MD.prof metadata.
702     if (TotalCount == 0 || NumPromoted == NumVals)
703       continue;
704     // Otherwise we need update with the un-promoted records back.
705     annotateValueSite(*M, *I, ICallProfDataRef.slice(NumPromoted), TotalCount,
706                       IPVK_IndirectCallTarget, NumCandidates);
707   }
708   return Changed;
709 }
710 
711 // A wrapper function that does the actual work.
712 static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) {
713   if (DisableICP)
714     return false;
715   InstrProfSymtab Symtab;
716   Symtab.create(M, InLTO);
717   bool Changed = false;
718   for (auto &F : M) {
719     if (F.isDeclaration())
720       continue;
721     if (F.hasFnAttribute(Attribute::OptimizeNone))
722       continue;
723     ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO);
724     bool FuncChanged = ICallPromotion.processFunction();
725     if (ICPDUMPAFTER && FuncChanged) {
726       DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
727       DEBUG(dbgs() << "\n");
728     }
729     Changed |= FuncChanged;
730     if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
731       DEBUG(dbgs() << " Stop: Cutoff reached.\n");
732       break;
733     }
734   }
735   return Changed;
736 }
737 
738 bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
739   // Command-line option has the priority for InLTO.
740   return promoteIndirectCalls(M, InLTO | ICPLTOMode,
741                               SamplePGO | ICPSamplePGOMode);
742 }
743 
744 PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
745                                                 ModuleAnalysisManager &AM) {
746   if (!promoteIndirectCalls(M, InLTO | ICPLTOMode,
747                             SamplePGO | ICPSamplePGOMode))
748     return PreservedAnalyses::all();
749 
750   return PreservedAnalyses::none();
751 }
752 
753 namespace {
754 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
755 public:
756   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
757       : Func(Func), BFI(BFI), Changed(false) {
758     ValueDataArray =
759         llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
760     // Get the MemOPSize range information from option MemOPSizeRange,
761     getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
762                                 PreciseRangeLast);
763   }
764   bool isChanged() const { return Changed; }
765   void perform() {
766     WorkList.clear();
767     visit(Func);
768 
769     for (auto &MI : WorkList) {
770       ++NumOfPGOMemOPAnnotate;
771       if (perform(MI)) {
772         Changed = true;
773         ++NumOfPGOMemOPOpt;
774         DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
775                      << "is Transformed.\n");
776       }
777     }
778   }
779 
780   void visitMemIntrinsic(MemIntrinsic &MI) {
781     Value *Length = MI.getLength();
782     // Not perform on constant length calls.
783     if (dyn_cast<ConstantInt>(Length))
784       return;
785     WorkList.push_back(&MI);
786   }
787 
788 private:
789   Function &Func;
790   BlockFrequencyInfo &BFI;
791   bool Changed;
792   std::vector<MemIntrinsic *> WorkList;
793   // Start of the previse range.
794   int64_t PreciseRangeStart;
795   // Last value of the previse range.
796   int64_t PreciseRangeLast;
797   // The space to read the profile annotation.
798   std::unique_ptr<InstrProfValueData[]> ValueDataArray;
799   bool perform(MemIntrinsic *MI);
800 
801   // This kind shows which group the value falls in. For PreciseValue, we have
802   // the profile count for that value. LargeGroup groups the values that are in
803   // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
804   enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
805 
806   MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
807     if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
808       return LargeGroup;
809     if (Value == PreciseRangeLast + 1)
810       return NonLargeGroup;
811     return PreciseValue;
812   }
813 };
814 
815 static const char *getMIName(const MemIntrinsic *MI) {
816   switch (MI->getIntrinsicID()) {
817   case Intrinsic::memcpy:
818     return "memcpy";
819   case Intrinsic::memmove:
820     return "memmove";
821   case Intrinsic::memset:
822     return "memset";
823   default:
824     return "unknown";
825   }
826 }
827 
828 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
829   assert(Count <= TotalCount);
830   if (Count < MemOPCountThreshold)
831     return false;
832   if (Count < TotalCount * MemOPPercentThreshold / 100)
833     return false;
834   return true;
835 }
836 
837 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
838                                       uint64_t Denom) {
839   if (!MemOPScaleCount)
840     return Count;
841   bool Overflowed;
842   uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
843   return ScaleCount / Denom;
844 }
845 
846 bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
847   assert(MI);
848   if (MI->getIntrinsicID() == Intrinsic::memmove)
849     return false;
850 
851   uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
852   uint64_t TotalCount;
853   if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
854                                 ValueDataArray.get(), NumVals, TotalCount))
855     return false;
856 
857   uint64_t ActualCount = TotalCount;
858   uint64_t SavedTotalCount = TotalCount;
859   if (MemOPScaleCount) {
860     auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
861     if (!BBEdgeCount)
862       return false;
863     ActualCount = *BBEdgeCount;
864   }
865 
866   ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
867   DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
868                << "\n");
869   DEBUG(
870       for (auto &VD
871            : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
872 
873   if (ActualCount < MemOPCountThreshold)
874     return false;
875   // Skip if the total value profiled count is 0, in which case we can't
876   // scale up the counts properly (and there is no profitable transformation).
877   if (TotalCount == 0)
878     return false;
879 
880   TotalCount = ActualCount;
881   if (MemOPScaleCount)
882     DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
883                  << " denominator = " << SavedTotalCount << "\n");
884 
885   // Keeping track of the count of the default case:
886   uint64_t RemainCount = TotalCount;
887   SmallVector<uint64_t, 16> SizeIds;
888   SmallVector<uint64_t, 16> CaseCounts;
889   uint64_t MaxCount = 0;
890   unsigned Version = 0;
891   // Default case is in the front -- save the slot here.
892   CaseCounts.push_back(0);
893   for (auto &VD : VDs) {
894     int64_t V = VD.Value;
895     uint64_t C = VD.Count;
896     if (MemOPScaleCount)
897       C = getScaledCount(C, ActualCount, SavedTotalCount);
898 
899     // Only care precise value here.
900     if (getMemOPSizeKind(V) != PreciseValue)
901       continue;
902 
903     // ValueCounts are sorted on the count. Break at the first un-profitable
904     // value.
905     if (!isProfitable(C, RemainCount))
906       break;
907 
908     SizeIds.push_back(V);
909     CaseCounts.push_back(C);
910     if (C > MaxCount)
911       MaxCount = C;
912 
913     assert(RemainCount >= C);
914     RemainCount -= C;
915 
916     if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
917       break;
918   }
919 
920   if (Version == 0)
921     return false;
922 
923   CaseCounts[0] = RemainCount;
924   if (RemainCount > MaxCount)
925     MaxCount = RemainCount;
926 
927   uint64_t SumForOpt = TotalCount - RemainCount;
928 
929   DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
930                << " Versions (covering " << SumForOpt << " out of "
931                << TotalCount << ")\n");
932 
933   // mem_op(..., size)
934   // ==>
935   // switch (size) {
936   //   case s1:
937   //      mem_op(..., s1);
938   //      goto merge_bb;
939   //   case s2:
940   //      mem_op(..., s2);
941   //      goto merge_bb;
942   //   ...
943   //   default:
944   //      mem_op(..., size);
945   //      goto merge_bb;
946   // }
947   // merge_bb:
948 
949   BasicBlock *BB = MI->getParent();
950   DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
951   DEBUG(dbgs() << *BB << "\n");
952   auto OrigBBFreq = BFI.getBlockFreq(BB);
953 
954   BasicBlock *DefaultBB = SplitBlock(BB, MI);
955   BasicBlock::iterator It(*MI);
956   ++It;
957   assert(It != DefaultBB->end());
958   BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
959   MergeBB->setName("MemOP.Merge");
960   BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
961   DefaultBB->setName("MemOP.Default");
962 
963   auto &Ctx = Func.getContext();
964   IRBuilder<> IRB(BB);
965   BB->getTerminator()->eraseFromParent();
966   Value *SizeVar = MI->getLength();
967   SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
968 
969   // Clear the value profile data.
970   MI->setMetadata(LLVMContext::MD_prof, nullptr);
971 
972   DEBUG(dbgs() << "\n\n== Basic Block After==\n");
973 
974   for (uint64_t SizeId : SizeIds) {
975     ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId);
976     BasicBlock *CaseBB = BasicBlock::Create(
977         Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
978     Instruction *NewInst = MI->clone();
979     // Fix the argument.
980     dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
981     CaseBB->getInstList().push_back(NewInst);
982     IRBuilder<> IRBCase(CaseBB);
983     IRBCase.CreateBr(MergeBB);
984     SI->addCase(CaseSizeId, CaseBB);
985     DEBUG(dbgs() << *CaseBB << "\n");
986   }
987   setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
988 
989   DEBUG(dbgs() << *BB << "\n");
990   DEBUG(dbgs() << *DefaultBB << "\n");
991   DEBUG(dbgs() << *MergeBB << "\n");
992 
993   emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
994                          MI->getDebugLoc(),
995                          Twine("optimize ") + getMIName(MI) + " with count " +
996                              Twine(SumForOpt) + " out of " + Twine(TotalCount) +
997                              " for " + Twine(Version) + " versions");
998 
999   return true;
1000 }
1001 } // namespace
1002 
1003 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
1004   if (DisableMemOPOPT)
1005     return false;
1006 
1007   if (F.hasFnAttribute(Attribute::OptimizeForSize))
1008     return false;
1009   MemOPSizeOpt MemOPSizeOpt(F, BFI);
1010   MemOPSizeOpt.perform();
1011   return MemOPSizeOpt.isChanged();
1012 }
1013 
1014 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
1015   BlockFrequencyInfo &BFI =
1016       getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1017   return PGOMemOPSizeOptImpl(F, BFI);
1018 }
1019 
1020 namespace llvm {
1021 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
1022 
1023 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
1024                                        FunctionAnalysisManager &FAM) {
1025   auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
1026   bool Changed = PGOMemOPSizeOptImpl(F, BFI);
1027   if (!Changed)
1028     return PreservedAnalyses::all();
1029   auto  PA = PreservedAnalyses();
1030   PA.preserve<GlobalsAA>();
1031   return PA;
1032 }
1033 } // namespace llvm
1034