1 //===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
11 // basic-block-at-a-time approach. It should eventually be removed.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/APInt.h"
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/MapVector.h"
19 #include "llvm/ADT/PointerIntPair.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/SmallPtrSet.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/BlockFrequencyInfo.h"
25 #include "llvm/Analysis/BranchProbabilityInfo.h"
26 #include "llvm/Analysis/ConstantFolding.h"
27 #include "llvm/Analysis/InstructionSimplify.h"
28 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/Analysis/MemoryBuiltins.h"
30 #include "llvm/Analysis/ProfileSummaryInfo.h"
31 #include "llvm/Analysis/TargetLibraryInfo.h"
32 #include "llvm/Analysis/TargetTransformInfo.h"
33 #include "llvm/Analysis/ValueTracking.h"
34 #include "llvm/Analysis/VectorUtils.h"
35 #include "llvm/CodeGen/Analysis.h"
36 #include "llvm/CodeGen/ISDOpcodes.h"
37 #include "llvm/CodeGen/SelectionDAGNodes.h"
38 #include "llvm/CodeGen/TargetLowering.h"
39 #include "llvm/CodeGen/TargetPassConfig.h"
40 #include "llvm/CodeGen/TargetSubtargetInfo.h"
41 #include "llvm/CodeGen/ValueTypes.h"
42 #include "llvm/Config/llvm-config.h"
43 #include "llvm/IR/Argument.h"
44 #include "llvm/IR/Attributes.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/Constants.h"
48 #include "llvm/IR/DataLayout.h"
49 #include "llvm/IR/DerivedTypes.h"
50 #include "llvm/IR/Dominators.h"
51 #include "llvm/IR/Function.h"
52 #include "llvm/IR/GetElementPtrTypeIterator.h"
53 #include "llvm/IR/GlobalValue.h"
54 #include "llvm/IR/GlobalVariable.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/InlineAsm.h"
57 #include "llvm/IR/InstrTypes.h"
58 #include "llvm/IR/Instruction.h"
59 #include "llvm/IR/Instructions.h"
60 #include "llvm/IR/IntrinsicInst.h"
61 #include "llvm/IR/Intrinsics.h"
62 #include "llvm/IR/IntrinsicsAArch64.h"
63 #include "llvm/IR/IntrinsicsX86.h"
64 #include "llvm/IR/LLVMContext.h"
65 #include "llvm/IR/MDBuilder.h"
66 #include "llvm/IR/Module.h"
67 #include "llvm/IR/Operator.h"
68 #include "llvm/IR/PatternMatch.h"
69 #include "llvm/IR/Statepoint.h"
70 #include "llvm/IR/Type.h"
71 #include "llvm/IR/Use.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/IR/ValueHandle.h"
75 #include "llvm/IR/ValueMap.h"
76 #include "llvm/InitializePasses.h"
77 #include "llvm/Pass.h"
78 #include "llvm/Support/BlockFrequency.h"
79 #include "llvm/Support/BranchProbability.h"
80 #include "llvm/Support/Casting.h"
81 #include "llvm/Support/CommandLine.h"
82 #include "llvm/Support/Compiler.h"
83 #include "llvm/Support/Debug.h"
84 #include "llvm/Support/ErrorHandling.h"
85 #include "llvm/Support/MachineValueType.h"
86 #include "llvm/Support/MathExtras.h"
87 #include "llvm/Support/raw_ostream.h"
88 #include "llvm/Target/TargetMachine.h"
89 #include "llvm/Target/TargetOptions.h"
90 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
91 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
92 #include "llvm/Transforms/Utils/Local.h"
93 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
94 #include "llvm/Transforms/Utils/SizeOpts.h"
95 #include <algorithm>
96 #include <cassert>
97 #include <cstdint>
98 #include <iterator>
99 #include <limits>
100 #include <memory>
101 #include <utility>
102 #include <vector>
103 
104 using namespace llvm;
105 using namespace llvm::PatternMatch;
106 
107 #define DEBUG_TYPE "codegenprepare"
108 
109 STATISTIC(NumBlocksElim, "Number of blocks eliminated");
110 STATISTIC(NumPHIsElim,   "Number of trivial PHIs eliminated");
111 STATISTIC(NumGEPsElim,   "Number of GEPs converted to casts");
112 STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
113                       "sunken Cmps");
114 STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
115                        "of sunken Casts");
116 STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
117                           "computations were sunk");
118 STATISTIC(NumMemoryInstsPhiCreated,
119           "Number of phis created when address "
120           "computations were sunk to memory instructions");
STATISTIC(NumMemoryInstsSelectCreated,
          "Number of selects created when address "
          "computations were sunk to memory instructions");
124 STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
125 STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
126 STATISTIC(NumAndsAdded,
127           "Number of and mask instructions added to form ext loads");
128 STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
129 STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
130 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
131 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
132 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
133 
134 static cl::opt<bool> DisableBranchOpts(
135   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
136   cl::desc("Disable branch optimizations in CodeGenPrepare"));
137 
138 static cl::opt<bool>
139     DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
140                   cl::desc("Disable GC optimizations in CodeGenPrepare"));
141 
142 static cl::opt<bool> DisableSelectToBranch(
143   "disable-cgp-select2branch", cl::Hidden, cl::init(false),
144   cl::desc("Disable select to branch conversion."));
145 
146 static cl::opt<bool> AddrSinkUsingGEPs(
147   "addr-sink-using-gep", cl::Hidden, cl::init(true),
148   cl::desc("Address sinking in CGP using GEPs."));
149 
150 static cl::opt<bool> EnableAndCmpSinking(
151    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
   cl::desc("Enable sinking and/cmp into branches."));
153 
154 static cl::opt<bool> DisableStoreExtract(
155     "disable-cgp-store-extract", cl::Hidden, cl::init(false),
156     cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
157 
158 static cl::opt<bool> StressStoreExtract(
159     "stress-cgp-store-extract", cl::Hidden, cl::init(false),
160     cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
161 
162 static cl::opt<bool> DisableExtLdPromotion(
163     "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
164     cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
165              "CodeGenPrepare"));
166 
167 static cl::opt<bool> StressExtLdPromotion(
168     "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
169     cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
170              "optimization in CodeGenPrepare"));
171 
172 static cl::opt<bool> DisablePreheaderProtect(
173     "disable-preheader-prot", cl::Hidden, cl::init(false),
174     cl::desc("Disable protection against removing loop preheaders"));
175 
176 static cl::opt<bool> ProfileGuidedSectionPrefix(
177     "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
178     cl::desc("Use profile info to add section prefix for hot/cold functions"));
179 
180 static cl::opt<bool> ProfileUnknownInSpecialSection(
181     "profile-unknown-in-special-section", cl::Hidden, cl::init(false),
182     cl::ZeroOrMore,
    cl::desc("In a profiling mode like SampleFDO, if a function doesn't have "
             "a profile, we cannot tell for sure that the function is cold, "
             "because it may be a newly added function that has never been "
             "sampled. With this flag enabled, the compiler puts such "
             "profile-unknown functions into a special section, so the "
             "runtime system can choose to handle them differently from "
             "the .text section, for example to save RAM."));
190 
191 static cl::opt<unsigned> FreqRatioToSkipMerge(
192     "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
193     cl::desc("Skip merging empty blocks if (frequency of empty block) / "
194              "(frequency of destination block) is greater than this ratio"));
195 
196 static cl::opt<bool> ForceSplitStore(
197     "force-split-store", cl::Hidden, cl::init(false),
198     cl::desc("Force store splitting no matter what the target query says."));
199 
200 static cl::opt<bool>
201 EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
    cl::desc("Enable merging of redundant sexts when one dominates "
    "the other."), cl::init(true));
204 
205 static cl::opt<bool> DisableComplexAddrModes(
206     "disable-complex-addr-modes", cl::Hidden, cl::init(false),
207     cl::desc("Disables combining addressing modes with different parts "
208              "in optimizeMemoryInst."));
209 
210 static cl::opt<bool>
211 AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
212                 cl::desc("Allow creation of Phis in Address sinking."));
213 
214 static cl::opt<bool>
215 AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true),
216                    cl::desc("Allow creation of selects in Address sinking."));
217 
218 static cl::opt<bool> AddrSinkCombineBaseReg(
219     "addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
220     cl::desc("Allow combining of BaseReg field in Address sinking."));
221 
222 static cl::opt<bool> AddrSinkCombineBaseGV(
223     "addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
224     cl::desc("Allow combining of BaseGV field in Address sinking."));
225 
226 static cl::opt<bool> AddrSinkCombineBaseOffs(
227     "addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
228     cl::desc("Allow combining of BaseOffs field in Address sinking."));
229 
230 static cl::opt<bool> AddrSinkCombineScaledReg(
231     "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
232     cl::desc("Allow combining of ScaledReg field in Address sinking."));
233 
234 static cl::opt<bool>
235     EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
236                          cl::init(true),
237                          cl::desc("Enable splitting large offset of GEP."));
238 
239 static cl::opt<bool> EnableICMP_EQToICMP_ST(
240     "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
241     cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));
242 
243 static cl::opt<bool>
244     VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
245                      cl::desc("Enable BFI update verification for "
246                               "CodeGenPrepare."));
247 
248 static cl::opt<bool> OptimizePhiTypes(
249     "cgp-optimize-phi-types", cl::Hidden, cl::init(false),
250     cl::desc("Enable converting phi types in CodeGenPrepare"));
251 
252 namespace {
253 
254 enum ExtType {
255   ZeroExtension,   // Zero extension has been seen.
256   SignExtension,   // Sign extension has been seen.
257   BothExtension    // This extension type is used if we saw sext after
258                    // ZeroExtension had been set, or if we saw zext after
259                    // SignExtension had been set. It makes the type
260                    // information of a promoted instruction invalid.
261 };
262 
263 using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
264 using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
265 using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
266 using SExts = SmallVector<Instruction *, 16>;
267 using ValueToSExts = DenseMap<Value *, SExts>;
268 
269 class TypePromotionTransaction;
270 
271   class CodeGenPrepare : public FunctionPass {
272     const TargetMachine *TM = nullptr;
273     const TargetSubtargetInfo *SubtargetInfo;
274     const TargetLowering *TLI = nullptr;
275     const TargetRegisterInfo *TRI;
276     const TargetTransformInfo *TTI = nullptr;
277     const TargetLibraryInfo *TLInfo;
278     const LoopInfo *LI;
279     std::unique_ptr<BlockFrequencyInfo> BFI;
280     std::unique_ptr<BranchProbabilityInfo> BPI;
281     ProfileSummaryInfo *PSI;
282 
283     /// As we scan instructions optimizing them, this is the next instruction
284     /// to optimize. Transforms that can invalidate this should update it.
285     BasicBlock::iterator CurInstIterator;
286 
287     /// Keeps track of non-local addresses that have been sunk into a block.
288     /// This allows us to avoid inserting duplicate code for blocks with
289     /// multiple load/stores of the same address. The usage of WeakTrackingVH
290     /// enables SunkAddrs to be treated as a cache whose entries can be
291     /// invalidated if a sunken address computation has been erased.
292     ValueMap<Value*, WeakTrackingVH> SunkAddrs;
293 
294     /// Keeps track of all instructions inserted for the current function.
295     SetOfInstrs InsertedInsts;
296 
    /// Keeps track of the types of instructions before their promotion, for
    /// the current function.
299     InstrToOrigTy PromotedInsts;
300 
301     /// Keep track of instructions removed during promotion.
302     SetOfInstrs RemovedInsts;
303 
304     /// Keep track of sext chains based on their initial value.
305     DenseMap<Value *, Instruction *> SeenChainsForSExt;
306 
307     /// Keep track of GEPs accessing the same data structures such as structs or
308     /// arrays that are candidates to be split later because of their large
309     /// size.
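    /// For illustration only (a hypothetical sketch; offsets are arbitrary),
    /// two GEPs of this shape would be collected under the same base:
    ///   %g1 = getelementptr i8, i8* %base, i64 100000
    ///   %g2 = getelementptr i8, i8* %base, i64 100004
    /// They may later be rewritten to share a new base at %base + 100000,
    /// leaving small residual offsets (0 and 4).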
310     MapVector<
311         AssertingVH<Value>,
312         SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
313         LargeOffsetGEPMap;
314 
315     /// Keep track of new GEP base after splitting the GEPs having large offset.
316     SmallSet<AssertingVH<Value>, 2> NewGEPBases;
317 
    /// Map large-offset GEPs to their serial numbers.
319     DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;
320 
    /// Keep track of promoted SExt instructions.
322     ValueToSExts ValToSExtendedUses;
323 
324     /// True if the function has the OptSize attribute.
325     bool OptSize;
326 
327     /// DataLayout for the Function being processed.
328     const DataLayout *DL = nullptr;
329 
330     /// Building the dominator tree can be expensive, so we only build it
331     /// lazily and update it when required.
332     std::unique_ptr<DominatorTree> DT;
333 
334   public:
335     static char ID; // Pass identification, replacement for typeid
336 
337     CodeGenPrepare() : FunctionPass(ID) {
338       initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
339     }
340 
341     bool runOnFunction(Function &F) override;
342 
343     StringRef getPassName() const override { return "CodeGen Prepare"; }
344 
345     void getAnalysisUsage(AnalysisUsage &AU) const override {
346       // FIXME: When we can selectively preserve passes, preserve the domtree.
347       AU.addRequired<ProfileSummaryInfoWrapperPass>();
348       AU.addRequired<TargetLibraryInfoWrapperPass>();
349       AU.addRequired<TargetPassConfig>();
350       AU.addRequired<TargetTransformInfoWrapperPass>();
351       AU.addRequired<LoopInfoWrapperPass>();
352     }
353 
354   private:
355     template <typename F>
356     void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
357       // Substituting can cause recursive simplifications, which can invalidate
358       // our iterator.  Use a WeakTrackingVH to hold onto it in case this
359       // happens.
360       Value *CurValue = &*CurInstIterator;
361       WeakTrackingVH IterHandle(CurValue);
362 
363       f();
364 
365       // If the iterator instruction was recursively deleted, start over at the
366       // start of the block.
367       if (IterHandle != CurValue) {
368         CurInstIterator = BB->begin();
369         SunkAddrs.clear();
370       }
371     }
372 
373     // Get the DominatorTree, building if necessary.
374     DominatorTree &getDT(Function &F) {
375       if (!DT)
376         DT = std::make_unique<DominatorTree>(F);
377       return *DT;
378     }
379 
380     bool eliminateFallThrough(Function &F);
381     bool eliminateMostlyEmptyBlocks(Function &F);
382     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
383     bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
384     void eliminateMostlyEmptyBlock(BasicBlock *BB);
385     bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
386                                        bool isPreheader);
387     bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
388     bool optimizeInst(Instruction *I, bool &ModifiedDT);
389     bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
390                             Type *AccessTy, unsigned AddrSpace);
391     bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
392     bool optimizeInlineAsmInst(CallInst *CS);
393     bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
394     bool optimizeExt(Instruction *&I);
395     bool optimizeExtUses(Instruction *I);
396     bool optimizeLoadExt(LoadInst *Load);
397     bool optimizeShiftInst(BinaryOperator *BO);
398     bool optimizeFunnelShift(IntrinsicInst *Fsh);
399     bool optimizeSelectInst(SelectInst *SI);
400     bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
401     bool optimizeSwitchInst(SwitchInst *SI);
402     bool optimizeExtractElementInst(Instruction *Inst);
403     bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
404     bool fixupDbgValue(Instruction *I);
405     bool placeDbgValues(Function &F);
406     bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
407                       LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
408     bool tryToPromoteExts(TypePromotionTransaction &TPT,
409                           const SmallVectorImpl<Instruction *> &Exts,
410                           SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
411                           unsigned CreatedInstsCost = 0);
412     bool mergeSExts(Function &F);
413     bool splitLargeGEPOffsets();
414     bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
415                          SmallPtrSetImpl<Instruction *> &DeletedInstrs);
416     bool optimizePhiTypes(Function &F);
417     bool performAddressTypePromotion(
418         Instruction *&Inst,
419         bool AllowPromotionWithoutCommonHeader,
420         bool HasPromoted, TypePromotionTransaction &TPT,
421         SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
422     bool splitBranchCondition(Function &F, bool &ModifiedDT);
423     bool simplifyOffsetableRelocate(GCStatepointInst &I);
424 
425     bool tryToSinkFreeOperands(Instruction *I);
426     bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0,
427                                      Value *Arg1, CmpInst *Cmp,
428                                      Intrinsic::ID IID);
429     bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
430     bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
431     bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
432     void verifyBFIUpdates(Function &F);
433   };
434 
435 } // end anonymous namespace
436 
437 char CodeGenPrepare::ID = 0;
438 
439 INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
440                       "Optimize for code generation", false, false)
441 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
442 INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
443                     "Optimize for code generation", false, false)
444 
445 FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
446 
447 bool CodeGenPrepare::runOnFunction(Function &F) {
448   if (skipFunction(F))
449     return false;
450 
451   DL = &F.getParent()->getDataLayout();
452 
453   bool EverMadeChange = false;
454   // Clear per function information.
455   InsertedInsts.clear();
456   PromotedInsts.clear();
457 
458   TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
459   SubtargetInfo = TM->getSubtargetImpl(F);
460   TLI = SubtargetInfo->getTargetLowering();
461   TRI = SubtargetInfo->getRegisterInfo();
462   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
463   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
464   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
465   BPI.reset(new BranchProbabilityInfo(F, *LI));
466   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
467   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
468   OptSize = F.hasOptSize();
469   if (ProfileGuidedSectionPrefix) {
470     if (PSI->isFunctionHotInCallGraph(&F, *BFI))
471       F.setSectionPrefix(".hot");
472     else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
473       F.setSectionPrefix(".unlikely");
474     else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
475              PSI->isFunctionHotnessUnknown(F))
476       F.setSectionPrefix(".unknown");
477   }
478 
479   /// This optimization identifies DIV instructions that can be
480   /// profitably bypassed and carried out with a shorter, faster divide.
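  // For example (a hypothetical sketch), on a target that reports slow 64-bit
  // division, bypassSlowDivision can rewrite
  //   %q = udiv i64 %a, %b
  // into a runtime check that performs a 32-bit udiv (and zero-extends the
  // result) when both operands fit in 32 bits, falling back to the original
  // 64-bit udiv otherwise.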
481   if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
482     const DenseMap<unsigned int, unsigned int> &BypassWidths =
483         TLI->getBypassSlowDivWidths();
484     BasicBlock* BB = &*F.begin();
485     while (BB != nullptr) {
486       // bypassSlowDivision may create new BBs, but we don't want to reapply the
487       // optimization to those blocks.
488       BasicBlock* Next = BB->getNextNode();
489       // F.hasOptSize is already checked in the outer if statement.
490       if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
491         EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
492       BB = Next;
493     }
494   }
495 
496   // Eliminate blocks that contain only PHI nodes and an
497   // unconditional branch.
498   EverMadeChange |= eliminateMostlyEmptyBlocks(F);
499 
500   bool ModifiedDT = false;
501   if (!DisableBranchOpts)
502     EverMadeChange |= splitBranchCondition(F, ModifiedDT);
503 
504   // Split some critical edges where one of the sources is an indirect branch,
505   // to help generate sane code for PHIs involving such edges.
506   EverMadeChange |= SplitIndirectBrCriticalEdges(F);
507 
508   bool MadeChange = true;
509   while (MadeChange) {
510     MadeChange = false;
511     DT.reset();
512     for (Function::iterator I = F.begin(); I != F.end(); ) {
513       BasicBlock *BB = &*I++;
514       bool ModifiedDTOnIteration = false;
515       MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
516 
517       // Restart BB iteration if the dominator tree of the Function was changed
518       if (ModifiedDTOnIteration)
519         break;
520     }
521     if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
522       MadeChange |= mergeSExts(F);
523     if (!LargeOffsetGEPMap.empty())
524       MadeChange |= splitLargeGEPOffsets();
525     MadeChange |= optimizePhiTypes(F);
526 
527     if (MadeChange)
528       eliminateFallThrough(F);
529 
530     // Really free removed instructions during promotion.
531     for (Instruction *I : RemovedInsts)
532       I->deleteValue();
533 
534     EverMadeChange |= MadeChange;
535     SeenChainsForSExt.clear();
536     ValToSExtendedUses.clear();
537     RemovedInsts.clear();
538     LargeOffsetGEPMap.clear();
539     LargeOffsetGEPID.clear();
540   }
541 
542   SunkAddrs.clear();
543 
544   if (!DisableBranchOpts) {
545     MadeChange = false;
    // Use a set vector to get deterministic iteration order. The order in
    // which blocks are removed may affect whether or not PHI nodes in
    // successors are removed.
549     SmallSetVector<BasicBlock*, 8> WorkList;
550     for (BasicBlock &BB : F) {
551       SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
552       MadeChange |= ConstantFoldTerminator(&BB, true);
553       if (!MadeChange) continue;
554 
555       for (SmallVectorImpl<BasicBlock*>::iterator
556              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
557         if (pred_begin(*II) == pred_end(*II))
558           WorkList.insert(*II);
559     }
560 
561     // Delete the dead blocks and any of their dead successors.
562     MadeChange |= !WorkList.empty();
563     while (!WorkList.empty()) {
564       BasicBlock *BB = WorkList.pop_back_val();
565       SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
566 
567       DeleteDeadBlock(BB);
568 
569       for (SmallVectorImpl<BasicBlock*>::iterator
570              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
571         if (pred_begin(*II) == pred_end(*II))
572           WorkList.insert(*II);
573     }
574 
575     // Merge pairs of basic blocks with unconditional branches, connected by
576     // a single edge.
577     if (EverMadeChange || MadeChange)
578       MadeChange |= eliminateFallThrough(F);
579 
580     EverMadeChange |= MadeChange;
581   }
582 
583   if (!DisableGCOpts) {
584     SmallVector<GCStatepointInst *, 2> Statepoints;
585     for (BasicBlock &BB : F)
586       for (Instruction &I : BB)
587         if (auto *SP = dyn_cast<GCStatepointInst>(&I))
588           Statepoints.push_back(SP);
589     for (auto &I : Statepoints)
590       EverMadeChange |= simplifyOffsetableRelocate(*I);
591   }
592 
593   // Do this last to clean up use-before-def scenarios introduced by other
594   // preparatory transforms.
595   EverMadeChange |= placeDbgValues(F);
596 
597 #ifndef NDEBUG
598   if (VerifyBFIUpdates)
599     verifyBFIUpdates(F);
600 #endif
601 
602   return EverMadeChange;
603 }
604 
// Verify that BFI has been updated correctly by recomputing it and comparing
// against the cached version.
606 void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
607   DominatorTree NewDT(F);
608   LoopInfo NewLI(NewDT);
609   BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
610   BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
611   NewBFI.verifyMatch(*BFI);
612 }
613 
614 /// Merge basic blocks which are connected by a single edge, where one of the
615 /// basic blocks has a single successor pointing to the other basic block,
616 /// which has a single predecessor.
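///
/// For illustration (block names are hypothetical):
///   bb1:
///     ...
///     br label %bb2
///   bb2:                ; preds = %bb1 (only predecessor)
///     ...
/// bb2 is folded into bb1 and the unconditional branch disappears.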
617 bool CodeGenPrepare::eliminateFallThrough(Function &F) {
618   bool Changed = false;
619   // Scan all of the blocks in the function, except for the entry block.
  // Use a temporary array to avoid the iterator being invalidated when
  // deleting blocks.
622   SmallVector<WeakTrackingVH, 16> Blocks;
623   for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
624     Blocks.push_back(&Block);
625 
626   for (auto &Block : Blocks) {
627     auto *BB = cast_or_null<BasicBlock>(Block);
628     if (!BB)
629       continue;
630     // If the destination block has a single pred, then this is a trivial
631     // edge, just collapse it.
632     BasicBlock *SinglePred = BB->getSinglePredecessor();
633 
634     // Don't merge if BB's address is taken.
635     if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
636 
637     BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
638     if (Term && !Term->isConditional()) {
639       Changed = true;
640       LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");
641 
642       // Merge BB into SinglePred and delete it.
643       MergeBlockIntoPredecessor(BB);
644     }
645   }
646   return Changed;
647 }
648 
/// Find a destination block from BB if BB is a mergeable empty block.
650 BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
651   // If this block doesn't end with an uncond branch, ignore it.
652   BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
653   if (!BI || !BI->isUnconditional())
654     return nullptr;
655 
656   // If the instruction before the branch (skipping debug info) isn't a phi
657   // node, then other stuff is happening here.
658   BasicBlock::iterator BBI = BI->getIterator();
659   if (BBI != BB->begin()) {
660     --BBI;
661     while (isa<DbgInfoIntrinsic>(BBI)) {
662       if (BBI == BB->begin())
663         break;
664       --BBI;
665     }
666     if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
667       return nullptr;
668   }
669 
670   // Do not break infinite loops.
671   BasicBlock *DestBB = BI->getSuccessor(0);
672   if (DestBB == BB)
673     return nullptr;
674 
675   if (!canMergeBlocks(BB, DestBB))
676     DestBB = nullptr;
677 
678   return DestBB;
679 }
680 
681 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
682 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
683 /// edges in ways that are non-optimal for isel. Start by eliminating these
684 /// blocks so we can split them the way we want them.
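///
/// A hypothetical sketch of a mostly empty block that gets eliminated:
///   bb:                 ; preds = %a, %b
///     %p = phi i32 [ 0, %a ], [ 1, %b ]
///     br label %dest
/// The incoming values are folded into %dest's PHIs, and %a and %b branch
/// directly to %dest.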
685 bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
686   SmallPtrSet<BasicBlock *, 16> Preheaders;
687   SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
688   while (!LoopList.empty()) {
689     Loop *L = LoopList.pop_back_val();
690     LoopList.insert(LoopList.end(), L->begin(), L->end());
691     if (BasicBlock *Preheader = L->getLoopPreheader())
692       Preheaders.insert(Preheader);
693   }
694 
695   bool MadeChange = false;
696   // Copy blocks into a temporary array to avoid iterator invalidation issues
697   // as we remove them.
698   // Note that this intentionally skips the entry block.
699   SmallVector<WeakTrackingVH, 16> Blocks;
700   for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
701     Blocks.push_back(&Block);
702 
703   for (auto &Block : Blocks) {
704     BasicBlock *BB = cast_or_null<BasicBlock>(Block);
705     if (!BB)
706       continue;
707     BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
708     if (!DestBB ||
709         !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
710       continue;
711 
712     eliminateMostlyEmptyBlock(BB);
713     MadeChange = true;
714   }
715   return MadeChange;
716 }
717 
718 bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
719                                                    BasicBlock *DestBB,
720                                                    bool isPreheader) {
721   // Do not delete loop preheaders if doing so would create a critical edge.
722   // Loop preheaders can be good locations to spill registers. If the
723   // preheader is deleted and we create a critical edge, registers may be
724   // spilled in the loop body instead.
725   if (!DisablePreheaderProtect && isPreheader &&
726       !(BB->getSinglePredecessor() &&
727         BB->getSinglePredecessor()->getSingleSuccessor()))
728     return false;
729 
730   // Skip merging if the block's successor is also a successor to any callbr
731   // that leads to this block.
732   // FIXME: Is this really needed? Is this a correctness issue?
733   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
734     if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
735       for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
736         if (DestBB == CBI->getSuccessor(i))
737           return false;
738   }
739 
  // Try to skip merging if the unique predecessor of BB is terminated by a
  // switch or indirect branch instruction, and BB is used as an incoming block
  // of PHIs in DestBB. In such a case, merging BB and DestBB would cause ISel
  // to add COPY instructions in the predecessor of BB instead of in BB (if it
  // is not merged). Note that the critical edge created by merging such blocks
  // won't be split in MachineSink because the jump table is not analyzable. By
  // keeping such an empty block (BB), ISel will place COPY instructions in BB,
  // not in the predecessor of BB.
748   BasicBlock *Pred = BB->getUniquePredecessor();
749   if (!Pred ||
750       !(isa<SwitchInst>(Pred->getTerminator()) ||
751         isa<IndirectBrInst>(Pred->getTerminator())))
752     return true;
753 
754   if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())
755     return true;
756 
  // We use a simple cost heuristic: skipping the merge is profitable if the
  // cost of skipping it is less than the cost of merging, i.e.
  //   Cost(skipping merging) < Cost(merging BB),
  // where Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch))
  // and Cost(merging BB) is Freq(Pred) * Cost(Copy).
  // Assuming Cost(Copy) == Cost(Branch), this simplifies to:
  //   Freq(Pred) / Freq(BB) > 2.
  // Note that if there are multiple empty blocks sharing the same incoming
  // value for the PHIs in DestBB, we consider them together. In such a case,
  // Cost(merging BB) will be the sum of their frequencies.
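  //
  // A hypothetical numeric example with the default ratio of 2: if
  // Freq(Pred) = 900 and Freq(BB) = 300, then 900 > 300 * 2 and we skip
  // merging; if Freq(BB) were 600 instead, 900 <= 600 * 2 and we merge.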
767 
768   if (!isa<PHINode>(DestBB->begin()))
769     return true;
770 
771   SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
772 
773   // Find all other incoming blocks from which incoming values of all PHIs in
774   // DestBB are the same as the ones from BB.
775   for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
776        ++PI) {
777     BasicBlock *DestBBPred = *PI;
778     if (DestBBPred == BB)
779       continue;
780 
781     if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
782           return DestPN.getIncomingValueForBlock(BB) ==
783                  DestPN.getIncomingValueForBlock(DestBBPred);
784         }))
785       SameIncomingValueBBs.insert(DestBBPred);
786   }
787 
  // See if all of BB's incoming values are the same as the value from Pred.
  // In this case, there is no reason to skip merging because COPYs are
  // expected to be placed in Pred already.
791   if (SameIncomingValueBBs.count(Pred))
792     return true;
793 
794   BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
795   BlockFrequency BBFreq = BFI->getBlockFreq(BB);
796 
797   for (auto *SameValueBB : SameIncomingValueBBs)
798     if (SameValueBB->getUniquePredecessor() == Pred &&
799         DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
800       BBFreq += BFI->getBlockFreq(SameValueBB);
801 
802   return PredFreq.getFrequency() <=
803          BBFreq.getFrequency() * FreqRatioToSkipMerge;
804 }
805 
806 /// Return true if we can merge BB into DestBB if there is a single
807 /// unconditional branch between them, and BB contains no other non-phi
808 /// instructions.
809 bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
810                                     const BasicBlock *DestBB) const {
  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
  // the successor.  If there are more complex conditions (e.g. preheaders),
  // don't mess around with them.
814   for (const PHINode &PN : BB->phis()) {
815     for (const User *U : PN.users()) {
816       const Instruction *UI = cast<Instruction>(U);
817       if (UI->getParent() != DestBB || !isa<PHINode>(UI))
818         return false;
      // If the user is inside DestBB and is a PHINode, then check its
      // incoming values. If an incoming value is not from BB, then this is
      // a complex condition (e.g. preheaders) that we want to avoid here.
822       if (UI->getParent() == DestBB) {
823         if (const PHINode *UPN = dyn_cast<PHINode>(UI))
824           for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
825             Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
826             if (Insn && Insn->getParent() == BB &&
827                 Insn->getParent() != UPN->getIncomingBlock(I))
828               return false;
829           }
830       }
831     }
832   }
833 
834   // If BB and DestBB contain any common predecessors, then the phi nodes in BB
835   // and DestBB may have conflicting incoming values for the block.  If so, we
836   // can't merge the block.
837   const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
838   if (!DestBBPN) return true;  // no conflict.
839 
840   // Collect the preds of BB.
841   SmallPtrSet<const BasicBlock*, 16> BBPreds;
842   if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
843     // It is faster to get preds from a PHI than with pred_iterator.
844     for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
845       BBPreds.insert(BBPN->getIncomingBlock(i));
846   } else {
847     BBPreds.insert(pred_begin(BB), pred_end(BB));
848   }
849 
850   // Walk the preds of DestBB.
851   for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
852     BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
853     if (BBPreds.count(Pred)) {   // Common predecessor?
854       for (const PHINode &PN : DestBB->phis()) {
855         const Value *V1 = PN.getIncomingValueForBlock(Pred);
856         const Value *V2 = PN.getIncomingValueForBlock(BB);
857 
858         // If V2 is a phi node in BB, look up what the mapped value will be.
859         if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
860           if (V2PN->getParent() == BB)
861             V2 = V2PN->getIncomingValueForBlock(Pred);
862 
863         // If there is a conflict, bail out.
864         if (V1 != V2) return false;
865       }
866     }
867   }
868 
869   return true;
870 }
871 
872 /// Eliminate a basic block that has only phi's and an unconditional branch in
873 /// it.
874 void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
875   BranchInst *BI = cast<BranchInst>(BB->getTerminator());
876   BasicBlock *DestBB = BI->getSuccessor(0);
877 
878   LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
879                     << *BB << *DestBB);
880 
881   // If the destination block has a single pred, then this is a trivial edge,
882   // just collapse it.
883   if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
884     if (SinglePred != DestBB) {
885       assert(SinglePred == BB &&
886              "Single predecessor not the same as predecessor");
887       // Merge DestBB into SinglePred/BB and delete it.
888       MergeBlockIntoPredecessor(DestBB);
889       // Note: BB(=SinglePred) will not be deleted on this path.
890       // DestBB(=its single successor) is the one that was deleted.
891       LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
892       return;
893     }
894   }
895 
896   // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
897   // to handle the new incoming edges it is about to have.
898   for (PHINode &PN : DestBB->phis()) {
899     // Remove the incoming value for BB, and remember it.
900     Value *InVal = PN.removeIncomingValue(BB, false);
901 
902     // Two options: either the InVal is a phi node defined in BB or it is some
903     // value that dominates BB.
904     PHINode *InValPhi = dyn_cast<PHINode>(InVal);
905     if (InValPhi && InValPhi->getParent() == BB) {
906       // Add all of the input values of the input PHI as inputs of this phi.
907       for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
908         PN.addIncoming(InValPhi->getIncomingValue(i),
909                        InValPhi->getIncomingBlock(i));
910     } else {
911       // Otherwise, add one instance of the dominating value for each edge that
912       // we will be adding.
913       if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
914         for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
915           PN.addIncoming(InVal, BBPN->getIncomingBlock(i));
916       } else {
917         for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
918           PN.addIncoming(InVal, *PI);
919       }
920     }
921   }
922 
923   // The PHIs are now updated, change everything that refers to BB to use
924   // DestBB and remove BB.
925   BB->replaceAllUsesWith(DestBB);
926   BB->eraseFromParent();
927   ++NumBlocksElim;
928 
929   LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
930 }
931 
932 // Computes a map of base pointer relocation instructions to corresponding
933 // derived pointer relocation instructions given a vector of all relocate calls
934 static void computeBaseDerivedRelocateMap(
935     const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
936     DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
937         &RelocateInstMap) {
938   // Collect information in two maps: one primarily for locating the base object
939   // while filling the second map; the second map is the final structure holding
940   // a mapping between Base and corresponding Derived relocate calls
941   DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
942   for (auto *ThisRelocate : AllRelocateCalls) {
943     auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
944                             ThisRelocate->getDerivedPtrIndex());
945     RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
946   }
947   for (auto &Item : RelocateIdxMap) {
948     std::pair<unsigned, unsigned> Key = Item.first;
949     if (Key.first == Key.second)
950       // Base relocation: nothing to insert
951       continue;
952 
953     GCRelocateInst *I = Item.second;
954     auto BaseKey = std::make_pair(Key.first, Key.first);
955 
956     // We're iterating over RelocateIdxMap so we cannot modify it.
957     auto MaybeBase = RelocateIdxMap.find(BaseKey);
958     if (MaybeBase == RelocateIdxMap.end())
959       // TODO: We might want to insert a new base object relocate and gep off
960       // that, if there are enough derived object relocates.
961       continue;
962 
963     RelocateInstMap[MaybeBase->second].push_back(I);
964   }
965 }
966 
967 // Accepts a GEP and extracts the operands into a vector provided they're all
968 // small integer constants
969 static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
970                                           SmallVectorImpl<Value *> &OffsetV) {
971   for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
972     // Only accept small constant integer operands
973     auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
974     if (!Op || Op->getZExtValue() > 20)
975       return false;
976   }
977 
978   for (unsigned i = 1; i < GEP->getNumOperands(); i++)
979     OffsetV.push_back(GEP->getOperand(i));
980   return true;
981 }
982 
// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
// replace, computes a replacement, and applies it.
985 static bool
986 simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
987                           const SmallVectorImpl<GCRelocateInst *> &Targets) {
988   bool MadeChange = false;
  // We must ensure that the relocation of a derived pointer is defined after
  // the relocation of the base pointer. If we find a relocation corresponding
  // to this base that is defined earlier than the relocation of the base, we
  // move the relocation of the base right before that relocation. We only
  // consider relocations in the same basic block as the relocation of the
  // base; relocations from other basic blocks will be skipped by this
  // optimization, and we do not care about them.
995   for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
996        &*R != RelocatedBase; ++R)
997     if (auto *RI = dyn_cast<GCRelocateInst>(R))
998       if (RI->getStatepoint() == RelocatedBase->getStatepoint())
999         if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
1000           RelocatedBase->moveBefore(RI);
1001           break;
1002         }
1003 
1004   for (GCRelocateInst *ToReplace : Targets) {
1005     assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
1006            "Not relocating a derived object of the original base object");
1007     if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
1008       // A duplicate relocate call. TODO: coalesce duplicates.
1009       continue;
1010     }
1011 
1012     if (RelocatedBase->getParent() != ToReplace->getParent()) {
      // The base and derived relocates are in different basic blocks.
      // In this case the transform is only valid when the base dominates the
      // derived relocate. However, it would be too expensive to check
      // dominance for each such relocate, so we skip the whole transformation.
1017       continue;
1018     }
1019 
1020     Value *Base = ToReplace->getBasePtr();
1021     auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
1022     if (!Derived || Derived->getPointerOperand() != Base)
1023       continue;
1024 
1025     SmallVector<Value *, 2> OffsetV;
1026     if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
1027       continue;
1028 
1029     // Create a Builder and replace the target callsite with a gep
1030     assert(RelocatedBase->getNextNode() &&
1031            "Should always have one since it's not a terminator");
1032 
1033     // Insert after RelocatedBase
1034     IRBuilder<> Builder(RelocatedBase->getNextNode());
1035     Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
1036 
    // If gc_relocate does not match the actual type, cast it to the right
    // type. In theory, there must be a bitcast after gc_relocate if the type
    // does not match, and we should reuse it to get the derived pointer. But
    // there could be cases like this:
1041     // bb1:
1042     //  ...
1043     //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
1044     //  br label %merge
1045     //
1046     // bb2:
1047     //  ...
1048     //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
1049     //  br label %merge
1050     //
1051     // merge:
1052     //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
1054     //
    // In this case, we cannot find the bitcast any more. So we insert a new
    // bitcast whether or not one already exists. This way we handle all cases,
    // and the extra bitcast should be optimized away by later passes.
1058     Value *ActualRelocatedBase = RelocatedBase;
1059     if (RelocatedBase->getType() != Base->getType()) {
1060       ActualRelocatedBase =
1061           Builder.CreateBitCast(RelocatedBase, Base->getType());
1062     }
1063     Value *Replacement = Builder.CreateGEP(
1064         Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
1065     Replacement->takeName(ToReplace);
1066     // If the newly generated derived pointer's type does not match the original derived
1067     // pointer's type, cast the new derived pointer to match it. Same reasoning as above.
1068     Value *ActualReplacement = Replacement;
1069     if (Replacement->getType() != ToReplace->getType()) {
1070       ActualReplacement =
1071           Builder.CreateBitCast(Replacement, ToReplace->getType());
1072     }
1073     ToReplace->replaceAllUsesWith(ActualReplacement);
1074     ToReplace->eraseFromParent();
1075 
1076     MadeChange = true;
1077   }
1078   return MadeChange;
1079 }
1080 
1081 // Turns this:
1082 //
1083 // %base = ...
1084 // %ptr = gep %base + 15
1085 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1086 // %base' = relocate(%tok, i32 4, i32 4)
1087 // %ptr' = relocate(%tok, i32 4, i32 5)
1088 // %val = load %ptr'
1089 //
1090 // into this:
1091 //
1092 // %base = ...
1093 // %ptr = gep %base + 15
1094 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1095 // %base' = gc.relocate(%tok, i32 4, i32 4)
1096 // %ptr' = gep %base' + 15
1097 // %val = load %ptr'
1098 bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
1099   bool MadeChange = false;
1100   SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
1101   for (auto *U : I.users())
1102     if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
1103       // Collect all the relocate calls associated with a statepoint
1104       AllRelocateCalls.push_back(Relocate);
1105 
1106   // We need at least one base pointer relocation + one derived pointer
1107   // relocation to mangle
1108   if (AllRelocateCalls.size() < 2)
1109     return false;
1110 
1111   // RelocateInstMap is a mapping from the base relocate instruction to the
1112   // corresponding derived relocate instructions
1113   DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
1114   computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
1115   if (RelocateInstMap.empty())
1116     return false;
1117 
1118   for (auto &Item : RelocateInstMap)
1119     // Item.first is the RelocatedBase to offset against
1120     // Item.second is the vector of Targets to replace
1121     MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
1122   return MadeChange;
1123 }
1124 
1125 /// Sink the specified cast instruction into its user blocks.
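///
/// For illustration (a hypothetical sketch): given
///   def:
///     %c = bitcast i8* %p to i32*
///     br i1 %cond, label %use1, label %use2
///   use1:
///     %v = load i32, i32* %c
/// a copy of the cast is created at the start of %use1 (and of any other user
/// block), uses are rewritten to the local copy, and the original cast is
/// erased once it has no uses left.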
1126 static bool SinkCast(CastInst *CI) {
1127   BasicBlock *DefBB = CI->getParent();
1128 
1129   /// InsertedCasts - Only insert a cast in each block once.
1130   DenseMap<BasicBlock*, CastInst*> InsertedCasts;
1131 
1132   bool MadeChange = false;
1133   for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
1134        UI != E; ) {
1135     Use &TheUse = UI.getUse();
1136     Instruction *User = cast<Instruction>(*UI);
1137 
1138     // Figure out which BB this cast is used in.  For PHI's this is the
1139     // appropriate predecessor block.
1140     BasicBlock *UserBB = User->getParent();
1141     if (PHINode *PN = dyn_cast<PHINode>(User)) {
1142       UserBB = PN->getIncomingBlock(TheUse);
1143     }
1144 
1145     // Preincrement use iterator so we don't invalidate it.
1146     ++UI;
1147 
1148     // The first insertion point of a block containing an EH pad is after the
1149     // pad.  If the pad is the user, we cannot sink the cast past the pad.
1150     if (User->isEHPad())
1151       continue;
1152 
1153     // If the block selected to receive the cast is an EH pad that does not
1154     // allow non-PHI instructions before the terminator, we can't sink the
1155     // cast.
1156     if (UserBB->getTerminator()->isEHPad())
1157       continue;
1158 
1159     // If this user is in the same block as the cast, don't change the cast.
1160     if (UserBB == DefBB) continue;
1161 
1162     // If we have already inserted a cast into this block, use it.
1163     CastInst *&InsertedCast = InsertedCasts[UserBB];
1164 
1165     if (!InsertedCast) {
1166       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1167       assert(InsertPt != UserBB->end());
1168       InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
1169                                       CI->getType(), "", &*InsertPt);
1170       InsertedCast->setDebugLoc(CI->getDebugLoc());
1171     }
1172 
1173     // Replace a use of the cast with a use of the new cast.
1174     TheUse = InsertedCast;
1175     MadeChange = true;
1176     ++NumCastUses;
1177   }
1178 
1179   // If we removed all uses, nuke the cast.
1180   if (CI->use_empty()) {
1181     salvageDebugInfo(*CI);
1182     CI->eraseFromParent();
1183     MadeChange = true;
1184   }
1185 
1186   return MadeChange;
1187 }
1188 
1189 /// If the specified cast instruction is a noop copy (e.g. it's casting from
1190 /// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
1191 /// reduce the number of virtual registers that must be created and coalesced.
1192 ///
1193 /// Return true if any changes are made.
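///
/// A hypothetical sketch: on a target where i8 and i32 are both promoted to
/// the same register type, a cast such as
///   %t = trunc i32 %x to i8
/// is a noop copy, so it is sunk into its user blocks via SinkCast.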
1194 static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
1195                                        const DataLayout &DL) {
1196   // Sink only "cheap" (or nop) address-space casts.  This is a weaker condition
1197   // than sinking only nop casts, but is helpful on some platforms.
1198   if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {
1199     if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(),
1200                                  ASC->getDestAddressSpace()))
1201       return false;
1202   }
1203 
  // Determine whether this cast is a noop copy.
1205   EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
1206   EVT DstVT = TLI.getValueType(DL, CI->getType());
1207 
  // If this is an fp<->int conversion, it is not a noop copy.
1209   if (SrcVT.isInteger() != DstVT.isInteger())
1210     return false;
1211 
1212   // If this is an extension, it will be a zero or sign extension, which
1213   // isn't a noop.
1214   if (SrcVT.bitsLT(DstVT)) return false;
1215 
1216   // If these values will be promoted, find out what they will be promoted
1217   // to.  This helps us consider truncates on PPC as noop copies when they
1218   // are.
1219   if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
1220       TargetLowering::TypePromoteInteger)
1221     SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
1222   if (TLI.getTypeAction(CI->getContext(), DstVT) ==
1223       TargetLowering::TypePromoteInteger)
1224     DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
1225 
1226   // If, after promotion, these are the same types, this is a noop copy.
1227   if (SrcVT != DstVT)
1228     return false;
1229 
1230   return SinkCast(CI);
1231 }
1232 
1233 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
1234                                                  Value *Arg0, Value *Arg1,
1235                                                  CmpInst *Cmp,
1236                                                  Intrinsic::ID IID) {
1237   if (BO->getParent() != Cmp->getParent()) {
1238     // We used to use a dominator tree here to allow multi-block optimization.
1239     // But that was problematic because:
1240     // 1. It could cause a perf regression by hoisting the math op into the
1241     //    critical path.
1242     // 2. It could cause a perf regression by creating a value that was live
1243     //    across multiple blocks and increasing register pressure.
1244     // 3. Use of a dominator tree could cause large compile-time regression.
1245     //    This is because we recompute the DT on every change in the main CGP
1246     //    run-loop. The recomputing is probably unnecessary in many cases, so if
1247     //    that was fixed, using a DT here would be ok.
1248     return false;
1249   }
1250 
1251   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
1252   if (BO->getOpcode() == Instruction::Add &&
1253       IID == Intrinsic::usub_with_overflow) {
1254     assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
1255     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
1256   }
1257 
1258   // Insert at the first instruction of the pair.
1259   Instruction *InsertPt = nullptr;
1260   for (Instruction &Iter : *Cmp->getParent()) {
1261     // If BO is an XOR, it is not guaranteed that it comes after both inputs to
1262     // the overflow intrinsic are defined.
1263     if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
1264       InsertPt = &Iter;
1265       break;
1266     }
1267   }
1268   assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
1269 
1270   IRBuilder<> Builder(InsertPt);
1271   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
1272   if (BO->getOpcode() != Instruction::Xor) {
1273     Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
1274     BO->replaceAllUsesWith(Math);
1275   } else
1276     assert(BO->hasOneUse() &&
1277            "Patterns with XOr should use the BO only in the compare");
1278   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
1279   Cmp->replaceAllUsesWith(OV);
1280   Cmp->eraseFromParent();
1281   BO->eraseFromParent();
1282   return true;
1283 }
1284 
1285 /// Match special-case patterns that check for unsigned add overflow.
1286 static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
1287                                                    BinaryOperator *&Add) {
1288   // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
1289   // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
1290   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
1291 
1292   // We are not expecting non-canonical/degenerate code. Just bail out.
1293   if (isa<Constant>(A))
1294     return false;
1295 
1296   ICmpInst::Predicate Pred = Cmp->getPredicate();
1297   if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes()))
1298     B = ConstantInt::get(B->getType(), 1);
1299   else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt()))
1300     B = ConstantInt::get(B->getType(), -1);
1301   else
1302     return false;
1303 
1304   // Check the users of the variable operand of the compare looking for an add
1305   // with the adjusted constant.
1306   for (User *U : A->users()) {
1307     if (match(U, m_Add(m_Specific(A), m_Specific(B)))) {
1308       Add = cast<BinaryOperator>(U);
1309       return true;
1310     }
1311   }
1312   return false;
1313 }
1314 
1315 /// Try to combine the compare into a call to the llvm.uadd.with.overflow
1316 /// intrinsic. Return true if any changes were made.
1317 bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
1318                                                bool &ModifiedDT) {
1319   Value *A, *B;
1320   BinaryOperator *Add;
1321   if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
1322     if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
1323       return false;
    // Set A and B if we matched the constant edge case via
    // matchUAddWithOverflowConstantEdgeCases.
1325     A = Add->getOperand(0);
1326     B = Add->getOperand(1);
1327   }
1328 
1329   if (!TLI->shouldFormOverflowOp(ISD::UADDO,
1330                                  TLI->getValueType(*DL, Add->getType()),
1331                                  Add->hasNUsesOrMore(2)))
1332     return false;
1333 
1334   // We don't want to move around uses of condition values this late, so we
1335   // check if it is legal to create the call to the intrinsic in the basic
1336   // block containing the icmp.
1337   if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
1338     return false;
1339 
1340   if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
1341                                    Intrinsic::uadd_with_overflow))
1342     return false;
1343 
1344   // Reset callers - do not crash by iterating over a dead instruction.
1345   ModifiedDT = true;
1346   return true;
1347 }
1348 
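/// Try to combine the compare into a call to the llvm.usub.with.overflow
/// intrinsic. A sketch of the transformation:
///   %sub = sub i32 %a, %b
///   %cmp = icmp ult i32 %a, %b
/// becomes
///   %m   = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
///   %sub = extractvalue { i32, i1 } %m, 0
///   %cmp = extractvalue { i32, i1 } %m, 1
/// Return true if any changes were made.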
1349 bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
1350                                                bool &ModifiedDT) {
1351   // We are not expecting non-canonical/degenerate code. Just bail out.
1352   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
1353   if (isa<Constant>(A) && isa<Constant>(B))
1354     return false;
1355 
1356   // Convert (A u> B) to (A u< B) to simplify pattern matching.
1357   ICmpInst::Predicate Pred = Cmp->getPredicate();
1358   if (Pred == ICmpInst::ICMP_UGT) {
1359     std::swap(A, B);
1360     Pred = ICmpInst::ICMP_ULT;
1361   }
1362   // Convert special-case: (A == 0) is the same as (A u< 1).
1363   if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
1364     B = ConstantInt::get(B->getType(), 1);
1365     Pred = ICmpInst::ICMP_ULT;
1366   }
1367   // Convert special-case: (A != 0) is the same as (0 u< A).
1368   if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
1369     std::swap(A, B);
1370     Pred = ICmpInst::ICMP_ULT;
1371   }
1372   if (Pred != ICmpInst::ICMP_ULT)
1373     return false;
1374 
  // Walk the users of the variable operand of the compare, looking for a
  // subtract or add that uses that same operand. Also match the second operand
  // of the compare to the add/sub, allowing for the case where it is a negated
  // constant operand of an add.
1378   Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
1379   BinaryOperator *Sub = nullptr;
1380   for (User *U : CmpVariableOperand->users()) {
1381     // A - B, A u< B --> usubo(A, B)
1382     if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
1383       Sub = cast<BinaryOperator>(U);
1384       break;
1385     }
1386 
1387     // A + (-C), A u< C (canonicalized form of (sub A, C))
1388     const APInt *CmpC, *AddC;
1389     if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
1390         match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
1391       Sub = cast<BinaryOperator>(U);
1392       break;
1393     }
1394   }
1395   if (!Sub)
1396     return false;
1397 
1398   if (!TLI->shouldFormOverflowOp(ISD::USUBO,
1399                                  TLI->getValueType(*DL, Sub->getType()),
1400                                  Sub->hasNUsesOrMore(2)))
1401     return false;
1402 
1403   if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
1404                                    Cmp, Intrinsic::usub_with_overflow))
1405     return false;
1406 
1407   // Reset callers - do not crash by iterating over a dead instruction.
1408   ModifiedDT = true;
1409   return true;
1410 }
1411 
1412 /// Sink the given CmpInst into user blocks to reduce the number of virtual
1413 /// registers that must be created and coalesced. This is a clear win except on
1414 /// targets with multiple condition code registers (PowerPC), where it might
1415 /// lose; some adjustment may be wanted there.
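///
/// For example (a sketch): if %c below is used only by a branch in %bb2,
///   entry:
///     %c = icmp eq i32 %a, %b
///     ...
///   bb2:
///     br i1 %c, label %t, label %f
/// a copy of the icmp is created in %bb2 and the branch is rewritten to use
/// it, so the compare can be folded into the branch during selection.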
1416 ///
1417 /// Return true if any changes are made.
1418 static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
1419   if (TLI.hasMultipleConditionRegisters())
1420     return false;
1421 
1422   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
1423   if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
1424     return false;
1425 
1426   // Only insert a cmp in each block once.
1427   DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
1428 
1429   bool MadeChange = false;
1430   for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
1431        UI != E; ) {
1432     Use &TheUse = UI.getUse();
1433     Instruction *User = cast<Instruction>(*UI);
1434 
1435     // Preincrement use iterator so we don't invalidate it.
1436     ++UI;
1437 
1438     // Don't bother for PHI nodes.
1439     if (isa<PHINode>(User))
1440       continue;
1441 
1442     // Figure out which BB this cmp is used in.
1443     BasicBlock *UserBB = User->getParent();
1444     BasicBlock *DefBB = Cmp->getParent();
1445 
1446     // If this user is in the same block as the cmp, don't change the cmp.
1447     if (UserBB == DefBB) continue;
1448 
1449     // If we have already inserted a cmp into this block, use it.
1450     CmpInst *&InsertedCmp = InsertedCmps[UserBB];
1451 
1452     if (!InsertedCmp) {
1453       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1454       assert(InsertPt != UserBB->end());
1455       InsertedCmp =
1456           CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),
1457                           Cmp->getOperand(0), Cmp->getOperand(1), "",
1458                           &*InsertPt);
1459       // Propagate the debug info.
1460       InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
1461     }
1462 
1463     // Replace a use of the cmp with a use of the new cmp.
1464     TheUse = InsertedCmp;
1465     MadeChange = true;
1466     ++NumCmpUses;
1467   }
1468 
1469   // If we removed all uses, nuke the cmp.
1470   if (Cmp->use_empty()) {
1471     Cmp->eraseFromParent();
1472     MadeChange = true;
1473   }
1474 
1475   return MadeChange;
1476 }
1477 
1478 /// For pattern like:
1479 ///
1480 ///   DomCond = icmp sgt/slt CmpOp0, CmpOp1 (might not be in DomBB)
1481 ///   ...
1482 /// DomBB:
1483 ///   ...
1484 ///   br DomCond, TrueBB, CmpBB
1485 /// CmpBB: (with DomBB being the single predecessor)
1486 ///   ...
1487 ///   Cmp = icmp eq CmpOp0, CmpOp1
1488 ///   ...
1489 ///
/// This would use two comparisons on targets where the lowering of icmp
/// sgt/slt differs from the lowering of icmp eq (PowerPC). This function tries
/// to convert 'Cmp = icmp eq CmpOp0, CmpOp1' to 'Cmp = icmp slt/sgt CmpOp0,
/// CmpOp1'. After that, DomCond and Cmp can share the same comparison, saving
/// one comparison.
1495 ///
1496 /// Return true if any changes are made.
1497 static bool foldICmpWithDominatingICmp(CmpInst *Cmp,
1498                                        const TargetLowering &TLI) {
1499   if (!EnableICMP_EQToICMP_ST && TLI.isEqualityCmpFoldedWithSignedCmp())
1500     return false;
1501 
1502   ICmpInst::Predicate Pred = Cmp->getPredicate();
1503   if (Pred != ICmpInst::ICMP_EQ)
1504     return false;
1505 
1506   // If icmp eq has users other than BranchInst and SelectInst, converting it to
1507   // icmp slt/sgt would introduce more redundant LLVM IR.
1508   for (User *U : Cmp->users()) {
1509     if (isa<BranchInst>(U))
1510       continue;
1511     if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == Cmp)
1512       continue;
1513     return false;
1514   }
1515 
1516   // This is a cheap/incomplete check for dominance - just match a single
1517   // predecessor with a conditional branch.
1518   BasicBlock *CmpBB = Cmp->getParent();
1519   BasicBlock *DomBB = CmpBB->getSinglePredecessor();
1520   if (!DomBB)
1521     return false;
1522 
1523   // We want to ensure that the only way control gets to the comparison of
1524   // interest is that a less/greater than comparison on the same operands is
1525   // false.
1526   Value *DomCond;
1527   BasicBlock *TrueBB, *FalseBB;
1528   if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
1529     return false;
1530   if (CmpBB != FalseBB)
1531     return false;
1532 
1533   Value *CmpOp0 = Cmp->getOperand(0), *CmpOp1 = Cmp->getOperand(1);
1534   ICmpInst::Predicate DomPred;
1535   if (!match(DomCond, m_ICmp(DomPred, m_Specific(CmpOp0), m_Specific(CmpOp1))))
1536     return false;
1537   if (DomPred != ICmpInst::ICMP_SGT && DomPred != ICmpInst::ICMP_SLT)
1538     return false;
1539 
1540   // Convert the equality comparison to the opposite of the dominating
1541   // comparison and swap the direction for all branch/select users.
1542   // We have conceptually converted:
1543   // Res = (a < b) ? <LT_RES> : (a == b) ? <EQ_RES> : <GT_RES>;
1544   // to
1545   // Res = (a < b) ? <LT_RES> : (a > b)  ? <GT_RES> : <EQ_RES>;
1546   // And similarly for branches.
1547   for (User *U : Cmp->users()) {
1548     if (auto *BI = dyn_cast<BranchInst>(U)) {
1549       assert(BI->isConditional() && "Must be conditional");
1550       BI->swapSuccessors();
1551       continue;
1552     }
1553     if (auto *SI = dyn_cast<SelectInst>(U)) {
1554       // Swap operands
1555       SI->swapValues();
1556       SI->swapProfMetadata();
1557       continue;
1558     }
1559     llvm_unreachable("Must be a branch or a select");
1560   }
1561   Cmp->setPredicate(CmpInst::getSwappedPredicate(DomPred));
1562   return true;
1563 }
1564 
1565 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
1566   if (sinkCmpExpression(Cmp, *TLI))
1567     return true;
1568 
1569   if (combineToUAddWithOverflow(Cmp, ModifiedDT))
1570     return true;
1571 
1572   if (combineToUSubWithOverflow(Cmp, ModifiedDT))
1573     return true;
1574 
1575   if (foldICmpWithDominatingICmp(Cmp, *TLI))
1576     return true;
1577 
1578   return false;
1579 }
1580 
1581 /// Duplicate and sink the given 'and' instruction into user blocks where it is
1582 /// used in a compare to allow isel to generate better code for targets where
1583 /// this operation can be combined.
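///
/// For example (a sketch): an 'and' whose only users are (icmp eq X, 0) checks
/// in other blocks, such as
///   entry:
///     %m = and i32 %x, 255
///   bb1:
///     %c = icmp eq i32 %m, 0
/// is duplicated next to each such compare so that the and/cmp pair can be
/// combined there (e.g. into a test-under-mask style instruction).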
1584 ///
1585 /// Return true if any changes are made.
1586 static bool sinkAndCmp0Expression(Instruction *AndI,
1587                                   const TargetLowering &TLI,
1588                                   SetOfInstrs &InsertedInsts) {
1589   // Double-check that we're not trying to optimize an instruction that was
1590   // already optimized by some other part of this pass.
1591   assert(!InsertedInsts.count(AndI) &&
1592          "Attempting to optimize already optimized and instruction");
1593   (void) InsertedInsts;
1594 
1595   // Nothing to do for single use in same basic block.
1596   if (AndI->hasOneUse() &&
1597       AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
1598     return false;
1599 
1600   // Try to avoid cases where sinking/duplicating is likely to increase register
1601   // pressure.
1602   if (!isa<ConstantInt>(AndI->getOperand(0)) &&
1603       !isa<ConstantInt>(AndI->getOperand(1)) &&
1604       AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
1605     return false;
1606 
1607   for (auto *U : AndI->users()) {
1608     Instruction *User = cast<Instruction>(U);
1609 
1610     // Only sink 'and' feeding icmp with 0.
1611     if (!isa<ICmpInst>(User))
1612       return false;
1613 
1614     auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
1615     if (!CmpC || !CmpC->isZero())
1616       return false;
1617   }
1618 
1619   if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
1620     return false;
1621 
1622   LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
1623   LLVM_DEBUG(AndI->getParent()->dump());
1624 
1625   // Push the 'and' into the same block as the icmp 0.  There should only be
1626   // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
1627   // others, so we don't need to keep track of which BBs we insert into.
1628   for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
1629        UI != E; ) {
1630     Use &TheUse = UI.getUse();
1631     Instruction *User = cast<Instruction>(*UI);
1632 
1633     // Preincrement use iterator so we don't invalidate it.
1634     ++UI;
1635 
1636     LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
1637 
1638     // Keep the 'and' in the same place if the use is already in the same block.
1639     Instruction *InsertPt =
1640         User->getParent() == AndI->getParent() ? AndI : User;
1641     Instruction *InsertedAnd =
1642         BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
1643                                AndI->getOperand(1), "", InsertPt);
1644     // Propagate the debug info.
1645     InsertedAnd->setDebugLoc(AndI->getDebugLoc());
1646 
1647     // Replace a use of the 'and' with a use of the new 'and'.
1648     TheUse = InsertedAnd;
1649     ++NumAndUses;
1650     LLVM_DEBUG(User->getParent()->dump());
1651   }
1652 
1653   // We removed all uses, nuke the and.
1654   AndI->eraseFromParent();
1655   return true;
1656 }
1657 
/// Check whether a candidate user could be combined with a shift instruction.
/// Such users are:
/// 1. A truncate instruction.
/// 2. An 'and' instruction whose immediate is a mask of the low bits:
///    imm & (imm+1) == 0
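/// For example, the immediate 0x0000ffff is such a mask (0xffff & 0x10000 ==
/// 0), while 0x0000ff00 is not (0xff00 & 0xff01 != 0).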
1663 static bool isExtractBitsCandidateUse(Instruction *User) {
1664   if (!isa<TruncInst>(User)) {
1665     if (User->getOpcode() != Instruction::And ||
1666         !isa<ConstantInt>(User->getOperand(1)))
1667       return false;
1668 
1669     const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();
1670 
1671     if ((Cimm & (Cimm + 1)).getBoolValue())
1672       return false;
1673   }
1674   return true;
1675 }
1676 
/// Sink both the shift and the truncate instructions into the basic blocks
/// that use the truncate.
1678 static bool
1679 SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
1680                      DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
1681                      const TargetLowering &TLI, const DataLayout &DL) {
1682   BasicBlock *UserBB = User->getParent();
1683   DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
1684   auto *TruncI = cast<TruncInst>(User);
1685   bool MadeChange = false;
1686 
1687   for (Value::user_iterator TruncUI = TruncI->user_begin(),
1688                             TruncE = TruncI->user_end();
1689        TruncUI != TruncE;) {
1690 
1691     Use &TruncTheUse = TruncUI.getUse();
1692     Instruction *TruncUser = cast<Instruction>(*TruncUI);
1693     // Preincrement use iterator so we don't invalidate it.
1694 
1695     ++TruncUI;
1696 
1697     int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
1698     if (!ISDOpcode)
1699       continue;
1700 
1701     // If the use is actually a legal node, there will not be an
1702     // implicit truncate.
1703     // FIXME: always querying the result type is just an
1704     // approximation; some nodes' legality is determined by the
1705     // operand or other means. There's no good way to find out though.
1706     if (TLI.isOperationLegalOrCustom(
1707             ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
1708       continue;
1709 
1710     // Don't bother for PHI nodes.
1711     if (isa<PHINode>(TruncUser))
1712       continue;
1713 
1714     BasicBlock *TruncUserBB = TruncUser->getParent();
1715 
1716     if (UserBB == TruncUserBB)
1717       continue;
1718 
1719     BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
1720     CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
1721 
1722     if (!InsertedShift && !InsertedTrunc) {
1723       BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
1724       assert(InsertPt != TruncUserBB->end());
1725       // Sink the shift
1726       if (ShiftI->getOpcode() == Instruction::AShr)
1727         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1728                                                    "", &*InsertPt);
1729       else
1730         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1731                                                    "", &*InsertPt);
1732       InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
1733 
1734       // Sink the trunc
1735       BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
1736       TruncInsertPt++;
1737       assert(TruncInsertPt != TruncUserBB->end());
1738 
1739       InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
1740                                        TruncI->getType(), "", &*TruncInsertPt);
1741       InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());
1742 
1743       MadeChange = true;
1744 
1745       TruncTheUse = InsertedTrunc;
1746     }
1747   }
1748   return MadeChange;
1749 }
1750 
/// Sink the shift *right* instruction into user blocks if the uses could
/// potentially be combined with this shift instruction to generate a
/// BitExtract instruction. This is only applied if the architecture supports
/// the BitExtract instruction. Here is an example:
1755 /// BB1:
1756 ///   %x.extract.shift = lshr i64 %arg1, 32
1757 /// BB2:
1758 ///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
1759 /// ==>
1760 ///
1761 /// BB2:
1762 ///   %x.extract.shift.1 = lshr i64 %arg1, 32
1763 ///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
1764 ///
1765 /// CodeGen will recognize the pattern in BB2 and generate BitExtract
1766 /// instruction.
1767 /// Return true if any changes are made.
1768 static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
1769                                 const TargetLowering &TLI,
1770                                 const DataLayout &DL) {
1771   BasicBlock *DefBB = ShiftI->getParent();
1772 
1773   /// Only insert instructions in each block once.
1774   DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
1775 
1776   bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
1777 
1778   bool MadeChange = false;
1779   for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
1780        UI != E;) {
1781     Use &TheUse = UI.getUse();
1782     Instruction *User = cast<Instruction>(*UI);
1783     // Preincrement use iterator so we don't invalidate it.
1784     ++UI;
1785 
1786     // Don't bother for PHI nodes.
1787     if (isa<PHINode>(User))
1788       continue;
1789 
1790     if (!isExtractBitsCandidateUse(User))
1791       continue;
1792 
1793     BasicBlock *UserBB = User->getParent();
1794 
1795     if (UserBB == DefBB) {
      // If the shift and truncate instructions are in the same BB, the use of
      // the truncate (TruncUse) may still introduce another truncate if the
      // truncated type is not legal. In this case, we would like to sink both
      // the shift and the truncate instructions to the BB of TruncUse.
      // For example:
      // BB1:
      //   shift.result = lshr i64 opnd, imm
      //   trunc.result = trunc i64 shift.result to i16
      //
      // BB2:
      //   ----> We will have an implicit truncate here if the architecture
      //   does not have an i16 compare.
      //   cmp i16 trunc.result, opnd2
1809       //
      // If the type of the truncate is legal, no truncate will be
      // introduced in other basic blocks.
      if (isa<TruncInst>(User) && shiftIsLegal &&
          !TLI.isTypeLegal(TLI.getValueType(DL, User->getType())))
        MadeChange =
            SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
1817 
1818       continue;
1819     }
1820     // If we have already inserted a shift into this block, use it.
1821     BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
1822 
1823     if (!InsertedShift) {
1824       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1825       assert(InsertPt != UserBB->end());
1826 
1827       if (ShiftI->getOpcode() == Instruction::AShr)
1828         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1829                                                    "", &*InsertPt);
1830       else
1831         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1832                                                    "", &*InsertPt);
1833       InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
1834 
1835       MadeChange = true;
1836     }
1837 
1838     // Replace a use of the shift with a use of the new shift.
1839     TheUse = InsertedShift;
1840   }
1841 
1842   // If we removed all uses, or there are none, nuke the shift.
1843   if (ShiftI->use_empty()) {
1844     salvageDebugInfo(*ShiftI);
1845     ShiftI->eraseFromParent();
1846     MadeChange = true;
1847   }
1848 
1849   return MadeChange;
1850 }
1851 
1852 /// If counting leading or trailing zeros is an expensive operation and a zero
1853 /// input is defined, add a check for zero to avoid calling the intrinsic.
1854 ///
1855 /// We want to transform:
1856 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
1857 ///
1858 /// into:
1859 ///   entry:
1860 ///     %cmpz = icmp eq i64 %A, 0
1861 ///     br i1 %cmpz, label %cond.end, label %cond.false
1862 ///   cond.false:
1863 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
1864 ///     br label %cond.end
1865 ///   cond.end:
1866 ///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
1867 ///
1868 /// If the transform is performed, return true and set ModifiedDT to true.
1869 static bool despeculateCountZeros(IntrinsicInst *CountZeros,
1870                                   const TargetLowering *TLI,
1871                                   const DataLayout *DL,
1872                                   bool &ModifiedDT) {
1873   // If a zero input is undefined, it doesn't make sense to despeculate that.
1874   if (match(CountZeros->getOperand(1), m_One()))
1875     return false;
1876 
1877   // If it's cheap to speculate, there's nothing to do.
1878   auto IntrinsicID = CountZeros->getIntrinsicID();
1879   if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
1880       (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
1881     return false;
1882 
1883   // Only handle legal scalar cases. Anything else requires too much work.
1884   Type *Ty = CountZeros->getType();
1885   unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
1886   if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
1887     return false;
1888 
1889   // The intrinsic will be sunk behind a compare against zero and branch.
1890   BasicBlock *StartBlock = CountZeros->getParent();
1891   BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
1892 
1893   // Create another block after the count zero intrinsic. A PHI will be added
1894   // in this block to select the result of the intrinsic or the bit-width
1895   // constant if the input to the intrinsic is zero.
1896   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
1897   BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
1898 
1899   // Set up a builder to create a compare, conditional branch, and PHI.
1900   IRBuilder<> Builder(CountZeros->getContext());
1901   Builder.SetInsertPoint(StartBlock->getTerminator());
1902   Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
1903 
1904   // Replace the unconditional branch that was created by the first split with
1905   // a compare against zero and a conditional branch.
1906   Value *Zero = Constant::getNullValue(Ty);
1907   Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
1908   Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
1909   StartBlock->getTerminator()->eraseFromParent();
1910 
1911   // Create a PHI in the end block to select either the output of the intrinsic
1912   // or the bit width of the operand.
1913   Builder.SetInsertPoint(&EndBlock->front());
1914   PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
1915   CountZeros->replaceAllUsesWith(PN);
1916   Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
1917   PN->addIncoming(BitWidth, StartBlock);
1918   PN->addIncoming(CountZeros, CallBlock);
1919 
1920   // We are explicitly handling the zero case, so we can set the intrinsic's
1921   // undefined zero argument to 'true'. This will also prevent reprocessing the
1922   // intrinsic; we only despeculate when a zero input is defined.
1923   CountZeros->setArgOperand(1, Builder.getTrue());
1924   ModifiedDT = true;
1925   return true;
1926 }
1927 
1928 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
1929   BasicBlock *BB = CI->getParent();
1930 
  // Lower inline assembly if we can.
  // If we found an inline asm expression, and if the target knows how to
  // lower it to normal LLVM code, do so now.
1934   if (CI->isInlineAsm()) {
1935     if (TLI->ExpandInlineAsm(CI)) {
1936       // Avoid invalidating the iterator.
1937       CurInstIterator = BB->begin();
1938       // Avoid processing instructions out of order, which could cause
1939       // reuse before a value is defined.
1940       SunkAddrs.clear();
1941       return true;
1942     }
1943     // Sink address computing for memory operands into the block.
1944     if (optimizeInlineAsmInst(CI))
1945       return true;
1946   }
1947 
  // Align the pointer arguments to this call if the target thinks it's a good
  // idea.
1950   unsigned MinSize, PrefAlign;
1951   if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
1952     for (auto &Arg : CI->arg_operands()) {
1953       // We want to align both objects whose address is used directly and
1954       // objects whose address is used in casts and GEPs, though it only makes
1955       // sense for GEPs if the offset is a multiple of the desired alignment and
1956       // if size - offset meets the size threshold.
1957       if (!Arg->getType()->isPointerTy())
1958         continue;
1959       APInt Offset(DL->getIndexSizeInBits(
1960                        cast<PointerType>(Arg->getType())->getAddressSpace()),
1961                    0);
1962       Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
1963       uint64_t Offset2 = Offset.getLimitedValue();
1964       if ((Offset2 & (PrefAlign-1)) != 0)
1965         continue;
1966       AllocaInst *AI;
1967       if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
1968           DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
1969         AI->setAlignment(Align(PrefAlign));
1970       // Global variables can only be aligned if they are defined in this
1971       // object (i.e. they are uniquely initialized in this object), and
1972       // over-aligning global variables that have an explicit section is
1973       // forbidden.
1974       GlobalVariable *GV;
1975       if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
1976           GV->getPointerAlignment(*DL) < PrefAlign &&
1977           DL->getTypeAllocSize(GV->getValueType()) >=
1978               MinSize + Offset2)
1979         GV->setAlignment(MaybeAlign(PrefAlign));
1980     }
    // If this is a memcpy (or similar), then we may be able to improve the
    // alignment.
1983     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
1984       Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
1985       MaybeAlign MIDestAlign = MI->getDestAlign();
1986       if (!MIDestAlign || DestAlign > *MIDestAlign)
1987         MI->setDestAlignment(DestAlign);
1988       if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
1989         MaybeAlign MTISrcAlign = MTI->getSourceAlign();
1990         Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
1991         if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
1992           MTI->setSourceAlignment(SrcAlign);
1993       }
1994     }
1995   }
1996 
1997   // If we have a cold call site, try to sink addressing computation into the
1998   // cold block.  This interacts with our handling for loads and stores to
1999   // ensure that we can fold all uses of a potential addressing computation
2000   // into their uses.  TODO: generalize this to work over profiling data
2001   if (CI->hasFnAttr(Attribute::Cold) &&
2002       !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
2003     for (auto &Arg : CI->arg_operands()) {
2004       if (!Arg->getType()->isPointerTy())
2005         continue;
2006       unsigned AS = Arg->getType()->getPointerAddressSpace();
2007       return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
2008     }
2009 
2010   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
2011   if (II) {
2012     switch (II->getIntrinsicID()) {
2013     default: break;
2014     case Intrinsic::assume: {
2015       II->eraseFromParent();
2016       return true;
2017     }
2018 
2019     case Intrinsic::experimental_widenable_condition: {
      // Give up on future widening opportunities so that we can fold away dead
      // paths and merge blocks before going into block-local instruction
      // selection.
2023       if (II->use_empty()) {
2024         II->eraseFromParent();
2025         return true;
2026       }
2027       Constant *RetVal = ConstantInt::getTrue(II->getContext());
2028       resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
2029         replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
2030       });
2031       return true;
2032     }
2033     case Intrinsic::objectsize:
2034       llvm_unreachable("llvm.objectsize.* should have been lowered already");
2035     case Intrinsic::is_constant:
2036       llvm_unreachable("llvm.is.constant.* should have been lowered already");
2037     case Intrinsic::aarch64_stlxr:
2038     case Intrinsic::aarch64_stxr: {
2039       ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
2040       if (!ExtVal || !ExtVal->hasOneUse() ||
2041           ExtVal->getParent() == CI->getParent())
2042         return false;
2043       // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
2044       ExtVal->moveBefore(CI);
2045       // Mark this instruction as "inserted by CGP", so that other
2046       // optimizations don't touch it.
2047       InsertedInsts.insert(ExtVal);
2048       return true;
2049     }
2050 
2051     case Intrinsic::launder_invariant_group:
2052     case Intrinsic::strip_invariant_group: {
2053       Value *ArgVal = II->getArgOperand(0);
2054       auto it = LargeOffsetGEPMap.find(II);
2055       if (it != LargeOffsetGEPMap.end()) {
        // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
        // Make sure not to have to deal with iterator invalidation
        // after possibly adding ArgVal to LargeOffsetGEPMap.
        auto GEPs = std::move(it->second);
        LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
        LargeOffsetGEPMap.erase(II);
2062       }
2063 
2064       II->replaceAllUsesWith(ArgVal);
2065       II->eraseFromParent();
2066       return true;
2067     }
2068     case Intrinsic::cttz:
2069     case Intrinsic::ctlz:
2070       // If counting zeros is expensive, try to avoid it.
2071       return despeculateCountZeros(II, TLI, DL, ModifiedDT);
2072     case Intrinsic::fshl:
2073     case Intrinsic::fshr:
2074       return optimizeFunnelShift(II);
2075     case Intrinsic::dbg_value:
2076       return fixupDbgValue(II);
2077     case Intrinsic::vscale: {
2078       // If datalayout has no special restrictions on vector data layout,
2079       // replace `llvm.vscale` by an equivalent constant expression
2080       // to benefit from cheap constant propagation.
2081       Type *ScalableVectorTy =
2082           VectorType::get(Type::getInt8Ty(II->getContext()), 1, true);
2083       if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) {
2084         auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo());
2085         auto *One = ConstantInt::getSigned(II->getType(), 1);
2086         auto *CGep =
2087             ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One);
2088         II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType()));
2089         II->eraseFromParent();
2090         return true;
2091       }
2092       break;
2093     }
2094     case Intrinsic::masked_gather:
2095       return optimizeGatherScatterInst(II, II->getArgOperand(0));
2096     case Intrinsic::masked_scatter:
2097       return optimizeGatherScatterInst(II, II->getArgOperand(1));
2098     }
2099 
2100     SmallVector<Value *, 2> PtrOps;
2101     Type *AccessTy;
2102     if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
2103       while (!PtrOps.empty()) {
2104         Value *PtrVal = PtrOps.pop_back_val();
2105         unsigned AS = PtrVal->getType()->getPointerAddressSpace();
2106         if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
2107           return true;
2108       }
2109   }
2110 
2111   // From here on out we're working with named functions.
2112   if (!CI->getCalledFunction()) return false;
2113 
2114   // Lower all default uses of _chk calls.  This is very similar
2115   // to what InstCombineCalls does, but here we are only lowering calls
2116   // to fortified library functions (e.g. __memcpy_chk) that have the default
2117   // "don't know" as the objectsize.  Anything else should be left alone.
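  // For example (a sketch): a call like __memcpy_chk(dst, src, n, (size_t)-1),
  // i.e. with the "unknown object size" sentinel, is simplified here to a
  // plain memcpy(dst, src, n).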
2118   FortifiedLibCallSimplifier Simplifier(TLInfo, true);
2119   IRBuilder<> Builder(CI);
2120   if (Value *V = Simplifier.optimizeCall(CI, Builder)) {
2121     CI->replaceAllUsesWith(V);
2122     CI->eraseFromParent();
2123     return true;
2124   }
2125 
2126   return false;
2127 }
2128 
2129 /// Look for opportunities to duplicate return instructions to the predecessor
2130 /// to enable tail call optimizations. The case it is currently looking for is:
2131 /// @code
2132 /// bb0:
2133 ///   %tmp0 = tail call i32 @f0()
2134 ///   br label %return
2135 /// bb1:
2136 ///   %tmp1 = tail call i32 @f1()
2137 ///   br label %return
2138 /// bb2:
2139 ///   %tmp2 = tail call i32 @f2()
2140 ///   br label %return
2141 /// return:
2142 ///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
2143 ///   ret i32 %retval
2144 /// @endcode
2145 ///
2146 /// =>
2147 ///
2148 /// @code
2149 /// bb0:
2150 ///   %tmp0 = tail call i32 @f0()
2151 ///   ret i32 %tmp0
2152 /// bb1:
2153 ///   %tmp1 = tail call i32 @f1()
2154 ///   ret i32 %tmp1
2155 /// bb2:
2156 ///   %tmp2 = tail call i32 @f2()
2157 ///   ret i32 %tmp2
2158 /// @endcode
2159 bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) {
2160   ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
2161   if (!RetI)
2162     return false;
2163 
2164   PHINode *PN = nullptr;
2165   ExtractValueInst *EVI = nullptr;
2166   BitCastInst *BCI = nullptr;
2167   Value *V = RetI->getReturnValue();
2168   if (V) {
2169     BCI = dyn_cast<BitCastInst>(V);
2170     if (BCI)
2171       V = BCI->getOperand(0);
2172 
2173     EVI = dyn_cast<ExtractValueInst>(V);
2174     if (EVI) {
2175       V = EVI->getOperand(0);
2176       if (!std::all_of(EVI->idx_begin(), EVI->idx_end(),
2177                        [](unsigned idx) { return idx == 0; }))
2178         return false;
2179     }
2180 
2181     PN = dyn_cast<PHINode>(V);
2182     if (!PN)
2183       return false;
2184   }
2185 
2186   if (PN && PN->getParent() != BB)
2187     return false;
2188 
2189   // Make sure there are no instructions between the PHI and return, or that the
2190   // return is the first instruction in the block.
2191   if (PN) {
2192     BasicBlock::iterator BI = BB->begin();
2193     // Skip over debug and the bitcast.
2194     do {
2195       ++BI;
2196     } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI);
2197     if (&*BI != RetI)
2198       return false;
2199   } else {
2200     BasicBlock::iterator BI = BB->begin();
2201     while (isa<DbgInfoIntrinsic>(BI)) ++BI;
2202     if (&*BI != RetI)
2203       return false;
2204   }
2205 
2206   /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
2207   /// call.
2208   const Function *F = BB->getParent();
2209   SmallVector<BasicBlock*, 4> TailCallBBs;
2210   if (PN) {
2211     for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
2212       // Look through bitcasts.
2213       Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();
2214       CallInst *CI = dyn_cast<CallInst>(IncomingVal);
2215       BasicBlock *PredBB = PN->getIncomingBlock(I);
2216       // Make sure the phi value is indeed produced by the tail call.
2217       if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&
2218           TLI->mayBeEmittedAsTailCall(CI) &&
2219           attributesPermitTailCall(F, CI, RetI, *TLI))
2220         TailCallBBs.push_back(PredBB);
2221     }
2222   } else {
2223     SmallPtrSet<BasicBlock*, 4> VisitedBBs;
2224     for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
2225       if (!VisitedBBs.insert(*PI).second)
2226         continue;
2227 
2228       BasicBlock::InstListType &InstList = (*PI)->getInstList();
2229       BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
2230       BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
2231       do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
2232       if (RI == RE)
2233         continue;
2234 
2235       CallInst *CI = dyn_cast<CallInst>(&*RI);
2236       if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
2237           attributesPermitTailCall(F, CI, RetI, *TLI))
2238         TailCallBBs.push_back(*PI);
2239     }
2240   }
2241 
2242   bool Changed = false;
2243   for (auto const &TailCallBB : TailCallBBs) {
2244     // Make sure the call instruction is followed by an unconditional branch to
2245     // the return block.
2246     BranchInst *BI = dyn_cast<BranchInst>(TailCallBB->getTerminator());
2247     if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
2248       continue;
2249 
2250     // Duplicate the return into TailCallBB.
2251     (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
2252     assert(!VerifyBFIUpdates ||
2253            BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
2254     BFI->setBlockFreq(
2255         BB,
2256         (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
2257     ModifiedDT = Changed = true;
2258     ++NumRetsDup;
2259   }
2260 
2261   // If we eliminated all predecessors of the block, delete the block now.
2262   if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
2263     BB->eraseFromParent();
2264 
2265   return Changed;
2266 }
2267 
2268 //===----------------------------------------------------------------------===//
2269 // Memory Optimization
2270 //===----------------------------------------------------------------------===//
2271 
2272 namespace {
2273 
2274 /// This is an extended version of TargetLowering::AddrMode
2275 /// which holds actual Value*'s for register values.
2276 struct ExtAddrMode : public TargetLowering::AddrMode {
2277   Value *BaseReg = nullptr;
2278   Value *ScaledReg = nullptr;
2279   Value *OriginalValue = nullptr;
2280   bool InBounds = true;
2281 
2282   enum FieldName {
2283     NoField        = 0x00,
2284     BaseRegField   = 0x01,
2285     BaseGVField    = 0x02,
2286     BaseOffsField  = 0x04,
2287     ScaledRegField = 0x08,
2288     ScaleField     = 0x10,
2289     MultipleFields = 0xff
2290   };
2291 
2292 
2293   ExtAddrMode() = default;
2294 
2295   void print(raw_ostream &OS) const;
2296   void dump() const;
2297 
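  // Compare this addressing mode with \p other: the result is NoField if they
  // are identical, the single differing field if exactly one field differs,
  // and MultipleFields if they differ in more than one field or in a way we
  // cannot combine (e.g. mismatched types or 'inbounds'). For example, two
  // modes that differ only in BaseOffs yield BaseOffsField, while modes that
  // differ in both BaseReg and Scale yield MultipleFields.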
2298   FieldName compare(const ExtAddrMode &other) {
    // First check that the types are the same on each field, as differing
    // types are something we can't cope with later on.
2301     if (BaseReg && other.BaseReg &&
2302         BaseReg->getType() != other.BaseReg->getType())
2303       return MultipleFields;
2304     if (BaseGV && other.BaseGV &&
2305         BaseGV->getType() != other.BaseGV->getType())
2306       return MultipleFields;
2307     if (ScaledReg && other.ScaledReg &&
2308         ScaledReg->getType() != other.ScaledReg->getType())
2309       return MultipleFields;
2310 
2311     // Conservatively reject 'inbounds' mismatches.
2312     if (InBounds != other.InBounds)
2313       return MultipleFields;
2314 
2315     // Check each field to see if it differs.
2316     unsigned Result = NoField;
2317     if (BaseReg != other.BaseReg)
2318       Result |= BaseRegField;
2319     if (BaseGV != other.BaseGV)
2320       Result |= BaseGVField;
2321     if (BaseOffs != other.BaseOffs)
2322       Result |= BaseOffsField;
2323     if (ScaledReg != other.ScaledReg)
2324       Result |= ScaledRegField;
2325     // Don't count 0 as being a different scale, because that actually means
2326     // unscaled (which will already be counted by having no ScaledReg).
2327     if (Scale && other.Scale && Scale != other.Scale)
2328       Result |= ScaleField;
2329 
2330     if (countPopulation(Result) > 1)
2331       return MultipleFields;
2332     else
2333       return static_cast<FieldName>(Result);
2334   }
2335 
  // An AddrMode is trivial if it involves no calculation, i.e. it is just a
  // base with no offset.
2338   bool isTrivial() {
2339     // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
2340     // trivial if at most one of these terms is nonzero, except that BaseGV and
2341     // BaseReg both being zero actually means a null pointer value, which we
2342     // consider to be 'non-zero' here.
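    // For example, a mode consisting of just a BaseReg or just a BaseGV is
    // trivial, while one with a nonzero BaseOffs or a ScaledReg*Scale term is
    // not.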
2343     return !BaseOffs && !Scale && !(BaseGV && BaseReg);
2344   }
2345 
2346   Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {
2347     switch (Field) {
2348     default:
2349       return nullptr;
2350     case BaseRegField:
2351       return BaseReg;
2352     case BaseGVField:
2353       return BaseGV;
2354     case ScaledRegField:
2355       return ScaledReg;
2356     case BaseOffsField:
2357       return ConstantInt::get(IntPtrTy, BaseOffs);
2358     }
2359   }
2360 
2361   void SetCombinedField(FieldName Field, Value *V,
2362                         const SmallVectorImpl<ExtAddrMode> &AddrModes) {
2363     switch (Field) {
2364     default:
2365       llvm_unreachable("Unhandled fields are expected to be rejected earlier");
2366       break;
2367     case ExtAddrMode::BaseRegField:
2368       BaseReg = V;
2369       break;
2370     case ExtAddrMode::BaseGVField:
2371       // A combined BaseGV is an Instruction, not a GlobalValue, so it goes
2372       // in the BaseReg field.
2373       assert(BaseReg == nullptr);
2374       BaseReg = V;
2375       BaseGV = nullptr;
2376       break;
2377     case ExtAddrMode::ScaledRegField:
2378       ScaledReg = V;
2379       // If we have a mix of scaled and unscaled addrmodes then we want scale
2380       // to be the scale and not zero.
2381       if (!Scale)
2382         for (const ExtAddrMode &AM : AddrModes)
2383           if (AM.Scale) {
2384             Scale = AM.Scale;
2385             break;
2386           }
2387       break;
2388     case ExtAddrMode::BaseOffsField:
2389       // The offset is no longer a constant, so it goes in ScaledReg with a
2390       // scale of 1.
2391       assert(ScaledReg == nullptr);
2392       ScaledReg = V;
2393       Scale = 1;
2394       BaseOffs = 0;
2395       break;
2396     }
2397   }
2398 };
2399 
2400 } // end anonymous namespace
2401 
2402 #ifndef NDEBUG
2403 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
2404   AM.print(OS);
2405   return OS;
2406 }
2407 #endif
2408 
2409 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2410 void ExtAddrMode::print(raw_ostream &OS) const {
2411   bool NeedPlus = false;
2412   OS << "[";
2413   if (InBounds)
2414     OS << "inbounds ";
2415   if (BaseGV) {
2416     OS << (NeedPlus ? " + " : "")
2417        << "GV:";
2418     BaseGV->printAsOperand(OS, /*PrintType=*/false);
2419     NeedPlus = true;
2420   }
2421 
2422   if (BaseOffs) {
2423     OS << (NeedPlus ? " + " : "")
2424        << BaseOffs;
2425     NeedPlus = true;
2426   }
2427 
2428   if (BaseReg) {
2429     OS << (NeedPlus ? " + " : "")
2430        << "Base:";
2431     BaseReg->printAsOperand(OS, /*PrintType=*/false);
2432     NeedPlus = true;
2433   }
2434   if (Scale) {
2435     OS << (NeedPlus ? " + " : "")
2436        << Scale << "*";
2437     ScaledReg->printAsOperand(OS, /*PrintType=*/false);
2438   }
2439 
2440   OS << ']';
2441 }
2442 
2443 LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
2444   print(dbgs());
2445   dbgs() << '\n';
2446 }
2447 #endif
2448 
2449 namespace {
2450 
2451 /// This class provides transaction based operation on the IR.
2452 /// Every change made through this class is recorded in the internal state and
2453 /// can be undone (rollback) until commit is called.
2454 /// CGP does not check if instructions could be speculatively executed when
2455 /// moved. Preserving the original location would pessimize the debugging
2456 /// experience, as well as negatively impact the quality of sample PGO.
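///
/// A rough sketch of how this is typically used elsewhere in this pass (the
/// 'Profitable' flag is illustrative):
///   TypePromotionTransaction TPT(RemovedInsts);
///   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
///       TPT.getRestorationPoint();
///   // ... speculative rewrites performed through TPT ...
///   if (!Profitable)
///     TPT.rollback(LastKnownGood);
///   else
///     TPT.commit();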
2457 class TypePromotionTransaction {
2458   /// This represents the common interface of the individual transaction.
2459   /// Each class implements the logic for doing one specific modification on
2460   /// the IR via the TypePromotionTransaction.
2461   class TypePromotionAction {
2462   protected:
2463     /// The Instruction modified.
2464     Instruction *Inst;
2465 
2466   public:
2467     /// Constructor of the action.
2468     /// The constructor performs the related action on the IR.
2469     TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
2470 
2471     virtual ~TypePromotionAction() = default;
2472 
2473     /// Undo the modification done by this action.
2474     /// When this method is called, the IR must be in the same state as it was
2475     /// before this action was applied.
2476     /// \pre Undoing the action works if and only if the IR is in the exact same
2477     /// state as it was directly after this action was applied.
2478     virtual void undo() = 0;
2479 
    /// Commit every change made by this action.
    /// When the results of the action on the IR are to be kept, it is
    /// important to call this function; otherwise hidden information may be
    /// kept forever.
2483     virtual void commit() {
2484       // Nothing to be done, this action is not doing anything.
2485     }
2486   };
2487 
2488   /// Utility to remember the position of an instruction.
2489   class InsertionHandler {
    /// Position of an instruction.
    /// Either the instruction:
    /// - is the first in a basic block: BB is used, or
    /// - has a previous instruction: PrevInst is used.
2494     union {
2495       Instruction *PrevInst;
2496       BasicBlock *BB;
2497     } Point;
2498 
2499     /// Remember whether or not the instruction had a previous instruction.
2500     bool HasPrevInstruction;
2501 
2502   public:
2503     /// Record the position of \p Inst.
2504     InsertionHandler(Instruction *Inst) {
2505       BasicBlock::iterator It = Inst->getIterator();
2506       HasPrevInstruction = (It != (Inst->getParent()->begin()));
2507       if (HasPrevInstruction)
2508         Point.PrevInst = &*--It;
2509       else
2510         Point.BB = Inst->getParent();
2511     }
2512 
2513     /// Insert \p Inst at the recorded position.
2514     void insert(Instruction *Inst) {
2515       if (HasPrevInstruction) {
2516         if (Inst->getParent())
2517           Inst->removeFromParent();
2518         Inst->insertAfter(Point.PrevInst);
2519       } else {
2520         Instruction *Position = &*Point.BB->getFirstInsertionPt();
2521         if (Inst->getParent())
2522           Inst->moveBefore(Position);
2523         else
2524           Inst->insertBefore(Position);
2525       }
2526     }
2527   };
2528 
2529   /// Move an instruction before another.
2530   class InstructionMoveBefore : public TypePromotionAction {
2531     /// Original position of the instruction.
2532     InsertionHandler Position;
2533 
2534   public:
2535     /// Move \p Inst before \p Before.
2536     InstructionMoveBefore(Instruction *Inst, Instruction *Before)
2537         : TypePromotionAction(Inst), Position(Inst) {
2538       LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before
2539                         << "\n");
2540       Inst->moveBefore(Before);
2541     }
2542 
2543     /// Move the instruction back to its original position.
2544     void undo() override {
2545       LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
2546       Position.insert(Inst);
2547     }
2548   };
2549 
2550   /// Set the operand of an instruction with a new value.
2551   class OperandSetter : public TypePromotionAction {
2552     /// Original operand of the instruction.
2553     Value *Origin;
2554 
    /// Index of the modified operand.
2556     unsigned Idx;
2557 
2558   public:
    /// Set operand \p Idx of \p Inst to \p NewVal.
2560     OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
2561         : TypePromotionAction(Inst), Idx(Idx) {
2562       LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
2563                         << "for:" << *Inst << "\n"
2564                         << "with:" << *NewVal << "\n");
2565       Origin = Inst->getOperand(Idx);
2566       Inst->setOperand(Idx, NewVal);
2567     }
2568 
2569     /// Restore the original value of the instruction.
2570     void undo() override {
2571       LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
2572                         << "for: " << *Inst << "\n"
2573                         << "with: " << *Origin << "\n");
2574       Inst->setOperand(Idx, Origin);
2575     }
2576   };
2577 
  /// Hide the operands of an instruction.
  /// Act as if this instruction was not using any of its operands.
2580   class OperandsHider : public TypePromotionAction {
2581     /// The list of original operands.
2582     SmallVector<Value *, 4> OriginalValues;
2583 
2584   public:
2585     /// Remove \p Inst from the uses of the operands of \p Inst.
2586     OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
2587       LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
2588       unsigned NumOpnds = Inst->getNumOperands();
2589       OriginalValues.reserve(NumOpnds);
2590       for (unsigned It = 0; It < NumOpnds; ++It) {
2591         // Save the current operand.
2592         Value *Val = Inst->getOperand(It);
2593         OriginalValues.push_back(Val);
2594         // Set a dummy one.
2595         // We could use OperandSetter here, but that would imply an overhead
2596         // that we are not willing to pay.
2597         Inst->setOperand(It, UndefValue::get(Val->getType()));
2598       }
2599     }
2600 
2601     /// Restore the original list of uses.
2602     void undo() override {
2603       LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
2604       for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
2605         Inst->setOperand(It, OriginalValues[It]);
2606     }
2607   };
2608 
2609   /// Build a truncate instruction.
2610   class TruncBuilder : public TypePromotionAction {
2611     Value *Val;
2612 
2613   public:
2614     /// Build a truncate instruction of \p Opnd producing a \p Ty
2615     /// result.
2616     /// trunc Opnd to Ty.
2617     TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
2618       IRBuilder<> Builder(Opnd);
2619       Builder.SetCurrentDebugLocation(DebugLoc());
2620       Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
2621       LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
2622     }
2623 
2624     /// Get the built value.
2625     Value *getBuiltValue() { return Val; }
2626 
2627     /// Remove the built instruction.
2628     void undo() override {
2629       LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
2630       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2631         IVal->eraseFromParent();
2632     }
2633   };
2634 
2635   /// Build a sign extension instruction.
2636   class SExtBuilder : public TypePromotionAction {
2637     Value *Val;
2638 
2639   public:
2640     /// Build a sign extension instruction of \p Opnd producing a \p Ty
2641     /// result.
2642     /// sext Opnd to Ty.
2643     SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2644         : TypePromotionAction(InsertPt) {
2645       IRBuilder<> Builder(InsertPt);
2646       Val = Builder.CreateSExt(Opnd, Ty, "promoted");
2647       LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
2648     }
2649 
2650     /// Get the built value.
2651     Value *getBuiltValue() { return Val; }
2652 
2653     /// Remove the built instruction.
2654     void undo() override {
2655       LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
2656       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2657         IVal->eraseFromParent();
2658     }
2659   };
2660 
2661   /// Build a zero extension instruction.
2662   class ZExtBuilder : public TypePromotionAction {
2663     Value *Val;
2664 
2665   public:
2666     /// Build a zero extension instruction of \p Opnd producing a \p Ty
2667     /// result.
2668     /// zext Opnd to Ty.
2669     ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2670         : TypePromotionAction(InsertPt) {
2671       IRBuilder<> Builder(InsertPt);
2672       Builder.SetCurrentDebugLocation(DebugLoc());
2673       Val = Builder.CreateZExt(Opnd, Ty, "promoted");
2674       LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
2675     }
2676 
2677     /// Get the built value.
2678     Value *getBuiltValue() { return Val; }
2679 
2680     /// Remove the built instruction.
2681     void undo() override {
2682       LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
2683       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2684         IVal->eraseFromParent();
2685     }
2686   };
2687 
2688   /// Mutate an instruction to another type.
2689   class TypeMutator : public TypePromotionAction {
2690     /// Record the original type.
2691     Type *OrigTy;
2692 
2693   public:
2694     /// Mutate the type of \p Inst into \p NewTy.
2695     TypeMutator(Instruction *Inst, Type *NewTy)
2696         : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
2697       LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
2698                         << "\n");
2699       Inst->mutateType(NewTy);
2700     }
2701 
2702     /// Mutate the instruction back to its original type.
2703     void undo() override {
2704       LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
2705                         << "\n");
2706       Inst->mutateType(OrigTy);
2707     }
2708   };
2709 
2710   /// Replace the uses of an instruction by another instruction.
2711   class UsesReplacer : public TypePromotionAction {
2712     /// Helper structure to keep track of the replaced uses.
2713     struct InstructionAndIdx {
      /// The instruction that uses the replaced instruction.
2715       Instruction *Inst;
2716 
      /// The operand index at which the replaced instruction is used by Inst.
2718       unsigned Idx;
2719 
2720       InstructionAndIdx(Instruction *Inst, unsigned Idx)
2721           : Inst(Inst), Idx(Idx) {}
2722     };
2723 
2724     /// Keep track of the original uses (pair Instruction, Index).
2725     SmallVector<InstructionAndIdx, 4> OriginalUses;
2726     /// Keep track of the debug users.
2727     SmallVector<DbgValueInst *, 1> DbgValues;
2728 
2729     using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;
2730 
2731   public:
    /// Replace all the uses of \p Inst with \p New.
2733     UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
2734       LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
2735                         << "\n");
2736       // Record the original uses.
2737       for (Use &U : Inst->uses()) {
2738         Instruction *UserI = cast<Instruction>(U.getUser());
2739         OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
2740       }
2741       // Record the debug uses separately. They are not in the instruction's
2742       // use list, but they are replaced by RAUW.
2743       findDbgValues(DbgValues, Inst);
2744 
2745       // Now, we can replace the uses.
2746       Inst->replaceAllUsesWith(New);
2747     }
2748 
2749     /// Reassign the original uses of Inst to Inst.
2750     void undo() override {
2751       LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
2752       for (use_iterator UseIt = OriginalUses.begin(),
2753                         EndIt = OriginalUses.end();
2754            UseIt != EndIt; ++UseIt) {
2755         UseIt->Inst->setOperand(UseIt->Idx, Inst);
2756       }
2757       // RAUW has replaced all original uses with references to the new value,
2758       // including the debug uses. Since we are undoing the replacements,
2759       // the original debug uses must also be reinstated to maintain the
2760       // correctness and utility of debug value instructions.
2761       for (auto *DVI: DbgValues) {
2762         LLVMContext &Ctx = Inst->getType()->getContext();
2763         auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst));
2764         DVI->setOperand(0, MV);
2765       }
2766     }
2767   };
2768 
2769   /// Remove an instruction from the IR.
2770   class InstructionRemover : public TypePromotionAction {
2771     /// Original position of the instruction.
2772     InsertionHandler Inserter;
2773 
    /// Helper structure to hide all the links to the instruction. In other
    /// words, this helps to behave as if the instruction were removed.
2776     OperandsHider Hider;
2777 
2778     /// Keep track of the uses replaced, if any.
2779     UsesReplacer *Replacer = nullptr;
2780 
2781     /// Keep track of instructions removed.
2782     SetOfInstrs &RemovedInsts;
2783 
2784   public:
    /// Remove all references to \p Inst and optionally replace all its
    /// uses with \p New.
2787     /// \p RemovedInsts Keep track of the instructions removed by this Action.
2788     /// \pre If !Inst->use_empty(), then New != nullptr
2789     InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
2790                        Value *New = nullptr)
2791         : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
2792           RemovedInsts(RemovedInsts) {
2793       if (New)
2794         Replacer = new UsesReplacer(Inst, New);
2795       LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
2796       RemovedInsts.insert(Inst);
2797       /// The instructions removed here will be freed after completing
2798       /// optimizeBlock() for all blocks as we need to keep track of the
2799       /// removed instructions during promotion.
2800       Inst->removeFromParent();
2801     }
2802 
2803     ~InstructionRemover() override { delete Replacer; }
2804 
    /// Resurrect the instruction and reassign it to the proper uses if
    /// a new value was provided when building this action.
2807     void undo() override {
2808       LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
2809       Inserter.insert(Inst);
2810       if (Replacer)
2811         Replacer->undo();
2812       Hider.undo();
2813       RemovedInsts.erase(Inst);
2814     }
2815   };
2816 
2817 public:
2818   /// Restoration point.
2819   /// The restoration point is a pointer to an action instead of an iterator
2820   /// because the iterator may be invalidated but not the pointer.
2821   using ConstRestorationPt = const TypePromotionAction *;
2822 
2823   TypePromotionTransaction(SetOfInstrs &RemovedInsts)
2824       : RemovedInsts(RemovedInsts) {}
2825 
  /// Commit every change made in this transaction.
2827   void commit();
2828 
2829   /// Undo all the changes made after the given point.
2830   void rollback(ConstRestorationPt Point);
2831 
2832   /// Get the current restoration point.
2833   ConstRestorationPt getRestorationPoint() const;
2834 
2835   /// \name API for IR modification with state keeping to support rollback.
2836   /// @{
2837   /// Same as Instruction::setOperand.
2838   void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
2839 
2840   /// Same as Instruction::eraseFromParent.
2841   void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
2842 
2843   /// Same as Value::replaceAllUsesWith.
2844   void replaceAllUsesWith(Instruction *Inst, Value *New);
2845 
2846   /// Same as Value::mutateType.
2847   void mutateType(Instruction *Inst, Type *NewTy);
2848 
  /// Same as IRBuilder::CreateTrunc.
2850   Value *createTrunc(Instruction *Opnd, Type *Ty);
2851 
  /// Same as IRBuilder::CreateSExt.
2853   Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
2854 
  /// Same as IRBuilder::CreateZExt.
2856   Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
2857 
2858   /// Same as Instruction::moveBefore.
2859   void moveBefore(Instruction *Inst, Instruction *Before);
2860   /// @}
2861 
2862 private:
2863   /// The ordered list of actions made so far.
2864   SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
2865 
  using CommitPt =
      SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;
2867 
2868   SetOfInstrs &RemovedInsts;
2869 };
2870 
2871 } // end anonymous namespace
2872 
2873 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
2874                                           Value *NewVal) {
2875   Actions.push_back(std::make_unique<TypePromotionTransaction::OperandSetter>(
2876       Inst, Idx, NewVal));
2877 }
2878 
2879 void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
2880                                                 Value *NewVal) {
2881   Actions.push_back(
2882       std::make_unique<TypePromotionTransaction::InstructionRemover>(
2883           Inst, RemovedInsts, NewVal));
2884 }
2885 
2886 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
2887                                                   Value *New) {
2888   Actions.push_back(
2889       std::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
2890 }
2891 
2892 void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
2893   Actions.push_back(
2894       std::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
2895 }
2896 
2897 Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
2898                                              Type *Ty) {
2899   std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
2900   Value *Val = Ptr->getBuiltValue();
2901   Actions.push_back(std::move(Ptr));
2902   return Val;
2903 }
2904 
2905 Value *TypePromotionTransaction::createSExt(Instruction *Inst,
2906                                             Value *Opnd, Type *Ty) {
2907   std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
2908   Value *Val = Ptr->getBuiltValue();
2909   Actions.push_back(std::move(Ptr));
2910   return Val;
2911 }
2912 
2913 Value *TypePromotionTransaction::createZExt(Instruction *Inst,
2914                                             Value *Opnd, Type *Ty) {
2915   std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
2916   Value *Val = Ptr->getBuiltValue();
2917   Actions.push_back(std::move(Ptr));
2918   return Val;
2919 }
2920 
2921 void TypePromotionTransaction::moveBefore(Instruction *Inst,
2922                                           Instruction *Before) {
2923   Actions.push_back(
2924       std::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
2925           Inst, Before));
2926 }
2927 
2928 TypePromotionTransaction::ConstRestorationPt
2929 TypePromotionTransaction::getRestorationPoint() const {
2930   return !Actions.empty() ? Actions.back().get() : nullptr;
2931 }
2932 
2933 void TypePromotionTransaction::commit() {
2934   for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
2935        ++It)
2936     (*It)->commit();
2937   Actions.clear();
2938 }
2939 
2940 void TypePromotionTransaction::rollback(
2941     TypePromotionTransaction::ConstRestorationPt Point) {
2942   while (!Actions.empty() && Point != Actions.back().get()) {
2943     std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
2944     Curr->undo();
2945   }
2946 }
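// Illustrative usage sketch (hypothetical caller, not part of this file):
// callers typically take a restoration point before speculative IR rewrites,
// and then either roll back to it or commit everything at the end.
//
//   TypePromotionTransaction TPT(RemovedInsts);
//   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
//       TPT.getRestorationPoint();
//   TPT.mutateType(I, NewTy);          // tentative change, recorded as action
//   TPT.replaceAllUsesWith(I, NewVal); // another recorded action
//   if (!StillProfitable)
//     TPT.rollback(LastKnownGood);     // undo actions made after the point
//   ...
//   TPT.commit();                      // otherwise, keep all the changes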
2947 
2948 namespace {
2949 
2950 /// A helper class for matching addressing modes.
2951 ///
2952 /// This encapsulates the logic for matching the target-legal addressing modes.
2953 class AddressingModeMatcher {
2954   SmallVectorImpl<Instruction*> &AddrModeInsts;
2955   const TargetLowering &TLI;
2956   const TargetRegisterInfo &TRI;
2957   const DataLayout &DL;
2958 
2959   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
2960   /// the memory instruction that we're computing this address for.
2961   Type *AccessTy;
2962   unsigned AddrSpace;
2963   Instruction *MemoryInst;
2964 
2965   /// This is the addressing mode that we're building up. This is
2966   /// part of the return value of this addressing mode matching stuff.
2967   ExtAddrMode &AddrMode;
2968 
2969   /// The instructions inserted by other CodeGenPrepare optimizations.
2970   const SetOfInstrs &InsertedInsts;
2971 
2972   /// A map from the instructions to their type before promotion.
2973   InstrToOrigTy &PromotedInsts;
2974 
2975   /// The ongoing transaction where every action should be registered.
2976   TypePromotionTransaction &TPT;
2977 
  // A GEP whose offset is too large to be folded into the addressing mode.
2979   std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;
2980 
2981   /// This is set to true when we should not do profitability checks.
2982   /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
2983   bool IgnoreProfitability;
2984 
2985   /// True if we are optimizing for size.
2986   bool OptSize;
2987 
2988   ProfileSummaryInfo *PSI;
2989   BlockFrequencyInfo *BFI;
2990 
2991   AddressingModeMatcher(
2992       SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
2993       const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
2994       ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
2995       InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
2996       std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
2997       bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
2998       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
2999         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
3000         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
3001         PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
3002         OptSize(OptSize), PSI(PSI), BFI(BFI) {
3003     IgnoreProfitability = false;
3004   }
3005 
3006 public:
  /// Find the maximal addressing mode that a load/store of V can fold,
  /// given an access type of AccessTy.  This returns a list of involved
3009   /// instructions in AddrModeInsts.
3010   /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
3011   /// optimizations.
3012   /// \p PromotedInsts maps the instructions to their type before promotion.
  /// \p TPT The ongoing transaction where every action should be registered.
3014   static ExtAddrMode
3015   Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
3016         SmallVectorImpl<Instruction *> &AddrModeInsts,
3017         const TargetLowering &TLI, const TargetRegisterInfo &TRI,
3018         const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
3019         TypePromotionTransaction &TPT,
3020         std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
3021         bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
3022     ExtAddrMode Result;
3023 
3024     bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
3025                                          MemoryInst, Result, InsertedInsts,
3026                                          PromotedInsts, TPT, LargeOffsetGEP,
3027                                          OptSize, PSI, BFI)
3028                        .matchAddr(V, 0);
    (void)Success;
    assert(Success && "Couldn't select *anything*?");
3030     return Result;
3031   }
3032 
3033 private:
3034   bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
3035   bool matchAddr(Value *Addr, unsigned Depth);
3036   bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
3037                           bool *MovedAway = nullptr);
3038   bool isProfitableToFoldIntoAddressingMode(Instruction *I,
3039                                             ExtAddrMode &AMBefore,
3040                                             ExtAddrMode &AMAfter);
3041   bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
3042   bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
3043                              Value *PromotedOperand) const;
3044 };
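// Illustrative example (assumed IR, for exposition only): on a target that
// supports [BaseReg + ScaledReg*Scale + BaseOffs] addressing, an address such
// as
//   %gep = getelementptr inbounds i32, i32* %p, i64 %i
//   %v   = load i32, i32* %gep
// can be matched to an ExtAddrMode with BaseReg = %p, ScaledReg = %i and
// Scale = 4, with %gep recorded in AddrModeInsts.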
3045 
3046 class PhiNodeSet;
3047 
3048 /// An iterator for PhiNodeSet.
3049 class PhiNodeSetIterator {
3050   PhiNodeSet * const Set;
3051   size_t CurrentIndex = 0;
3052 
3053 public:
3054   /// The constructor. Start should point to either a valid element, or be equal
3055   /// to the size of the underlying SmallVector of the PhiNodeSet.
3056   PhiNodeSetIterator(PhiNodeSet * const Set, size_t Start);
3057   PHINode * operator*() const;
3058   PhiNodeSetIterator& operator++();
3059   bool operator==(const PhiNodeSetIterator &RHS) const;
3060   bool operator!=(const PhiNodeSetIterator &RHS) const;
3061 };
3062 
3063 /// Keeps a set of PHINodes.
3064 ///
3065 /// This is a minimal set implementation for a specific use case:
3066 /// It is very fast when there are very few elements, but also provides good
3067 /// performance when there are many. It is similar to SmallPtrSet, but also
3068 /// provides iteration by insertion order, which is deterministic and stable
3069 /// across runs. It is also similar to SmallSetVector, but provides removing
3070 /// elements in O(1) time. This is achieved by not actually removing the element
/// from the underlying vector, which comes at the cost of using more memory,
/// but that is fine, since PhiNodeSets are used as short-lived objects.
3073 class PhiNodeSet {
3074   friend class PhiNodeSetIterator;
3075 
3076   using MapType = SmallDenseMap<PHINode *, size_t, 32>;
  using iterator = PhiNodeSetIterator;
3078 
3079   /// Keeps the elements in the order of their insertion in the underlying
3080   /// vector. To achieve constant time removal, it never deletes any element.
3081   SmallVector<PHINode *, 32> NodeList;
3082 
3083   /// Keeps the elements in the underlying set implementation. This (and not the
3084   /// NodeList defined above) is the source of truth on whether an element
3085   /// is actually in the collection.
3086   MapType NodeMap;
3087 
3088   /// Points to the first valid (not deleted) element when the set is not empty
  /// and the value is not zero. Equals the size of the underlying vector
3090   /// when the set is empty. When the value is 0, as in the beginning, the
3091   /// first element may or may not be valid.
3092   size_t FirstValidElement = 0;
3093 
3094 public:
3095   /// Inserts a new element to the collection.
3096   /// \returns true if the element is actually added, i.e. was not in the
3097   /// collection before the operation.
3098   bool insert(PHINode *Ptr) {
3099     if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) {
3100       NodeList.push_back(Ptr);
3101       return true;
3102     }
3103     return false;
3104   }
3105 
3106   /// Removes the element from the collection.
3107   /// \returns whether the element is actually removed, i.e. was in the
3108   /// collection before the operation.
3109   bool erase(PHINode *Ptr) {
3110     auto it = NodeMap.find(Ptr);
3111     if (it != NodeMap.end()) {
3112       NodeMap.erase(Ptr);
3113       SkipRemovedElements(FirstValidElement);
3114       return true;
3115     }
3116     return false;
3117   }
3118 
3119   /// Removes all elements and clears the collection.
3120   void clear() {
3121     NodeMap.clear();
3122     NodeList.clear();
3123     FirstValidElement = 0;
3124   }
3125 
3126   /// \returns an iterator that will iterate the elements in the order of
3127   /// insertion.
3128   iterator begin() {
3129     if (FirstValidElement == 0)
3130       SkipRemovedElements(FirstValidElement);
3131     return PhiNodeSetIterator(this, FirstValidElement);
3132   }
3133 
3134   /// \returns an iterator that points to the end of the collection.
3135   iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }
3136 
3137   /// Returns the number of elements in the collection.
3138   size_t size() const {
3139     return NodeMap.size();
3140   }
3141 
  /// \returns 1 if the given element is in the collection, and 0 otherwise.
3143   size_t count(PHINode *Ptr) const {
3144     return NodeMap.count(Ptr);
3145   }
3146 
3147 private:
3148   /// Updates the CurrentIndex so that it will point to a valid element.
3149   ///
3150   /// If the element of NodeList at CurrentIndex is valid, it does not
3151   /// change it. If there are no more valid elements, it updates CurrentIndex
3152   /// to point to the end of the NodeList.
3153   void SkipRemovedElements(size_t &CurrentIndex) {
3154     while (CurrentIndex < NodeList.size()) {
3155       auto it = NodeMap.find(NodeList[CurrentIndex]);
3156       // If the element has been deleted and added again later, NodeMap will
3157       // point to a different index, so CurrentIndex will still be invalid.
3158       if (it != NodeMap.end() && it->second == CurrentIndex)
3159         break;
3160       ++CurrentIndex;
3161     }
3162   }
3163 };
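// Minimal usage sketch (hypothetical values, for exposition only): insertion
// order is preserved when iterating, and erase() does not invalidate the
// deterministic order of the remaining elements.
//
//   PhiNodeSet Set;
//   Set.insert(P1);
//   Set.insert(P2);
//   Set.erase(P1);
//   for (PHINode *P : Set) // visits only P2
//     P->dump();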
3164 
3165 PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
3166     : Set(Set), CurrentIndex(Start) {}
3167 
3168 PHINode * PhiNodeSetIterator::operator*() const {
3169   assert(CurrentIndex < Set->NodeList.size() &&
3170          "PhiNodeSet access out of range");
3171   return Set->NodeList[CurrentIndex];
3172 }
3173 
3174 PhiNodeSetIterator& PhiNodeSetIterator::operator++() {
3175   assert(CurrentIndex < Set->NodeList.size() &&
3176          "PhiNodeSet access out of range");
3177   ++CurrentIndex;
3178   Set->SkipRemovedElements(CurrentIndex);
3179   return *this;
3180 }
3181 
3182 bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
3183   return CurrentIndex == RHS.CurrentIndex;
3184 }
3185 
3186 bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
3187   return !((*this) == RHS);
3188 }
3189 
/// Keep track of simplification of Phi nodes.
/// Accepts the set of all phi nodes and erases a phi node from this set
/// if it is simplified.
3193 class SimplificationTracker {
3194   DenseMap<Value *, Value *> Storage;
3195   const SimplifyQuery &SQ;
3196   // Tracks newly created Phi nodes. The elements are iterated by insertion
3197   // order.
3198   PhiNodeSet AllPhiNodes;
3199   // Tracks newly created Select nodes.
3200   SmallPtrSet<SelectInst *, 32> AllSelectNodes;
3201 
3202 public:
3203   SimplificationTracker(const SimplifyQuery &sq)
3204       : SQ(sq) {}
3205 
3206   Value *Get(Value *V) {
3207     do {
3208       auto SV = Storage.find(V);
3209       if (SV == Storage.end())
3210         return V;
3211       V = SV->second;
3212     } while (true);
3213   }
3214 
3215   Value *Simplify(Value *Val) {
3216     SmallVector<Value *, 32> WorkList;
3217     SmallPtrSet<Value *, 32> Visited;
3218     WorkList.push_back(Val);
3219     while (!WorkList.empty()) {
3220       auto *P = WorkList.pop_back_val();
3221       if (!Visited.insert(P).second)
3222         continue;
3223       if (auto *PI = dyn_cast<Instruction>(P))
        if (Value *V = SimplifyInstruction(PI, SQ)) {
3225           for (auto *U : PI->users())
3226             WorkList.push_back(cast<Value>(U));
3227           Put(PI, V);
3228           PI->replaceAllUsesWith(V);
3229           if (auto *PHI = dyn_cast<PHINode>(PI))
3230             AllPhiNodes.erase(PHI);
3231           if (auto *Select = dyn_cast<SelectInst>(PI))
3232             AllSelectNodes.erase(Select);
3233           PI->eraseFromParent();
3234         }
3235     }
3236     return Get(Val);
3237   }
3238 
3239   void Put(Value *From, Value *To) {
3240     Storage.insert({ From, To });
3241   }
3242 
3243   void ReplacePhi(PHINode *From, PHINode *To) {
3244     Value* OldReplacement = Get(From);
3245     while (OldReplacement != From) {
3246       From = To;
3247       To = dyn_cast<PHINode>(OldReplacement);
3248       OldReplacement = Get(From);
3249     }
3250     assert(To && Get(To) == To && "Replacement PHI node is already replaced.");
3251     Put(From, To);
3252     From->replaceAllUsesWith(To);
3253     AllPhiNodes.erase(From);
3254     From->eraseFromParent();
3255   }
3256 
3257   PhiNodeSet& newPhiNodes() { return AllPhiNodes; }
3258 
3259   void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }
3260 
3261   void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }
3262 
3263   unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }
3264 
3265   unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }
3266 
3267   void destroyNewNodes(Type *CommonType) {
3268     // For safe erasing, replace the uses with dummy value first.
3269     auto *Dummy = UndefValue::get(CommonType);
3270     for (auto *I : AllPhiNodes) {
3271       I->replaceAllUsesWith(Dummy);
3272       I->eraseFromParent();
3273     }
3274     AllPhiNodes.clear();
3275     for (auto *I : AllSelectNodes) {
3276       I->replaceAllUsesWith(Dummy);
3277       I->eraseFromParent();
3278     }
3279     AllSelectNodes.clear();
3280   }
3281 };
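// Minimal usage sketch (hypothetical values, for exposition only): after a
// chain of Put() replacements, Get() follows the chain to the final value.
//
//   SimplificationTracker ST(SQ);
//   ST.Put(A, B);           // A was replaced by B
//   ST.Put(B, C);           // B was later replaced by C
//   Value *V = ST.Get(A);   // returns C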
3282 
3283 /// A helper class for combining addressing modes.
3284 class AddressingModeCombiner {
  using FoldAddrToValueMapping = DenseMap<Value *, Value *>;
  using PHIPair = std::pair<PHINode *, PHINode *>;
3287 
3288 private:
3289   /// The addressing modes we've collected.
3290   SmallVector<ExtAddrMode, 16> AddrModes;
3291 
3292   /// The field in which the AddrModes differ, when we have more than one.
3293   ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;
3294 
3295   /// Are the AddrModes that we have all just equal to their original values?
3296   bool AllAddrModesTrivial = true;
3297 
3298   /// Common Type for all different fields in addressing modes.
3299   Type *CommonType;
3300 
3301   /// SimplifyQuery for simplifyInstruction utility.
3302   const SimplifyQuery &SQ;
3303 
3304   /// Original Address.
3305   Value *Original;
3306 
3307 public:
3308   AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue)
3309       : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}
3310 
3311   /// Get the combined AddrMode
3312   const ExtAddrMode &getAddrMode() const {
3313     return AddrModes[0];
3314   }
3315 
3316   /// Add a new AddrMode if it's compatible with the AddrModes we already
3317   /// have.
3318   /// \return True iff we succeeded in doing so.
3319   bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
    // Take note of whether we have any non-trivial AddrModes, as we need to
    // detect when all AddrModes are trivial; in that case we would introduce
    // a phi or select which just duplicates what's already there.
3323     AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();
3324 
3325     // If this is the first addrmode then everything is fine.
3326     if (AddrModes.empty()) {
3327       AddrModes.emplace_back(NewAddrMode);
3328       return true;
3329     }
3330 
3331     // Figure out how different this is from the other address modes, which we
3332     // can do just by comparing against the first one given that we only care
3333     // about the cumulative difference.
3334     ExtAddrMode::FieldName ThisDifferentField =
3335       AddrModes[0].compare(NewAddrMode);
3336     if (DifferentField == ExtAddrMode::NoField)
3337       DifferentField = ThisDifferentField;
3338     else if (DifferentField != ThisDifferentField)
3339       DifferentField = ExtAddrMode::MultipleFields;
3340 
3341     // If NewAddrMode differs in more than one dimension we cannot handle it.
3342     bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;
3343 
3344     // If Scale Field is different then we reject.
3345     CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;
3346 
    // We also must reject the case when the base offset is different and the
    // scale reg is not null; we cannot handle this case because the merge of
    // the different offsets would be used as the ScaleReg.
3350     CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||
3351                               !NewAddrMode.ScaledReg);
3352 
    // We also must reject the case when the GV is different and a BaseReg is
    // installed, because we want to use the base reg as a merge of the GV
    // values.
3355     CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||
3356                               !NewAddrMode.HasBaseReg);
3357 
    // Even if NewAddrMode is the same, we still need to collect it because
    // the original value is different. And later we will need all original
    // values as anchors during the search for the common Phi node.
3361     if (CanHandle)
3362       AddrModes.emplace_back(NewAddrMode);
3363     else
3364       AddrModes.clear();
3365 
3366     return CanHandle;
3367   }
3368 
3369   /// Combine the addressing modes we've collected into a single
3370   /// addressing mode.
3371   /// \return True iff we successfully combined them or we only had one so
3372   /// didn't need to combine them anyway.
3373   bool combineAddrModes() {
3374     // If we have no AddrModes then they can't be combined.
3375     if (AddrModes.size() == 0)
3376       return false;
3377 
3378     // A single AddrMode can trivially be combined.
3379     if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
3380       return true;
3381 
3382     // If the AddrModes we collected are all just equal to the value they are
3383     // derived from then combining them wouldn't do anything useful.
3384     if (AllAddrModesTrivial)
3385       return false;
3386 
3387     if (!addrModeCombiningAllowed())
3388       return false;
3389 
    // Build a map from <original value, basic block where we saw it> to the
    // value of the base register.
3392     // Bail out if there is no common type.
3393     FoldAddrToValueMapping Map;
3394     if (!initializeMap(Map))
3395       return false;
3396 
3397     Value *CommonValue = findCommon(Map);
3398     if (CommonValue)
3399       AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);
3400     return CommonValue != nullptr;
3401   }
3402 
3403 private:
  /// Initialize Map with anchor values. For each address seen,
  /// we set the value of the differing field seen in this address.
  /// At the same time we find a common type for the differing fields, which
  /// we will use to create new Phi/Select nodes. Keep it in the CommonType
  /// field. Return false if there is no common type found.
3409   bool initializeMap(FoldAddrToValueMapping &Map) {
3410     // Keep track of keys where the value is null. We will need to replace it
3411     // with constant null when we know the common type.
3412     SmallVector<Value *, 2> NullValue;
3413     Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
3414     for (auto &AM : AddrModes) {
3415       Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
3416       if (DV) {
3417         auto *Type = DV->getType();
3418         if (CommonType && CommonType != Type)
3419           return false;
3420         CommonType = Type;
3421         Map[AM.OriginalValue] = DV;
3422       } else {
3423         NullValue.push_back(AM.OriginalValue);
3424       }
3425     }
3426     assert(CommonType && "At least one non-null value must be!");
3427     for (auto *V : NullValue)
3428       Map[V] = Constant::getNullValue(CommonType);
3429     return true;
3430   }
3431 
  /// We have a mapping between value A and another value B, where B was a
  /// field in the addressing mode represented by A. We also have an original
  /// value C representing the address we start with. Traversing from C
  /// through phis and selects, we end up with the A's in the map. This
  /// utility function tries to find a value V which is a field in addressing
  /// mode C such that, traversing through phi nodes and selects, we end up at
  /// the corresponding B values in the map. The utility will create new
  /// Phis/Selects if needed.
3439   // The simple example looks as follows:
3440   // BB1:
3441   //   p1 = b1 + 40
3442   //   br cond BB2, BB3
3443   // BB2:
3444   //   p2 = b2 + 40
3445   //   br BB3
3446   // BB3:
3447   //   p = phi [p1, BB1], [p2, BB2]
3448   //   v = load p
3449   // Map is
3450   //   p1 -> b1
3451   //   p2 -> b2
3452   // Request is
3453   //   p -> ?
3454   // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
3455   Value *findCommon(FoldAddrToValueMapping &Map) {
    // Tracks the simplification of newly created phi nodes. The reason we use
    // this mapping is that we will add newly created Phi nodes to the Map.
    // Simplification of Phi nodes is recursive, so some Phi node may
    // be simplified after we add it to the Map. In reality this
    // simplification is possible only if the original phis/selects were not
    // simplified yet.
    // Using this mapping we can find the current value in the Map.
3463     SimplificationTracker ST(SQ);
3464 
3465     // First step, DFS to create PHI nodes for all intermediate blocks.
3466     // Also fill traverse order for the second step.
3467     SmallVector<Value *, 32> TraverseOrder;
3468     InsertPlaceholders(Map, TraverseOrder, ST);
3469 
3470     // Second Step, fill new nodes by merged values and simplify if possible.
3471     FillPlaceholders(Map, TraverseOrder, ST);
3472 
3473     if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) {
3474       ST.destroyNewNodes(CommonType);
3475       return nullptr;
3476     }
3477 
    // Now we'd like to match new Phi nodes to existing ones.
3479     unsigned PhiNotMatchedCount = 0;
3480     if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
3481       ST.destroyNewNodes(CommonType);
3482       return nullptr;
3483     }
3484 
3485     auto *Result = ST.Get(Map.find(Original)->second);
3486     if (Result) {
3487       NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
3488       NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
3489     }
3490     return Result;
3491   }
3492 
3493   /// Try to match PHI node to Candidate.
3494   /// Matcher tracks the matched Phi nodes.
3495   bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
3496                     SmallSetVector<PHIPair, 8> &Matcher,
3497                     PhiNodeSet &PhiNodesToMatch) {
3498     SmallVector<PHIPair, 8> WorkList;
3499     Matcher.insert({ PHI, Candidate });
3500     SmallSet<PHINode *, 8> MatchedPHIs;
3501     MatchedPHIs.insert(PHI);
3502     WorkList.push_back({ PHI, Candidate });
3503     SmallSet<PHIPair, 8> Visited;
3504     while (!WorkList.empty()) {
3505       auto Item = WorkList.pop_back_val();
3506       if (!Visited.insert(Item).second)
3507         continue;
      // We iterate over all incoming values of the Phi to compare them.
      // If the values are different, both of them are Phis, the first one is
      // a Phi we added (subject to matching) and both of them are in the same
      // basic block, then we can match our pair if their values match. So we
      // state that these values match and add them to the work list to verify
      // that.
3513       for (auto B : Item.first->blocks()) {
3514         Value *FirstValue = Item.first->getIncomingValueForBlock(B);
3515         Value *SecondValue = Item.second->getIncomingValueForBlock(B);
3516         if (FirstValue == SecondValue)
3517           continue;
3518 
3519         PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
3520         PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);
3521 
        // If one of them is not a Phi, or
        // the first one is not a Phi node from the set we'd like to match, or
        // the Phi nodes are from different basic blocks, then
        // we will not be able to match.
3526         if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
3527             FirstPhi->getParent() != SecondPhi->getParent())
3528           return false;
3529 
3530         // If we already matched them then continue.
3531         if (Matcher.count({ FirstPhi, SecondPhi }))
3532           continue;
        // So the values are different and do not match. So we need them to
        // match. (But we register no more than one match per PHI node, so that
        // we won't later try to replace them twice.)
3536         if (MatchedPHIs.insert(FirstPhi).second)
3537           Matcher.insert({ FirstPhi, SecondPhi });
        // But we must check it.
3539         WorkList.push_back({ FirstPhi, SecondPhi });
3540       }
3541     }
3542     return true;
3543   }
3544 
3545   /// For the given set of PHI nodes (in the SimplificationTracker) try
3546   /// to find their equivalents.
3547   /// Returns false if this matching fails and creation of new Phi is disabled.
3548   bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
3549                    unsigned &PhiNotMatchedCount) {
3550     // Matched and PhiNodesToMatch iterate their elements in a deterministic
3551     // order, so the replacements (ReplacePhi) are also done in a deterministic
3552     // order.
3553     SmallSetVector<PHIPair, 8> Matched;
3554     SmallPtrSet<PHINode *, 8> WillNotMatch;
3555     PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
3556     while (PhiNodesToMatch.size()) {
3557       PHINode *PHI = *PhiNodesToMatch.begin();
3558 
      // Add this Phi to the set; if no Phi node in the basic block matches,
      // it will not match.
3560       WillNotMatch.clear();
3561       WillNotMatch.insert(PHI);
3562 
      // Traverse all Phis until we find an equivalent one or fail to do so.
3564       bool IsMatched = false;
3565       for (auto &P : PHI->getParent()->phis()) {
3566         if (&P == PHI)
3567           continue;
3568         if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
3569           break;
        // If it does not match, collect all Phi nodes from the matcher.
        // If we end up with no match, then all these Phi nodes will not match
        // later.
3573         for (auto M : Matched)
3574           WillNotMatch.insert(M.first);
3575         Matched.clear();
3576       }
3577       if (IsMatched) {
3578         // Replace all matched values and erase them.
3579         for (auto MV : Matched)
3580           ST.ReplacePhi(MV.first, MV.second);
3581         Matched.clear();
3582         continue;
3583       }
3584       // If we are not allowed to create new nodes then bail out.
3585       if (!AllowNewPhiNodes)
3586         return false;
3587       // Just remove all seen values in matcher. They will not match anything.
3588       PhiNotMatchedCount += WillNotMatch.size();
3589       for (auto *P : WillNotMatch)
3590         PhiNodesToMatch.erase(P);
3591     }
3592     return true;
3593   }

  /// Fill the placeholders with values from predecessors and simplify them.
3595   void FillPlaceholders(FoldAddrToValueMapping &Map,
3596                         SmallVectorImpl<Value *> &TraverseOrder,
3597                         SimplificationTracker &ST) {
3598     while (!TraverseOrder.empty()) {
3599       Value *Current = TraverseOrder.pop_back_val();
3600       assert(Map.find(Current) != Map.end() && "No node to fill!!!");
3601       Value *V = Map[Current];
3602 
3603       if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
        // Current must also be a Select.
3605         auto *CurrentSelect = cast<SelectInst>(Current);
3606         auto *TrueValue = CurrentSelect->getTrueValue();
3607         assert(Map.find(TrueValue) != Map.end() && "No True Value!");
3608         Select->setTrueValue(ST.Get(Map[TrueValue]));
3609         auto *FalseValue = CurrentSelect->getFalseValue();
3610         assert(Map.find(FalseValue) != Map.end() && "No False Value!");
3611         Select->setFalseValue(ST.Get(Map[FalseValue]));
3612       } else {
3613         // Must be a Phi node then.
3614         auto *PHI = cast<PHINode>(V);
3615         // Fill the Phi node with values from predecessors.
3616         for (auto *B : predecessors(PHI->getParent())) {
3617           Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);
3618           assert(Map.find(PV) != Map.end() && "No predecessor Value!");
3619           PHI->addIncoming(ST.Get(Map[PV]), B);
3620         }
3621       }
3622       Map[Current] = ST.Simplify(V);
3623     }
3624   }
3625 
  /// Starting from the original value, recursively iterates over the def-use
  /// chain up to known ending values represented in the map. For each
  /// traversed phi/select, inserts a placeholder Phi or Select.
  /// Reports all newly created Phi/Select nodes by adding them to the set.
  /// Also records the order in which the values have been traversed.
3631   void InsertPlaceholders(FoldAddrToValueMapping &Map,
3632                           SmallVectorImpl<Value *> &TraverseOrder,
3633                           SimplificationTracker &ST) {
3634     SmallVector<Value *, 32> Worklist;
3635     assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
3636            "Address must be a Phi or Select node");
3637     auto *Dummy = UndefValue::get(CommonType);
3638     Worklist.push_back(Original);
3639     while (!Worklist.empty()) {
3640       Value *Current = Worklist.pop_back_val();
      // If it is already visited or is an ending value, then skip it.
3642       if (Map.find(Current) != Map.end())
3643         continue;
3644       TraverseOrder.push_back(Current);
3645 
      // Current must be a Phi node or a Select. All others must be covered
      // by anchors.
3648       if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
3649         // Is it OK to get metadata from OrigSelect?!
3650         // Create a Select placeholder with dummy value.
3651         SelectInst *Select = SelectInst::Create(
3652             CurrentSelect->getCondition(), Dummy, Dummy,
3653             CurrentSelect->getName(), CurrentSelect, CurrentSelect);
3654         Map[Current] = Select;
3655         ST.insertNewSelect(Select);
3656         // We are interested in True and False values.
3657         Worklist.push_back(CurrentSelect->getTrueValue());
3658         Worklist.push_back(CurrentSelect->getFalseValue());
3659       } else {
3660         // It must be a Phi node then.
3661         PHINode *CurrentPhi = cast<PHINode>(Current);
3662         unsigned PredCount = CurrentPhi->getNumIncomingValues();
3663         PHINode *PHI =
3664             PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi);
3665         Map[Current] = PHI;
3666         ST.insertNewPhi(PHI);
3667         for (Value *P : CurrentPhi->incoming_values())
3668           Worklist.push_back(P);
3669       }
3670     }
3671   }
3672 
3673   bool addrModeCombiningAllowed() {
3674     if (DisableComplexAddrModes)
3675       return false;
3676     switch (DifferentField) {
3677     default:
3678       return false;
3679     case ExtAddrMode::BaseRegField:
3680       return AddrSinkCombineBaseReg;
3681     case ExtAddrMode::BaseGVField:
3682       return AddrSinkCombineBaseGV;
3683     case ExtAddrMode::BaseOffsField:
3684       return AddrSinkCombineBaseOffs;
3685     case ExtAddrMode::ScaledRegField:
3686       return AddrSinkCombineScaledReg;
3687     }
3688   }
3689 };
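// Minimal usage sketch (hypothetical values, for exposition only): collect
// candidate modes, then try to merge them into a single mode.
//
//   AddressingModeCombiner AddrModes(SQ, Addr);
//   for (ExtAddrMode &AM : CandidateModes)
//     if (!AddrModes.addNewAddrMode(AM))
//       break;                         // incompatible, stop collecting
//   if (AddrModes.combineAddrModes())
//     ExtAddrMode AddrMode = AddrModes.getAddrMode();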
3690 } // end anonymous namespace
3691 
3692 /// Try adding ScaleReg*Scale to the current addressing mode.
3693 /// Return true and update AddrMode if this addr mode is legal for the target,
3694 /// false if not.
3695 bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
3696                                              unsigned Depth) {
3697   // If Scale is 1, then this is the same as adding ScaleReg to the addressing
3698   // mode.  Just process that directly.
3699   if (Scale == 1)
3700     return matchAddr(ScaleReg, Depth);
3701 
3702   // If the scale is 0, it takes nothing to add this.
3703   if (Scale == 0)
3704     return true;
3705 
3706   // If we already have a scale of this value, we can add to it, otherwise, we
3707   // need an available scale field.
3708   if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
3709     return false;
3710 
3711   ExtAddrMode TestAddrMode = AddrMode;
3712 
3713   // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
3714   // [A+B + A*7] -> [B+A*8].
3715   TestAddrMode.Scale += Scale;
3716   TestAddrMode.ScaledReg = ScaleReg;
3717 
3718   // If the new address isn't legal, bail out.
3719   if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
3720     return false;
3721 
3722   // It was legal, so commit it.
3723   AddrMode = TestAddrMode;
3724 
3725   // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
3726   // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
3727   // X*Scale + C*Scale to addr mode.
3728   ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
3729   if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
3730       match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
3731       CI->getValue().isSignedIntN(64)) {
3732     TestAddrMode.InBounds = false;
3733     TestAddrMode.ScaledReg = AddLHS;
3734     TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
3735 
3736     // If this addressing mode is legal, commit it and remember that we folded
3737     // this instruction.
3738     if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
3739       AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
3740       AddrMode = TestAddrMode;
3741       return true;
3742     }
3743   }
3744 
3745   // Otherwise, not (x+c)*scale, just return what we have.
3746   return true;
3747 }
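// Worked example for the (X + C) * Scale fold above (assumed values, for
// exposition only): matching ScaleReg = (%x + 4) with Scale = 8 rewrites the
// tentative mode to ScaledReg = %x, Scale = 8, and BaseOffs increased by 32,
// provided the resulting addressing mode is still legal for the target.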
3748 
3749 /// This is a little filter, which returns true if an addressing computation
3750 /// involving I might be folded into a load/store accessing it.
3751 /// This doesn't need to be perfect, but needs to accept at least
3752 /// the set of instructions that MatchOperationAddr can.
3753 static bool MightBeFoldableInst(Instruction *I) {
3754   switch (I->getOpcode()) {
3755   case Instruction::BitCast:
3756   case Instruction::AddrSpaceCast:
3757     // Don't touch identity bitcasts.
3758     if (I->getType() == I->getOperand(0)->getType())
3759       return false;
3760     return I->getType()->isIntOrPtrTy();
3761   case Instruction::PtrToInt:
3762     // PtrToInt is always a noop, as we know that the int type is pointer sized.
3763     return true;
3764   case Instruction::IntToPtr:
3765     // We know the input is intptr_t, so this is foldable.
3766     return true;
3767   case Instruction::Add:
3768     return true;
3769   case Instruction::Mul:
3770   case Instruction::Shl:
3771     // Can only handle X*C and X << C.
3772     return isa<ConstantInt>(I->getOperand(1));
3773   case Instruction::GetElementPtr:
3774     return true;
3775   default:
3776     return false;
3777   }
3778 }
3779 
3780 /// Check whether or not \p Val is a legal instruction for \p TLI.
3781 /// \note \p Val is assumed to be the product of some type promotion.
3782 /// Therefore if \p Val has an undefined state in \p TLI, this is assumed
3783 /// to be legal, as the non-promoted value would have had the same state.
3784 static bool isPromotedInstructionLegal(const TargetLowering &TLI,
3785                                        const DataLayout &DL, Value *Val) {
3786   Instruction *PromotedInst = dyn_cast<Instruction>(Val);
3787   if (!PromotedInst)
3788     return false;
3789   int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
3790   // If the ISDOpcode is undefined, it was undefined before the promotion.
3791   if (!ISDOpcode)
3792     return true;
3793   // Otherwise, check if the promoted instruction is legal or not.
3794   return TLI.isOperationLegalOrCustom(
3795       ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
3796 }
3797 
3798 namespace {
3799 
/// Helper class to perform type promotion.
3801 class TypePromotionHelper {
3802   /// Utility function to add a promoted instruction \p ExtOpnd to
3803   /// \p PromotedInsts and record the type of extension we have seen.
3804   static void addPromotedInst(InstrToOrigTy &PromotedInsts,
3805                               Instruction *ExtOpnd,
3806                               bool IsSExt) {
3807     ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
3808     InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd);
3809     if (It != PromotedInsts.end()) {
      // If the new extension is the same as the original, the information in
      // PromotedInsts[ExtOpnd] is still correct.
3812       if (It->second.getInt() == ExtTy)
3813         return;
3814 
      // Now that the new extension is different from the old extension, we
      // make the type information invalid by setting the extension type to
      // BothExtension.
3818       ExtTy = BothExtension;
3819     }
3820     PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy);
3821   }
3822 
3823   /// Utility function to query the original type of instruction \p Opnd
3824   /// with a matched extension type. If the extension doesn't match, we
3825   /// cannot use the information we had on the original type.
3826   /// BothExtension doesn't match any extension type.
3827   static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,
3828                                  Instruction *Opnd,
3829                                  bool IsSExt) {
3830     ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
3831     InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
3832     if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)
3833       return It->second.getPointer();
3834     return nullptr;
3835   }
3836 
3837   /// Utility function to check whether or not a sign or zero extension
3838   /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
3839   /// either using the operands of \p Inst or promoting \p Inst.
3840   /// The type of the extension is defined by \p IsSExt.
3841   /// In other words, check if:
3842   /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
3843   /// #1 Promotion applies:
3844   /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
3845   /// #2 Operand reuses:
3846   /// ext opnd1 to ConsideredExtType.
3847   /// \p PromotedInsts maps the instructions to their type before promotion.
3848   static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
3849                             const InstrToOrigTy &PromotedInsts, bool IsSExt);
3850 
3851   /// Utility function to determine if \p OpIdx should be promoted when
3852   /// promoting \p Inst.
3853   static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
3854     return !(isa<SelectInst>(Inst) && OpIdx == 0);
3855   }
3856 
3857   /// Utility function to promote the operand of \p Ext when this
3858   /// operand is a promotable trunc or sext or zext.
3859   /// \p PromotedInsts maps the instructions to their type before promotion.
3860   /// \p CreatedInstsCost[out] contains the cost of all instructions
3861   /// created to promote the operand of Ext.
3862   /// Newly added extensions are inserted in \p Exts.
3863   /// Newly added truncates are inserted in \p Truncs.
3864   /// Should never be called directly.
3865   /// \return The promoted value which is used instead of Ext.
3866   static Value *promoteOperandForTruncAndAnyExt(
3867       Instruction *Ext, TypePromotionTransaction &TPT,
3868       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3869       SmallVectorImpl<Instruction *> *Exts,
3870       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
3871 
3872   /// Utility function to promote the operand of \p Ext when this
3873   /// operand is promotable and is not a supported trunc or sext.
3874   /// \p PromotedInsts maps the instructions to their type before promotion.
3875   /// \p CreatedInstsCost[out] contains the cost of all the instructions
3876   /// created to promote the operand of Ext.
3877   /// Newly added extensions are inserted in \p Exts.
3878   /// Newly added truncates are inserted in \p Truncs.
3879   /// Should never be called directly.
3880   /// \return The promoted value which is used instead of Ext.
3881   static Value *promoteOperandForOther(Instruction *Ext,
3882                                        TypePromotionTransaction &TPT,
3883                                        InstrToOrigTy &PromotedInsts,
3884                                        unsigned &CreatedInstsCost,
3885                                        SmallVectorImpl<Instruction *> *Exts,
3886                                        SmallVectorImpl<Instruction *> *Truncs,
3887                                        const TargetLowering &TLI, bool IsSExt);
3888 
3889   /// \see promoteOperandForOther.
3890   static Value *signExtendOperandForOther(
3891       Instruction *Ext, TypePromotionTransaction &TPT,
3892       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3893       SmallVectorImpl<Instruction *> *Exts,
3894       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3895     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3896                                   Exts, Truncs, TLI, true);
3897   }
3898 
3899   /// \see promoteOperandForOther.
3900   static Value *zeroExtendOperandForOther(
3901       Instruction *Ext, TypePromotionTransaction &TPT,
3902       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3903       SmallVectorImpl<Instruction *> *Exts,
3904       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3905     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3906                                   Exts, Truncs, TLI, false);
3907   }
3908 
3909 public:
3910   /// Type for the utility function that promotes the operand of Ext.
3911   using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
3912                             InstrToOrigTy &PromotedInsts,
3913                             unsigned &CreatedInstsCost,
3914                             SmallVectorImpl<Instruction *> *Exts,
3915                             SmallVectorImpl<Instruction *> *Truncs,
3916                             const TargetLowering &TLI);
3917 
3918   /// Given a sign/zero extend instruction \p Ext, return the appropriate
3919   /// action to promote the operand of \p Ext instead of using Ext.
3920   /// \return NULL if no promotable action is possible with the current
3921   /// sign extension.
3922   /// \p InsertedInsts keeps track of all the instructions inserted by the
3923   /// other CodeGenPrepare optimizations. This information is important
3924   /// because we do not want to promote these instructions as CodeGenPrepare
3925   /// will reinsert them later. Thus creating an infinite loop: create/remove.
3926   /// \p PromotedInsts maps the instructions to their type before promotion.
3927   static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
3928                           const TargetLowering &TLI,
3929                           const InstrToOrigTy &PromotedInsts);
3930 };
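// Minimal usage sketch (hypothetical caller, for exposition only): the
// returned Action, if any, is invoked to build the promoted value.
//
//   unsigned Cost = 0;
//   SmallVector<Instruction *, 4> NewExts, NewTruncs;
//   if (TypePromotionHelper::Action Promote =
//           TypePromotionHelper::getAction(Ext, InsertedInsts, TLI,
//                                          PromotedInsts))
//     Value *Promoted =
//         Promote(Ext, TPT, PromotedInsts, Cost, &NewExts, &NewTruncs, TLI);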
3931 
3932 } // end anonymous namespace
3933 
3934 bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
3935                                         Type *ConsideredExtType,
3936                                         const InstrToOrigTy &PromotedInsts,
3937                                         bool IsSExt) {
3938   // The promotion helper does not know how to deal with vector types yet.
3939   // To be able to fix that, we would need to fix the places where we
3940   // statically extend, e.g., constants and such.
3941   if (Inst->getType()->isVectorTy())
3942     return false;
3943 
3944   // We can always get through zext.
3945   if (isa<ZExtInst>(Inst))
3946     return true;
3947 
3948   // sext(sext) is ok too.
3949   if (IsSExt && isa<SExtInst>(Inst))
3950     return true;
3951 
  // We can get through a binary operator if it is legal. In other words, the
  // binary operator must have a nuw or nsw flag.
3954   const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
3955   if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) &&
3956       ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
3957        (IsSExt && BinOp->hasNoSignedWrap())))
3958     return true;
3959 
3960   // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
3961   if ((Inst->getOpcode() == Instruction::And ||
3962        Inst->getOpcode() == Instruction::Or))
3963     return true;
3964 
3965   // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
3966   if (Inst->getOpcode() == Instruction::Xor) {
3967     const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
3968     // Make sure it is not a NOT.
3969     if (Cst && !Cst->getValue().isAllOnesValue())
3970       return true;
3971   }
3972 
  // zext(lshr(opnd, cst)) --> lshr(zext(opnd), zext(cst))
  // It may change a poisoned value into a regular value, like
  //     zext i32 (lshr i8 %val, 12)  -->  lshr i32 (zext i8 %val), 12
  //          poisoned value                    regular value
  // This should be OK, since a poisoned value may be refined to any valid
  // value.
3978   if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
3979     return true;
3980 
3981   // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
3982   // It may change a poisoned value into a regular value, like
3983   //     zext i32 (shl i8 %val, 12)  -->  shl i32 (zext i8 %val), 12
3984   //          poisoned value                    regular value
  // This should be OK, since a poisoned value may be refined to any valid
  // value.
3986   if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
3987     const auto *ExtInst = cast<const Instruction>(*Inst->user_begin());
3988     if (ExtInst->hasOneUse()) {
3989       const auto *AndInst = dyn_cast<const Instruction>(*ExtInst->user_begin());
3990       if (AndInst && AndInst->getOpcode() == Instruction::And) {
3991         const auto *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
3992         if (Cst &&
3993             Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
3994           return true;
3995       }
3996     }
3997   }
3998 
3999   // Check if we can do the following simplification.
4000   // ext(trunc(opnd)) --> ext(opnd)
4001   if (!isa<TruncInst>(Inst))
4002     return false;
4003 
4004   Value *OpndVal = Inst->getOperand(0);
4005   // Check if we can use this operand in the extension.
4006   // If the type is larger than the result type of the extension, we cannot.
4007   if (!OpndVal->getType()->isIntegerTy() ||
4008       OpndVal->getType()->getIntegerBitWidth() >
4009           ConsideredExtType->getIntegerBitWidth())
4010     return false;
4011 
  // If the operand of the truncate is not an instruction, we will not have
  // any information on the dropped bits.
  // (Actually we could for constants, but it is not worth the extra logic.)
4015   Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
4016   if (!Opnd)
4017     return false;
4018 
  // Check if the source of the truncate is narrow enough.
  // I.e., check that the trunc just drops extended bits of the same kind as
  // the extension.
4022   // #1 get the type of the operand and check the kind of the extended bits.
4023   const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);
  if (!OpndType) {
    if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
      OpndType = Opnd->getOperand(0)->getType();
    else
      return false;
  }
4030 
4031   // #2 check that the truncate just drops extended bits.
4032   return Inst->getType()->getIntegerBitWidth() >=
4033          OpndType->getIntegerBitWidth();
4034 }
4035 
4036 TypePromotionHelper::Action TypePromotionHelper::getAction(
4037     Instruction *Ext, const SetOfInstrs &InsertedInsts,
4038     const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
4039   assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4040          "Unexpected instruction type");
4041   Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
4042   Type *ExtTy = Ext->getType();
4043   bool IsSExt = isa<SExtInst>(Ext);
4044   // If the operand of the extension is not an instruction, we cannot
4045   // get through.
  // If it is, check that we can get through it.
4047   if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
4048     return nullptr;
4049 
4050   // Do not promote if the operand has been added by codegenprepare.
4051   // Otherwise, it means we are undoing an optimization that is likely to be
4052   // redone, thus causing potential infinite loop.
4053   if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
4054     return nullptr;
4055 
  // SExt, ZExt or Trunc instructions.
4057   // Return the related handler.
4058   if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
4059       isa<ZExtInst>(ExtOpnd))
4060     return promoteOperandForTruncAndAnyExt;
4061 
4062   // Regular instruction.
4063   // Abort early if we will have to insert non-free instructions.
4064   if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
4065     return nullptr;
4066   return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
4067 }
4068 
4069 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
4070     Instruction *SExt, TypePromotionTransaction &TPT,
4071     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4072     SmallVectorImpl<Instruction *> *Exts,
4073     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
4074   // By construction, the operand of SExt is an instruction. Otherwise we cannot
4075   // get through it and this method should not be called.
4076   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
4077   Value *ExtVal = SExt;
4078   bool HasMergedNonFreeExt = false;
4079   if (isa<ZExtInst>(SExtOpnd)) {
4080     // Replace s|zext(zext(opnd))
4081     // => zext(opnd).
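    // For illustration:
    //     sext i64 (zext i8 %v to i32)  -->  zext i8 %v to i64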
4082     HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
4083     Value *ZExt =
4084         TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
4085     TPT.replaceAllUsesWith(SExt, ZExt);
4086     TPT.eraseInstruction(SExt);
4087     ExtVal = ZExt;
4088   } else {
4089     // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
4090     // => z|sext(opnd).
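    // For illustration (the sext(sext) case):
    //     sext i64 (sext i16 %v to i32)  -->  sext i16 %v to i64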
4091     TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
4092   }
4093   CreatedInstsCost = 0;
4094 
4095   // Remove dead code.
4096   if (SExtOpnd->use_empty())
4097     TPT.eraseInstruction(SExtOpnd);
4098 
4099   // Check if the extension is still needed.
4100   Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
4101   if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
4102     if (ExtInst) {
4103       if (Exts)
4104         Exts->push_back(ExtInst);
4105       CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
4106     }
4107     return ExtVal;
4108   }
4109 
4110   // At this point we have: ext ty opnd to ty.
4111   // Reassign the uses of ExtInst to the opnd and remove ExtInst.
4112   Value *NextVal = ExtInst->getOperand(0);
4113   TPT.eraseInstruction(ExtInst, NextVal);
4114   return NextVal;
4115 }
4116 
4117 Value *TypePromotionHelper::promoteOperandForOther(
4118     Instruction *Ext, TypePromotionTransaction &TPT,
4119     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4120     SmallVectorImpl<Instruction *> *Exts,
4121     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
4122     bool IsSExt) {
4123   // By construction, the operand of Ext is an instruction. Otherwise we cannot
4124   // get through it and this method should not be called.
4125   Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
4126   CreatedInstsCost = 0;
4127   if (!ExtOpnd->hasOneUse()) {
4128     // ExtOpnd will be promoted.
    // All its uses, except Ext, will need to use a truncated value of the
4130     // promoted version.
4131     // Create the truncate now.
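    // For illustration, a hypothetical extra user "use(ExtOpnd)" will end up
    // using "use(trunc <promoted ExtOpnd> to <original type>)" instead.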
4132     Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
4133     if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
4134       // Insert it just after the definition.
4135       ITrunc->moveAfter(ExtOpnd);
4136       if (Truncs)
4137         Truncs->push_back(ITrunc);
4138     }
4139 
4140     TPT.replaceAllUsesWith(ExtOpnd, Trunc);
4141     // Restore the operand of Ext (which has been replaced by the previous call
4142     // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
4143     TPT.setOperand(Ext, 0, ExtOpnd);
4144   }
4145 
4146   // Get through the Instruction:
4147   // 1. Update its type.
  // 2. Replace the uses of Ext by ExtOpnd.
4149   // 3. Extend each operand that needs to be extended.
4150 
4151   // Remember the original type of the instruction before promotion.
  // This is useful to know whether the high bits are sign- or zero-extended.
4153   addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);
4154   // Step #1.
4155   TPT.mutateType(ExtOpnd, Ext->getType());
4156   // Step #2.
4157   TPT.replaceAllUsesWith(Ext, ExtOpnd);
4158   // Step #3.
4159   Instruction *ExtForOpnd = Ext;
4160 
4161   LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
4162   for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
4163        ++OpIdx) {
4164     LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
4165     if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
4166         !shouldExtOperand(ExtOpnd, OpIdx)) {
4167       LLVM_DEBUG(dbgs() << "No need to propagate\n");
4168       continue;
4169     }
4170     // Check if we can statically extend the operand.
4171     Value *Opnd = ExtOpnd->getOperand(OpIdx);
4172     if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
4173       LLVM_DEBUG(dbgs() << "Statically extend\n");
4174       unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
4175       APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
4176                             : Cst->getValue().zext(BitWidth);
4177       TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
4178       continue;
4179     }
    // UndefValues are typed, so we have to statically extend them.
4181     if (isa<UndefValue>(Opnd)) {
4182       LLVM_DEBUG(dbgs() << "Statically extend\n");
4183       TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
4184       continue;
4185     }
4186 
    // Otherwise we have to explicitly extend the operand.
4188     // Check if Ext was reused to extend an operand.
4189     if (!ExtForOpnd) {
4190       // If yes, create a new one.
4191       LLVM_DEBUG(dbgs() << "More operands to ext\n");
      Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
                                    : TPT.createZExt(Ext, Opnd, Ext->getType());
4194       if (!isa<Instruction>(ValForExtOpnd)) {
4195         TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
4196         continue;
4197       }
4198       ExtForOpnd = cast<Instruction>(ValForExtOpnd);
4199     }
4200     if (Exts)
4201       Exts->push_back(ExtForOpnd);
4202     TPT.setOperand(ExtForOpnd, 0, Opnd);
4203 
    // Move the extension before the insertion point.
4205     TPT.moveBefore(ExtForOpnd, ExtOpnd);
4206     TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
4207     CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
    // If more exts are required, new instructions will have to be created.
4209     ExtForOpnd = nullptr;
4210   }
4211   if (ExtForOpnd == Ext) {
4212     LLVM_DEBUG(dbgs() << "Extension is useless now\n");
4213     TPT.eraseInstruction(Ext);
4214   }
4215   return ExtOpnd;
4216 }
4217 
4218 /// Check whether or not promoting an instruction to a wider type is profitable.
4219 /// \p NewCost gives the cost of extension instructions created by the
4220 /// promotion.
4221 /// \p OldCost gives the cost of extension instructions before the promotion
/// plus the number of instructions that have been matched in the addressing
/// mode thanks to the promotion.
4224 /// \p PromotedOperand is the value that has been promoted.
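/// For example (illustrative numbers): a promotion that creates one non-free
/// extension (NewCost == 1) while removing one extension and folding one
/// extra instruction into the addressing mode (OldCost == 2) pays off.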
4225 /// \return True if the promotion is profitable, false otherwise.
4226 bool AddressingModeMatcher::isPromotionProfitable(
4227     unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
4228   LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
4229                     << '\n');
4230   // The cost of the new extensions is greater than the cost of the
4231   // old extension plus what we folded.
4232   // This is not profitable.
4233   if (NewCost > OldCost)
4234     return false;
4235   if (NewCost < OldCost)
4236     return true;
4237   // The promotion is neutral but it may help folding the sign extension in
4238   // loads for instance.
4239   // Check that we did not create an illegal instruction.
4240   return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
4241 }
4242 
4243 /// Given an instruction or constant expr, see if we can fold the operation
4244 /// into the addressing mode. If so, update the addressing mode and return
4245 /// true, otherwise return false without modifying AddrMode.
4246 /// If \p MovedAway is not NULL, it contains the information of whether or
4247 /// not AddrInst has to be folded into the addressing mode on success.
/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
/// mode because it has been moved away.
/// Thus AddrInst must not be added to the matched instructions.
4251 /// This state can happen when AddrInst is a sext, since it may be moved away.
4252 /// Therefore, AddrInst may not be valid when MovedAway is true and it must
4253 /// not be referenced anymore.
4254 bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
4255                                                unsigned Depth,
4256                                                bool *MovedAway) {
4257   // Avoid exponential behavior on extremely deep expression trees.
4258   if (Depth >= 5) return false;
4259 
4260   // By default, all matched instructions stay in place.
4261   if (MovedAway)
4262     *MovedAway = false;
4263 
4264   switch (Opcode) {
4265   case Instruction::PtrToInt:
4266     // PtrToInt is always a noop, as we know that the int type is pointer sized.
4267     return matchAddr(AddrInst->getOperand(0), Depth);
4268   case Instruction::IntToPtr: {
4269     auto AS = AddrInst->getType()->getPointerAddressSpace();
4270     auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
4271     // This inttoptr is a no-op if the integer type is pointer sized.
4272     if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
4273       return matchAddr(AddrInst->getOperand(0), Depth);
4274     return false;
4275   }
4276   case Instruction::BitCast:
4277     // BitCast is always a noop, and we can handle it as long as it is
4278     // int->int or pointer->pointer (we don't want int<->fp or something).
4279     if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&
4280         // Don't touch identity bitcasts.  These were probably put here by LSR,
4281         // and we don't want to mess around with them.  Assume it knows what it
4282         // is doing.
4283         AddrInst->getOperand(0)->getType() != AddrInst->getType())
4284       return matchAddr(AddrInst->getOperand(0), Depth);
4285     return false;
4286   case Instruction::AddrSpaceCast: {
4287     unsigned SrcAS
4288       = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
4289     unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
4290     if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
4291       return matchAddr(AddrInst->getOperand(0), Depth);
4292     return false;
4293   }
4294   case Instruction::Add: {
4295     // Check to see if we can merge in the RHS then the LHS.  If so, we win.
4296     ExtAddrMode BackupAddrMode = AddrMode;
4297     unsigned OldSize = AddrModeInsts.size();
4298     // Start a transaction at this point.
4299     // The LHS may match but not the RHS.
4300     // Therefore, we need a higher level restoration point to undo partially
4301     // matched operation.
4302     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4303         TPT.getRestorationPoint();
4304 
4305     AddrMode.InBounds = false;
4306     if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
4307         matchAddr(AddrInst->getOperand(0), Depth+1))
4308       return true;
4309 
4310     // Restore the old addr mode info.
4311     AddrMode = BackupAddrMode;
4312     AddrModeInsts.resize(OldSize);
4313     TPT.rollback(LastKnownGood);
4314 
4315     // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
4316     if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
4317         matchAddr(AddrInst->getOperand(1), Depth+1))
4318       return true;
4319 
4320     // Otherwise we definitely can't merge the ADD in.
4321     AddrMode = BackupAddrMode;
4322     AddrModeInsts.resize(OldSize);
4323     TPT.rollback(LastKnownGood);
4324     break;
4325   }
4326   //case Instruction::Or:
4327   // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
4328   //break;
4329   case Instruction::Mul:
4330   case Instruction::Shl: {
4331     // Can only handle X*C and X << C.
4332     AddrMode.InBounds = false;
4333     ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
4334     if (!RHS || RHS->getBitWidth() > 64)
4335       return false;
4336     int64_t Scale = RHS->getSExtValue();
4337     if (Opcode == Instruction::Shl)
4338       Scale = 1LL << Scale;
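    // E.g., a shift amount of 3 corresponds to a scale of 8 (x << 3 == x * 8).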
4339 
4340     return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
4341   }
4342   case Instruction::GetElementPtr: {
    // Scan the GEP.  We check if it contains constant offsets and at most
4344     // one variable offset.
4345     int VariableOperand = -1;
4346     unsigned VariableScale = 0;
4347 
4348     int64_t ConstantOffset = 0;
4349     gep_type_iterator GTI = gep_type_begin(AddrInst);
4350     for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
4351       if (StructType *STy = GTI.getStructTypeOrNull()) {
4352         const StructLayout *SL = DL.getStructLayout(STy);
4353         unsigned Idx =
4354           cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
4355         ConstantOffset += SL->getElementOffset(Idx);
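        // For illustration: for a (hypothetical) "gep %struct.S, %p, 0, 2",
        // this adds the byte offset of field #2 of %struct.S.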
4356       } else {
4357         uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
4358         if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
4359           const APInt &CVal = CI->getValue();
4360           if (CVal.getMinSignedBits() <= 64) {
4361             ConstantOffset += CVal.getSExtValue() * TypeSize;
4362             continue;
4363           }
4364         }
4365         if (TypeSize) {  // Scales of zero don't do anything.
4366           // We only allow one variable index at the moment.
4367           if (VariableOperand != -1)
4368             return false;
4369 
4370           // Remember the variable index.
4371           VariableOperand = i;
4372           VariableScale = TypeSize;
4373         }
4374       }
4375     }
4376 
4377     // A common case is for the GEP to only do a constant offset.  In this case,
4378     // just add it to the disp field and check validity.
4379     if (VariableOperand == -1) {
4380       AddrMode.BaseOffs += ConstantOffset;
4381       if (ConstantOffset == 0 ||
4382           TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
4383         // Check to see if we can fold the base pointer in too.
4384         if (matchAddr(AddrInst->getOperand(0), Depth+1)) {
4385           if (!cast<GEPOperator>(AddrInst)->isInBounds())
4386             AddrMode.InBounds = false;
4387           return true;
4388         }
4389       } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
4390                  TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
4391                  ConstantOffset > 0) {
4392         // Record GEPs with non-zero offsets as candidates for splitting in the
4393         // event that the offset cannot fit into the r+i addressing mode.
        // This handles the simple and common case where only one GEP is used
        // in calculating the address for the memory access.
4396         Value *Base = AddrInst->getOperand(0);
4397         auto *BaseI = dyn_cast<Instruction>(Base);
4398         auto *GEP = cast<GetElementPtrInst>(AddrInst);
4399         if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
4400             (BaseI && !isa<CastInst>(BaseI) &&
4401              !isa<GetElementPtrInst>(BaseI))) {
4402           // Make sure the parent block allows inserting non-PHI instructions
4403           // before the terminator.
4404           BasicBlock *Parent =
4405               BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
4406           if (!Parent->getTerminator()->isEHPad())
4407             LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
4408         }
4409       }
4410       AddrMode.BaseOffs -= ConstantOffset;
4411       return false;
4412     }
4413 
4414     // Save the valid addressing mode in case we can't match.
4415     ExtAddrMode BackupAddrMode = AddrMode;
4416     unsigned OldSize = AddrModeInsts.size();
4417 
4418     // See if the scale and offset amount is valid for this target.
4419     AddrMode.BaseOffs += ConstantOffset;
4420     if (!cast<GEPOperator>(AddrInst)->isInBounds())
4421       AddrMode.InBounds = false;
4422 
4423     // Match the base operand of the GEP.
4424     if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
4425       // If it couldn't be matched, just stuff the value in a register.
4426       if (AddrMode.HasBaseReg) {
4427         AddrMode = BackupAddrMode;
4428         AddrModeInsts.resize(OldSize);
4429         return false;
4430       }
4431       AddrMode.HasBaseReg = true;
4432       AddrMode.BaseReg = AddrInst->getOperand(0);
4433     }
4434 
4435     // Match the remaining variable portion of the GEP.
4436     if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
4437                           Depth)) {
4438       // If it couldn't be matched, try stuffing the base into a register
4439       // instead of matching it, and retrying the match of the scale.
4440       AddrMode = BackupAddrMode;
4441       AddrModeInsts.resize(OldSize);
4442       if (AddrMode.HasBaseReg)
4443         return false;
4444       AddrMode.HasBaseReg = true;
4445       AddrMode.BaseReg = AddrInst->getOperand(0);
4446       AddrMode.BaseOffs += ConstantOffset;
4447       if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
4448                             VariableScale, Depth)) {
4449         // If even that didn't work, bail.
4450         AddrMode = BackupAddrMode;
4451         AddrModeInsts.resize(OldSize);
4452         return false;
4453       }
4454     }
4455 
4456     return true;
4457   }
4458   case Instruction::SExt:
4459   case Instruction::ZExt: {
4460     Instruction *Ext = dyn_cast<Instruction>(AddrInst);
4461     if (!Ext)
4462       return false;
4463 
4464     // Try to move this ext out of the way of the addressing mode.
4465     // Ask for a method for doing so.
4466     TypePromotionHelper::Action TPH =
4467         TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
4468     if (!TPH)
4469       return false;
4470 
4471     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4472         TPT.getRestorationPoint();
4473     unsigned CreatedInstsCost = 0;
4474     unsigned ExtCost = !TLI.isExtFree(Ext);
4475     Value *PromotedOperand =
4476         TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
4477     // SExt has been moved away.
4478     // Thus either it will be rematched later in the recursive calls or it is
4479     // gone. Anyway, we must not fold it into the addressing mode at this point.
4480     // E.g.,
4481     // op = add opnd, 1
4482     // idx = ext op
4483     // addr = gep base, idx
4484     // is now:
4485     // promotedOpnd = ext opnd            <- no match here
4486     // op = promoted_add promotedOpnd, 1  <- match (later in recursive calls)
4487     // addr = gep base, op                <- match
4488     if (MovedAway)
4489       *MovedAway = true;
4490 
4491     assert(PromotedOperand &&
4492            "TypePromotionHelper should have filtered out those cases");
4493 
4494     ExtAddrMode BackupAddrMode = AddrMode;
4495     unsigned OldSize = AddrModeInsts.size();
4496 
4497     if (!matchAddr(PromotedOperand, Depth) ||
4498         // The total of the new cost is equal to the cost of the created
4499         // instructions.
4500         // The total of the old cost is equal to the cost of the extension plus
4501         // what we have saved in the addressing mode.
4502         !isPromotionProfitable(CreatedInstsCost,
4503                                ExtCost + (AddrModeInsts.size() - OldSize),
4504                                PromotedOperand)) {
4505       AddrMode = BackupAddrMode;
4506       AddrModeInsts.resize(OldSize);
4507       LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
4508       TPT.rollback(LastKnownGood);
4509       return false;
4510     }
4511     return true;
4512   }
4513   }
4514   return false;
4515 }
4516 
4517 /// If we can, try to add the value of 'Addr' into the current addressing mode.
4518 /// If Addr can't be added to AddrMode this returns false and leaves AddrMode
4519 /// unmodified. This assumes that Addr is either a pointer type or intptr_t
4520 /// for the target.
4521 ///
4522 bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
4523   // Start a transaction at this point that we will rollback if the matching
4524   // fails.
4525   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4526       TPT.getRestorationPoint();
4527   if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
4528     if (CI->getValue().isSignedIntN(64)) {
4529       // Fold in immediates if legal for the target.
4530       AddrMode.BaseOffs += CI->getSExtValue();
4531       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4532         return true;
4533       AddrMode.BaseOffs -= CI->getSExtValue();
4534     }
4535   } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
4536     // If this is a global variable, try to fold it into the addressing mode.
4537     if (!AddrMode.BaseGV) {
4538       AddrMode.BaseGV = GV;
4539       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4540         return true;
4541       AddrMode.BaseGV = nullptr;
4542     }
4543   } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
4544     ExtAddrMode BackupAddrMode = AddrMode;
4545     unsigned OldSize = AddrModeInsts.size();
4546 
4547     // Check to see if it is possible to fold this operation.
4548     bool MovedAway = false;
4549     if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
4550       // This instruction may have been moved away. If so, there is nothing
4551       // to check here.
4552       if (MovedAway)
4553         return true;
4554       // Okay, it's possible to fold this.  Check to see if it is actually
4555       // *profitable* to do so.  We use a simple cost model to avoid increasing
4556       // register pressure too much.
4557       if (I->hasOneUse() ||
4558           isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
4559         AddrModeInsts.push_back(I);
4560         return true;
4561       }
4562 
4563       // It isn't profitable to do this, roll back.
4564       //cerr << "NOT FOLDING: " << *I;
4565       AddrMode = BackupAddrMode;
4566       AddrModeInsts.resize(OldSize);
4567       TPT.rollback(LastKnownGood);
4568     }
4569   } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
4570     if (matchOperationAddr(CE, CE->getOpcode(), Depth))
4571       return true;
4572     TPT.rollback(LastKnownGood);
4573   } else if (isa<ConstantPointerNull>(Addr)) {
4574     // Null pointer gets folded without affecting the addressing mode.
4575     return true;
4576   }
4577 
  // Worst case, the target should support [reg] addressing modes. :)
4579   if (!AddrMode.HasBaseReg) {
4580     AddrMode.HasBaseReg = true;
4581     AddrMode.BaseReg = Addr;
4582     // Still check for legality in case the target supports [imm] but not [i+r].
4583     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4584       return true;
4585     AddrMode.HasBaseReg = false;
4586     AddrMode.BaseReg = nullptr;
4587   }
4588 
4589   // If the base register is already taken, see if we can do [r+r].
4590   if (AddrMode.Scale == 0) {
4591     AddrMode.Scale = 1;
4592     AddrMode.ScaledReg = Addr;
4593     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4594       return true;
4595     AddrMode.Scale = 0;
4596     AddrMode.ScaledReg = nullptr;
4597   }
4598   // Couldn't match.
4599   TPT.rollback(LastKnownGood);
4600   return false;
4601 }
4602 
4603 /// Check to see if all uses of OpVal by the specified inline asm call are due
4604 /// to memory operands. If so, return true, otherwise return false.
4605 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
4606                                     const TargetLowering &TLI,
4607                                     const TargetRegisterInfo &TRI) {
4608   const Function *F = CI->getFunction();
4609   TargetLowering::AsmOperandInfoVector TargetConstraints =
4610       TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI);
4611 
4612   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
4613     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
4614 
4615     // Compute the constraint code and ConstraintType to use.
4616     TLI.ComputeConstraintToUse(OpInfo, SDValue());
4617 
4618     // If this asm operand is our Value*, and if it isn't an indirect memory
4619     // operand, we can't fold it!
4620     if (OpInfo.CallOperandVal == OpVal &&
4621         (OpInfo.ConstraintType != TargetLowering::C_Memory ||
4622          !OpInfo.isIndirect))
4623       return false;
4624   }
4625 
4626   return true;
4627 }
4628 
4629 // Max number of memory uses to look at before aborting the search to conserve
4630 // compile time.
4631 static constexpr int MaxMemoryUsesToScan = 20;
4632 
4633 /// Recursively walk all the uses of I until we find a memory use.
4634 /// If we find an obviously non-foldable instruction, return true.
4635 /// Add the ultimately found memory instructions to MemoryUses.
4636 static bool FindAllMemoryUses(
4637     Instruction *I,
4638     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
4639     SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
4640     const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
4641     BlockFrequencyInfo *BFI, int SeenInsts = 0) {
4642   // If we already considered this instruction, we're done.
4643   if (!ConsideredInsts.insert(I).second)
4644     return false;
4645 
4646   // If this is an obviously unfoldable instruction, bail out.
4647   if (!MightBeFoldableInst(I))
4648     return true;
4649 
4650   // Loop over all the uses, recursively processing them.
4651   for (Use &U : I->uses()) {
4652     // Conservatively return true if we're seeing a large number or a deep chain
4653     // of users. This avoids excessive compilation times in pathological cases.
4654     if (SeenInsts++ >= MaxMemoryUsesToScan)
4655       return true;
4656 
4657     Instruction *UserI = cast<Instruction>(U.getUser());
4658     if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
4659       MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
4660       continue;
4661     }
4662 
4663     if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
4664       unsigned opNo = U.getOperandNo();
4665       if (opNo != StoreInst::getPointerOperandIndex())
4666         return true; // Storing addr, not into addr.
4667       MemoryUses.push_back(std::make_pair(SI, opNo));
4668       continue;
4669     }
4670 
4671     if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
4672       unsigned opNo = U.getOperandNo();
4673       if (opNo != AtomicRMWInst::getPointerOperandIndex())
4674         return true; // Storing addr, not into addr.
4675       MemoryUses.push_back(std::make_pair(RMW, opNo));
4676       continue;
4677     }
4678 
4679     if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
4680       unsigned opNo = U.getOperandNo();
4681       if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
4682         return true; // Storing addr, not into addr.
4683       MemoryUses.push_back(std::make_pair(CmpX, opNo));
4684       continue;
4685     }
4686 
4687     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
4688       if (CI->hasFnAttr(Attribute::Cold)) {
4689         // If this is a cold call, we can sink the addressing calculation into
        // the cold path.  See optimizeCallInst.
4691         bool OptForSize = OptSize ||
4692           llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
4693         if (!OptForSize)
4694           continue;
4695       }
4696 
4697       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand());
4698       if (!IA) return true;
4699 
4700       // If this is a memory operand, we're cool, otherwise bail out.
4701       if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
4702         return true;
4703       continue;
4704     }
4705 
4706     if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
4707                           PSI, BFI, SeenInsts))
4708       return true;
4709   }
4710 
4711   return false;
4712 }
4713 
4714 /// Return true if Val is already known to be live at the use site that we're
4715 /// folding it into. If so, there is no cost to include it in the addressing
4716 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
4717 /// instruction already.
4718 bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
4719                                                    Value *KnownLive2) {
4720   // If Val is either of the known-live values, we know it is live!
4721   if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
4722     return true;
4723 
4724   // All values other than instructions and arguments (e.g. constants) are live.
4725   if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
4726 
  // If Val is a constant sized alloca in the entry block, it is live; this is
  // because it is just a reference to the stack/frame pointer, which is live
  // for the whole function.
4730   if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
4731     if (AI->isStaticAlloca())
4732       return true;
4733 
4734   // Check to see if this value is already used in the memory instruction's
4735   // block.  If so, it's already live into the block at the very least, so we
4736   // can reasonably fold it.
4737   return Val->isUsedInBasicBlock(MemoryInst->getParent());
4738 }
4739 
4740 /// It is possible for the addressing mode of the machine to fold the specified
4741 /// instruction into a load or store that ultimately uses it.
4742 /// However, the specified instruction has multiple uses.
4743 /// Given this, it may actually increase register pressure to fold it
4744 /// into the load. For example, consider this code:
4745 ///
4746 ///     X = ...
4747 ///     Y = X+1
4748 ///     use(Y)   -> nonload/store
4749 ///     Z = Y+1
4750 ///     load Z
4751 ///
4752 /// In this case, Y has multiple uses, and can be folded into the load of Z
4753 /// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
4754 /// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
4755 /// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
4756 /// number of computations either.
4757 ///
4758 /// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
4759 /// X was live across 'load Z' for other reasons, we actually *would* want to
4760 /// fold the addressing mode in the Z case.  This would make Y die earlier.
4761 bool AddressingModeMatcher::
4762 isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
4763                                      ExtAddrMode &AMAfter) {
4764   if (IgnoreProfitability) return true;
4765 
4766   // AMBefore is the addressing mode before this instruction was folded into it,
4767   // and AMAfter is the addressing mode after the instruction was folded.  Get
4768   // the set of registers referenced by AMAfter and subtract out those
4769   // referenced by AMBefore: this is the set of values which folding in this
4770   // address extends the lifetime of.
4771   //
4772   // Note that there are only two potential values being referenced here,
4773   // BaseReg and ScaleReg (global addresses are always available, as are any
4774   // folded immediates).
4775   Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
4776 
4777   // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
4778   // lifetime wasn't extended by adding this instruction.
4779   if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4780     BaseReg = nullptr;
4781   if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4782     ScaledReg = nullptr;
4783 
  // If folding this instruction (and its subexprs) didn't extend any live
4785   // ranges, we're ok with it.
4786   if (!BaseReg && !ScaledReg)
4787     return true;
4788 
4789   // If all uses of this instruction can have the address mode sunk into them,
4790   // we can remove the addressing mode and effectively trade one live register
4791   // for another (at worst.)  In this context, folding an addressing mode into
4792   // the use is just a particularly nice way of sinking it.
4793   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
4794   SmallPtrSet<Instruction*, 16> ConsideredInsts;
4795   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
4796                         PSI, BFI))
4797     return false;  // Has a non-memory, non-foldable use!
4798 
4799   // Now that we know that all uses of this instruction are part of a chain of
4800   // computation involving only operations that could theoretically be folded
4801   // into a memory use, loop over each of these memory operation uses and see
4802   // if they could  *actually* fold the instruction.  The assumption is that
4803   // addressing modes are cheap and that duplicating the computation involved
4804   // many times is worthwhile, even on a fastpath. For sinking candidates
4805   // (i.e. cold call sites), this serves as a way to prevent excessive code
4806   // growth since most architectures have some reasonable small and fast way to
  // compute an effective address (e.g., LEA on x86).
4808   SmallVector<Instruction*, 32> MatchedAddrModeInsts;
4809   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
4810     Instruction *User = MemoryUses[i].first;
4811     unsigned OpNo = MemoryUses[i].second;
4812 
4813     // Get the access type of this use.  If the use isn't a pointer, we don't
4814     // know what it accesses.
4815     Value *Address = User->getOperand(OpNo);
4816     PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
4817     if (!AddrTy)
4818       return false;
4819     Type *AddressAccessTy = AddrTy->getElementType();
4820     unsigned AS = AddrTy->getAddressSpace();
4821 
4822     // Do a match against the root of this address, ignoring profitability. This
4823     // will tell us if the addressing mode for the memory operation will
4824     // *actually* cover the shared instruction.
4825     ExtAddrMode Result;
4826     std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
4827                                                                       0);
4828     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4829         TPT.getRestorationPoint();
4830     AddressingModeMatcher Matcher(
4831         MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
4832         InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
4833     Matcher.IgnoreProfitability = true;
4834     bool Success = Matcher.matchAddr(Address, 0);
4835     (void)Success; assert(Success && "Couldn't select *anything*?");
4836 
4837     // The match was to check the profitability, the changes made are not
4838     // part of the original matcher. Therefore, they should be dropped
4839     // otherwise the original matcher will not present the right state.
4840     TPT.rollback(LastKnownGood);
4841 
4842     // If the match didn't cover I, then it won't be shared by it.
4843     if (!is_contained(MatchedAddrModeInsts, I))
4844       return false;
4845 
4846     MatchedAddrModeInsts.clear();
4847   }
4848 
4849   return true;
4850 }
4851 
4852 /// Return true if the specified values are defined in a
4853 /// different basic block than BB.
4854 static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
4855   if (Instruction *I = dyn_cast<Instruction>(V))
4856     return I->getParent() != BB;
4857   return false;
4858 }
4859 
/// Sink addressing mode computation immediately before MemoryInst if doing so
4861 /// can be done without increasing register pressure.  The need for the
4862 /// register pressure constraint means this can end up being an all or nothing
4863 /// decision for all uses of the same addressing computation.
4864 ///
4865 /// Load and Store Instructions often have addressing modes that can do
4866 /// significant amounts of computation. As such, instruction selection will try
4867 /// to get the load or store to do as much computation as possible for the
4868 /// program. The problem is that isel can only see within a single block. As
4869 /// such, we sink as much legal addressing mode work into the block as possible.
4870 ///
4871 /// This method is used to optimize both load/store and inline asms with memory
4872 /// operands.  It's also used to sink addressing computations feeding into cold
4873 /// call sites into their (cold) basic block.
4874 ///
4875 /// The motivation for handling sinking into cold blocks is that doing so can
4876 /// both enable other address mode sinking (by satisfying the register pressure
4877 /// constraint above), and reduce register pressure globally (by removing the
/// addressing mode computation from the fast path entirely).
4879 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
4880                                         Type *AccessTy, unsigned AddrSpace) {
4881   Value *Repl = Addr;
4882 
4883   // Try to collapse single-value PHI nodes.  This is necessary to undo
4884   // unprofitable PRE transformations.
4885   SmallVector<Value*, 8> worklist;
4886   SmallPtrSet<Value*, 16> Visited;
4887   worklist.push_back(Addr);
4888 
4889   // Use a worklist to iteratively look through PHI and select nodes, and
4890   // ensure that the addressing mode obtained from the non-PHI/select roots of
4891   // the graph are compatible.
4892   bool PhiOrSelectSeen = false;
4893   SmallVector<Instruction*, 16> AddrModeInsts;
4894   const SimplifyQuery SQ(*DL, TLInfo);
4895   AddressingModeCombiner AddrModes(SQ, Addr);
4896   TypePromotionTransaction TPT(RemovedInsts);
4897   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4898       TPT.getRestorationPoint();
4899   while (!worklist.empty()) {
4900     Value *V = worklist.back();
4901     worklist.pop_back();
4902 
4903     // We allow traversing cyclic Phi nodes.
    // In case of success after this loop we ensure that traversing through
    // Phi nodes ends up with all cases computing an address of the form
    //    BaseGV + Base + Scale * Index + Offset
    // where Scale and Offset are constants and BaseGV, Base and Index
4908     // are exactly the same Values in all cases.
4909     // It means that BaseGV, Scale and Offset dominate our memory instruction
4910     // and have the same value as they had in address computation represented
4911     // as Phi. So we can safely sink address computation to memory instruction.
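    // For illustration, an ExtAddrMode with BaseReg = %p, Scale = 4,
    // ScaledReg = %i and BaseOffs = 16 represents the address %p + 4*%i + 16.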
4912     if (!Visited.insert(V).second)
4913       continue;
4914 
4915     // For a PHI node, push all of its incoming values.
4916     if (PHINode *P = dyn_cast<PHINode>(V)) {
4917       for (Value *IncValue : P->incoming_values())
4918         worklist.push_back(IncValue);
4919       PhiOrSelectSeen = true;
4920       continue;
4921     }
4922     // Similar for select.
4923     if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
4924       worklist.push_back(SI->getFalseValue());
4925       worklist.push_back(SI->getTrueValue());
4926       PhiOrSelectSeen = true;
4927       continue;
4928     }
4929 
4930     // For non-PHIs, determine the addressing mode being computed.  Note that
4931     // the result may differ depending on what other uses our candidate
4932     // addressing instructions might have.
4933     AddrModeInsts.clear();
4934     std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
4935                                                                       0);
4936     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
4937         V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
4938         InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
4939         BFI.get());
4940 
4941     GetElementPtrInst *GEP = LargeOffsetGEP.first;
4942     if (GEP && !NewGEPBases.count(GEP)) {
4943       // If splitting the underlying data structure can reduce the offset of a
4944       // GEP, collect the GEP.  Skip the GEPs that are the new bases of
4945       // previously split data structures.
4946       LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
4947       if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end())
4948         LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size();
4949     }
4950 
4951     NewAddrMode.OriginalValue = V;
4952     if (!AddrModes.addNewAddrMode(NewAddrMode))
4953       break;
4954   }
4955 
4956   // Try to combine the AddrModes we've collected. If we couldn't collect any,
4957   // or we have multiple but either couldn't combine them or combining them
4958   // wouldn't do anything useful, bail out now.
4959   if (!AddrModes.combineAddrModes()) {
4960     TPT.rollback(LastKnownGood);
4961     return false;
4962   }
4963   TPT.commit();
4964 
4965   // Get the combined AddrMode (or the only AddrMode, if we only had one).
4966   ExtAddrMode AddrMode = AddrModes.getAddrMode();
4967 
4968   // If all the instructions matched are already in this BB, don't do anything.
  // If we saw a Phi node then it is definitely not local, and if we saw a
  // select then we want to push the address calculation past it even if it's
  // already in this BB.
4972   if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
4973         return IsNonLocalValue(V, MemoryInst->getParent());
4974                   })) {
4975     LLVM_DEBUG(dbgs() << "CGP: Found      local addrmode: " << AddrMode
4976                       << "\n");
4977     return false;
4978   }
4979 
4980   // Insert this computation right after this user.  Since our caller is
  // scanning from the top of the BB to the bottom, reuses of the expr are
4982   // guaranteed to happen later.
4983   IRBuilder<> Builder(MemoryInst);
4984 
4985   // Now that we determined the addressing expression we want to use and know
  // that we have to sink it into this block, check to see if we have already
4987   // done this for some other load/store instr in this block.  If so, reuse
4988   // the computation.  Before attempting reuse, check if the address is valid
4989   // as it may have been erased.
4990 
4991   WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];
4992 
  Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
4994   if (SunkAddr) {
4995     LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
4996                       << " for " << *MemoryInst << "\n");
4997     if (SunkAddr->getType() != Addr->getType())
4998       SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
4999   } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
5000                                    SubtargetInfo->addrSinkUsingGEPs())) {
5001     // By default, we use the GEP-based method when AA is used later. This
5002     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
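    // The sunk address is materialized as an i8 GEP over the base pointer,
    // e.g. (illustrative):
    //     %sunkaddr = getelementptr i8, i8* %base, i64 %offset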
5003     LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
5004                       << " for " << *MemoryInst << "\n");
5005     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
5006     Value *ResultPtr = nullptr, *ResultIndex = nullptr;
5007 
5008     // First, find the pointer.
5009     if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
5010       ResultPtr = AddrMode.BaseReg;
5011       AddrMode.BaseReg = nullptr;
5012     }
5013 
5014     if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
5015       // We can't add more than one pointer together, nor can we scale a
5016       // pointer (both of which seem meaningless).
5017       if (ResultPtr || AddrMode.Scale != 1)
5018         return false;
5019 
5020       ResultPtr = AddrMode.ScaledReg;
5021       AddrMode.Scale = 0;
5022     }
5023 
5024     // It is only safe to sign extend the BaseReg if we know that the math
5025     // required to create it did not overflow before we extend it. Since
5026     // the original IR value was tossed in favor of a constant back when
5027     // the AddrMode was created we need to bail out gracefully if widths
5028     // do not match instead of extending it.
5029     //
5030     // (See below for code to add the scale.)
5031     if (AddrMode.Scale) {
5032       Type *ScaledRegTy = AddrMode.ScaledReg->getType();
5033       if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
5034           cast<IntegerType>(ScaledRegTy)->getBitWidth())
5035         return false;
5036     }
5037 
5038     if (AddrMode.BaseGV) {
5039       if (ResultPtr)
5040         return false;
5041 
5042       ResultPtr = AddrMode.BaseGV;
5043     }
5044 
5045     // If the real base value actually came from an inttoptr, then the matcher
5046     // will look through it and provide only the integer value. In that case,
5047     // use it here.
5048     if (!DL->isNonIntegralPointerType(Addr->getType())) {
5049       if (!ResultPtr && AddrMode.BaseReg) {
5050         ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
5051                                            "sunkaddr");
5052         AddrMode.BaseReg = nullptr;
5053       } else if (!ResultPtr && AddrMode.Scale == 1) {
5054         ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
5055                                            "sunkaddr");
5056         AddrMode.Scale = 0;
5057       }
5058     }
5059 
5060     if (!ResultPtr &&
5061         !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
5062       SunkAddr = Constant::getNullValue(Addr->getType());
5063     } else if (!ResultPtr) {
5064       return false;
5065     } else {
5066       Type *I8PtrTy =
5067           Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
5068       Type *I8Ty = Builder.getInt8Ty();
5069 
5070       // Start with the base register. Do this first so that subsequent address
5071       // matching finds it last, which will prevent it from trying to match it
5072       // as the scaled value in case it happens to be a mul. That would be
5073       // problematic if we've sunk a different mul for the scale, because then
5074       // we'd end up sinking both muls.
5075       if (AddrMode.BaseReg) {
5076         Value *V = AddrMode.BaseReg;
5077         if (V->getType() != IntPtrTy)
5078           V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
5079 
5080         ResultIndex = V;
5081       }
5082 
5083       // Add the scale value.
5084       if (AddrMode.Scale) {
5085         Value *V = AddrMode.ScaledReg;
5086         if (V->getType() == IntPtrTy) {
5087           // done.
5088         } else {
5089           assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
5090                  cast<IntegerType>(V->getType())->getBitWidth() &&
5091                  "We can't transform if ScaledReg is too narrow");
5092           V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
5093         }
5094 
5095         if (AddrMode.Scale != 1)
5096           V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
5097                                 "sunkaddr");
5098         if (ResultIndex)
5099           ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
5100         else
5101           ResultIndex = V;
5102       }
5103 
5104       // Add in the Base Offset if present.
5105       if (AddrMode.BaseOffs) {
5106         Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
5107         if (ResultIndex) {
5108           // We need to add this separately from the scale above to help with
5109           // SDAG consecutive load/store merging.
5110           if (ResultPtr->getType() != I8PtrTy)
5111             ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
5112           ResultPtr =
5113               AddrMode.InBounds
5114                   ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
5115                                               "sunkaddr")
5116                   : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
5117         }
5118 
5119         ResultIndex = V;
5120       }
5121 
5122       if (!ResultIndex) {
5123         SunkAddr = ResultPtr;
5124       } else {
5125         if (ResultPtr->getType() != I8PtrTy)
5126           ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
5127         SunkAddr =
5128             AddrMode.InBounds
5129                 ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
5130                                             "sunkaddr")
5131                 : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
5132       }
5133 
5134       if (SunkAddr->getType() != Addr->getType())
5135         SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
5136     }
5137   } else {
5138     // We'd require a ptrtoint/inttoptr down the line, which we can't do for
5139     // non-integral pointers, so in that case bail out now.
5140     Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
5141     Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
5142     PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
5143     PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
5144     if (DL->isNonIntegralPointerType(Addr->getType()) ||
5145         (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
5146         (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
5147         (AddrMode.BaseGV &&
5148          DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
5149       return false;
5150 
5151     LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
5152                       << " for " << *MemoryInst << "\n");
5153     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
5154     Value *Result = nullptr;
5155 
5156     // Start with the base register. Do this first so that subsequent address
5157     // matching finds it last, which will prevent it from trying to match it
5158     // as the scaled value in case it happens to be a mul. That would be
5159     // problematic if we've sunk a different mul for the scale, because then
5160     // we'd end up sinking both muls.
5161     if (AddrMode.BaseReg) {
5162       Value *V = AddrMode.BaseReg;
5163       if (V->getType()->isPointerTy())
5164         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
5165       if (V->getType() != IntPtrTy)
5166         V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
5167       Result = V;
5168     }
5169 
5170     // Add the scale value.
5171     if (AddrMode.Scale) {
5172       Value *V = AddrMode.ScaledReg;
5173       if (V->getType() == IntPtrTy) {
5174         // done.
5175       } else if (V->getType()->isPointerTy()) {
5176         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
5177       } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
5178                  cast<IntegerType>(V->getType())->getBitWidth()) {
5179         V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
5180       } else {
5181         // It is only safe to sign extend the BaseReg if we know that the math
5182         // required to create it did not overflow before we extend it. Since
5183         // the original IR value was tossed in favor of a constant back when
5184         // the AddrMode was created we need to bail out gracefully if widths
5185         // do not match instead of extending it.
5186         Instruction *I = dyn_cast_or_null<Instruction>(Result);
5187         if (I && (Result != AddrMode.BaseReg))
5188           I->eraseFromParent();
5189         return false;
5190       }
5191       if (AddrMode.Scale != 1)
5192         V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
5193                               "sunkaddr");
5194       if (Result)
5195         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5196       else
5197         Result = V;
5198     }
5199 
5200     // Add in the BaseGV if present.
5201     if (AddrMode.BaseGV) {
5202       Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
5203       if (Result)
5204         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5205       else
5206         Result = V;
5207     }
5208 
5209     // Add in the Base Offset if present.
5210     if (AddrMode.BaseOffs) {
5211       Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
5212       if (Result)
5213         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5214       else
5215         Result = V;
5216     }
5217 
5218     if (!Result)
5219       SunkAddr = Constant::getNullValue(Addr->getType());
5220     else
5221       SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
5222   }
5223 
5224   MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
5225   // Store the newly computed address into the cache. In the case we reused a
5226   // value, this should be idempotent.
5227   SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);
5228 
5229   // If we have no uses, recursively delete the value and all dead instructions
5230   // using it.
5231   if (Repl->use_empty()) {
5232     // This can cause recursive deletion, which can invalidate our iterator.
5233     // Use a WeakTrackingVH to hold onto it in case this happens.
5234     Value *CurValue = &*CurInstIterator;
5235     WeakTrackingVH IterHandle(CurValue);
5236     BasicBlock *BB = CurInstIterator->getParent();
5237 
5238     RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
5239 
5240     if (IterHandle != CurValue) {
5241       // If the iterator instruction was recursively deleted, start over at the
5242       // start of the block.
5243       CurInstIterator = BB->begin();
5244       SunkAddrs.clear();
5245     }
5246   }
5247   ++NumMemoryInsts;
5248   return true;
5249 }
5250 
5251 /// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
5252 /// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
5253 /// only handle a 2 operand GEP in the same basic block or a splat constant
/// vector. The two operands of the GEP must be a scalar pointer and a vector
/// index.
5256 ///
5257 /// If the existing GEP has a vector base pointer that is splat, we can look
5258 /// through the splat to find the scalar pointer. If we can't find a scalar
5259 /// pointer there's nothing we can do.
5260 ///
5261 /// If we have a GEP with more than 2 indices where the middle indices are all
5262 /// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
5263 ///
5264 /// If the final index isn't a vector or is a splat, we can emit a scalar GEP
5265 /// followed by a GEP with an all zeroes vector index. This will enable
/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
5267 /// zero index.
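///
/// For example (an illustrative sketch; the names and types are made up):
/// \code
/// %gep = getelementptr [16 x i32], [16 x i32]* %base, i64 0, <4 x i64> %vidx
/// \endcode
/// is rewritten as
/// \code
/// %scalarbase = getelementptr [16 x i32], [16 x i32]* %base, i64 0, i64 0
/// %gep        = getelementptr i32, i32* %scalarbase, <4 x i64> %vidx
/// \endcode
/// so that %scalarbase becomes the uniform base and %vidx the vector index.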
5268 bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
5269                                                Value *Ptr) {
5270   const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
5271   if (!GEP || !GEP->hasIndices())
5272     return false;
5273 
5274   // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
5275   // FIXME: We should support this by sinking the GEP.
5276   if (MemoryInst->getParent() != GEP->getParent())
5277     return false;
5278 
5279   SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end());
5280 
5281   bool RewriteGEP = false;
5282 
5283   if (Ops[0]->getType()->isVectorTy()) {
5284     Ops[0] = const_cast<Value *>(getSplatValue(Ops[0]));
5285     if (!Ops[0])
5286       return false;
5287     RewriteGEP = true;
5288   }
5289 
5290   unsigned FinalIndex = Ops.size() - 1;
5291 
  // Ensure all indices but the last are 0.
5293   // FIXME: This isn't strictly required. All that's required is that they are
5294   // all scalars or splats.
5295   for (unsigned i = 1; i < FinalIndex; ++i) {
5296     auto *C = dyn_cast<Constant>(Ops[i]);
5297     if (!C)
5298       return false;
5299     if (isa<VectorType>(C->getType()))
5300       C = C->getSplatValue();
5301     auto *CI = dyn_cast_or_null<ConstantInt>(C);
5302     if (!CI || !CI->isZero())
5303       return false;
5304     // Scalarize the index if needed.
5305     Ops[i] = CI;
5306   }
5307 
5308   // Try to scalarize the final index.
5309   if (Ops[FinalIndex]->getType()->isVectorTy()) {
5310     if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) {
5311       auto *C = dyn_cast<ConstantInt>(V);
5312       // Don't scalarize all zeros vector.
5313       if (!C || !C->isZero()) {
5314         Ops[FinalIndex] = V;
5315         RewriteGEP = true;
5316       }
5317     }
5318   }
5319 
  // If we made any changes or we have extra operands, we need to generate
  // new instructions.
5322   if (!RewriteGEP && Ops.size() == 2)
5323     return false;
5324 
5325   unsigned NumElts = cast<VectorType>(Ptr->getType())->getNumElements();
5326 
5327   IRBuilder<> Builder(MemoryInst);
5328 
5329   Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
5330 
5331   Value *NewAddr;
5332 
5333   // If the final index isn't a vector, emit a scalar GEP containing all ops
5334   // and a vector GEP with all zeroes final index.
5335   if (!Ops[FinalIndex]->getType()->isVectorTy()) {
5336     NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front());
5337     auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts);
5338     NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy));
5339   } else {
5340     Value *Base = Ops[0];
5341     Value *Index = Ops[FinalIndex];
5342 
5343     // Create a scalar GEP if there are more than 2 operands.
5344     if (Ops.size() != 2) {
5345       // Replace the last index with 0.
5346       Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy);
5347       Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front());
5348     }
5349 
5350     // Now create the GEP with scalar pointer and vector index.
5351     NewAddr = Builder.CreateGEP(Base, Index);
5352   }
5353 
5354   MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
5355 
5356   // If we have no uses, recursively delete the value and all dead instructions
5357   // using it.
5358   if (Ptr->use_empty())
5359     RecursivelyDeleteTriviallyDeadInstructions(Ptr, TLInfo);
5360 
5361   return true;
5362 }
5363 
5364 /// If there are any memory operands, use OptimizeMemoryInst to sink their
5365 /// address computing into the block when possible / profitable.
5366 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
5367   bool MadeChange = false;
5368 
5369   const TargetRegisterInfo *TRI =
5370       TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
5371   TargetLowering::AsmOperandInfoVector TargetConstraints =
5372       TLI->ParseConstraints(*DL, TRI, *CS);
5373   unsigned ArgNo = 0;
5374   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
5375     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
5376 
5377     // Compute the constraint code and ConstraintType to use.
5378     TLI->ComputeConstraintToUse(OpInfo, SDValue());
5379 
5380     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
5381         OpInfo.isIndirect) {
5382       Value *OpVal = CS->getArgOperand(ArgNo++);
5383       MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
5384     } else if (OpInfo.Type == InlineAsm::isInput)
5385       ArgNo++;
5386   }
5387 
5388   return MadeChange;
5389 }
5390 
5391 /// Check if all the uses of \p Val are equivalent (or free) zero or
5392 /// sign extensions.
5393 static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
5394   assert(!Val->use_empty() && "Input must have at least one use");
5395   const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
5396   bool IsSExt = isa<SExtInst>(FirstUser);
5397   Type *ExtTy = FirstUser->getType();
5398   for (const User *U : Val->users()) {
5399     const Instruction *UI = cast<Instruction>(U);
5400     if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
5401       return false;
5402     Type *CurTy = UI->getType();
5403     // Same input and output types: Same instruction after CSE.
5404     if (CurTy == ExtTy)
5405       continue;
5406 
5407     // If IsSExt is true, we are in this situation:
5408     // a = Val
5409     // b = sext ty1 a to ty2
5410     // c = sext ty1 a to ty3
5411     // Assuming ty2 is shorter than ty3, this could be turned into:
5412     // a = Val
5413     // b = sext ty1 a to ty2
5414     // c = sext ty2 b to ty3
5415     // However, the last sext is not free.
5416     if (IsSExt)
5417       return false;
5418 
    // This is a ZExt; maybe it is free to extend from one type to the other.
    // In that case, this use does not count as a different use.
5421     Type *NarrowTy;
5422     Type *LargeTy;
5423     if (ExtTy->getScalarType()->getIntegerBitWidth() >
5424         CurTy->getScalarType()->getIntegerBitWidth()) {
5425       NarrowTy = CurTy;
5426       LargeTy = ExtTy;
5427     } else {
5428       NarrowTy = ExtTy;
5429       LargeTy = CurTy;
5430     }
5431 
5432     if (!TLI.isZExtFree(NarrowTy, LargeTy))
5433       return false;
5434   }
5435   // All uses are the same or can be derived from one another for free.
5436   return true;
5437 }
5438 
5439 /// Try to speculatively promote extensions in \p Exts and continue
5440 /// promoting through newly promoted operands recursively as far as doing so is
5441 /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
5442 /// When some promotion happened, \p TPT contains the proper state to revert
5443 /// them.
5444 ///
5445 /// \return true if some promotion happened, false otherwise.
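///
/// For example (a sketch only; the promotion helper decides the exact
/// rewrite), a zext fed by an add may be speculatively hoisted above it:
/// \code
/// %add = add nuw i32 %ld, 1
/// %ext = zext i32 %add to i64
/// \endcode
/// becomes
/// \code
/// %ext = zext i32 %ld to i64
/// %add = add nuw i64 %ext, 1
/// \endcode
/// so that %ext is now fed directly by %ld and may later be folded into a
/// load.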
5446 bool CodeGenPrepare::tryToPromoteExts(
5447     TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
5448     SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
5449     unsigned CreatedInstsCost) {
5450   bool Promoted = false;
5451 
5452   // Iterate over all the extensions to try to promote them.
5453   for (auto *I : Exts) {
5454     // Early check if we directly have ext(load).
5455     if (isa<LoadInst>(I->getOperand(0))) {
5456       ProfitablyMovedExts.push_back(I);
5457       continue;
5458     }
5459 
    // Check whether or not we want to do any promotion.  The reason we have
    // this check inside the for loop is to catch the case where an extension
    // is directly fed by a load, because in that case the extension can be
    // moved up without any promotion of its operands.
5464     if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)
5465       return false;
5466 
5467     // Get the action to perform the promotion.
5468     TypePromotionHelper::Action TPH =
5469         TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
5470     // Check if we can promote.
5471     if (!TPH) {
5472       // Save the current extension as we cannot move up through its operand.
5473       ProfitablyMovedExts.push_back(I);
5474       continue;
5475     }
5476 
5477     // Save the current state.
5478     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5479         TPT.getRestorationPoint();
5480     SmallVector<Instruction *, 4> NewExts;
5481     unsigned NewCreatedInstsCost = 0;
5482     unsigned ExtCost = !TLI->isExtFree(I);
5483     // Promote.
5484     Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
5485                              &NewExts, nullptr, *TLI);
5486     assert(PromotedVal &&
5487            "TypePromotionHelper should have filtered out those cases");
5488 
    // Only one extension can be merged into a load. Therefore, if we have
    // more than one new extension we heuristically cut this search path,
    // because it means we degrade the code quality. With exactly two, the
    // transformation is neutral, because we will merge one extension but
    // leave the other. However, we optimistically keep going, because the
    // new extension may be removed too.
5495     long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
5496     // FIXME: It would be possible to propagate a negative value instead of
5497     // conservatively ceiling it to 0.
5498     TotalCreatedInstsCost =
5499         std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
5500     if (!StressExtLdPromotion &&
5501         (TotalCreatedInstsCost > 1 ||
5502          !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
5503       // This promotion is not profitable, rollback to the previous state, and
5504       // save the current extension in ProfitablyMovedExts as the latest
5505       // speculative promotion turned out to be unprofitable.
5506       TPT.rollback(LastKnownGood);
5507       ProfitablyMovedExts.push_back(I);
5508       continue;
5509     }
5510     // Continue promoting NewExts as far as doing so is profitable.
5511     SmallVector<Instruction *, 2> NewlyMovedExts;
5512     (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
5513     bool NewPromoted = false;
5514     for (auto *ExtInst : NewlyMovedExts) {
5515       Instruction *MovedExt = cast<Instruction>(ExtInst);
5516       Value *ExtOperand = MovedExt->getOperand(0);
      // If we have reached a load, we need this extra profitability check,
      // as it could potentially be merged into an ext(load).
5519       if (isa<LoadInst>(ExtOperand) &&
5520           !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
5521             (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
5522         continue;
5523 
5524       ProfitablyMovedExts.push_back(MovedExt);
5525       NewPromoted = true;
5526     }
5527 
    // If none of the speculative promotions for NewExts is profitable, roll
    // back and save the current extension (I) as the last profitable
    // extension.
5530     if (!NewPromoted) {
5531       TPT.rollback(LastKnownGood);
5532       ProfitablyMovedExts.push_back(I);
5533       continue;
5534     }
5535     // The promotion is profitable.
5536     Promoted = true;
5537   }
5538   return Promoted;
5539 }
5540 
/// Merge redundant sexts when one dominates the other.
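/// For example (an illustrative sketch), given two sexts of the same value
/// recorded in ValToSExtendedUses:
/// \code
/// %s1 = sext i32 %x to i64   ; dominates %s2
/// ...
/// %s2 = sext i32 %x to i64
/// \endcode
/// the uses of %s2 are rewritten to use %s1 and %s2 is removed.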
5542 bool CodeGenPrepare::mergeSExts(Function &F) {
5543   bool Changed = false;
5544   for (auto &Entry : ValToSExtendedUses) {
5545     SExts &Insts = Entry.second;
5546     SExts CurPts;
5547     for (Instruction *Inst : Insts) {
5548       if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
5549           Inst->getOperand(0) != Entry.first)
5550         continue;
5551       bool inserted = false;
5552       for (auto &Pt : CurPts) {
5553         if (getDT(F).dominates(Inst, Pt)) {
5554           Pt->replaceAllUsesWith(Inst);
5555           RemovedInsts.insert(Pt);
5556           Pt->removeFromParent();
5557           Pt = Inst;
5558           inserted = true;
5559           Changed = true;
5560           break;
5561         }
5562         if (!getDT(F).dominates(Pt, Inst))
          // Give up if we need to merge in a common dominator, as experiments
          // show it is not profitable.
5565           continue;
5566         Inst->replaceAllUsesWith(Pt);
5567         RemovedInsts.insert(Inst);
5568         Inst->removeFromParent();
5569         inserted = true;
5570         Changed = true;
5571         break;
5572       }
5573       if (!inserted)
5574         CurPts.push_back(Inst);
5575     }
5576   }
5577   return Changed;
5578 }
5579 
// Split large data structures so that the GEPs accessing them can have
// smaller offsets and can therefore be sunk to the same blocks as their
// users. For example, a large struct starting at %base is split into two
// parts where the second part starts at %new_base.
5584 //
5585 // Before:
5586 // BB0:
5587 //   %base     =
5588 //
5589 // BB1:
5590 //   %gep0     = gep %base, off0
5591 //   %gep1     = gep %base, off1
5592 //   %gep2     = gep %base, off2
5593 //
5594 // BB2:
5595 //   %load1    = load %gep0
5596 //   %load2    = load %gep1
5597 //   %load3    = load %gep2
5598 //
5599 // After:
5600 // BB0:
5601 //   %base     =
5602 //   %new_base = gep %base, off0
5603 //
5604 // BB1:
5605 //   %new_gep0 = %new_base
5606 //   %new_gep1 = gep %new_base, off1 - off0
5607 //   %new_gep2 = gep %new_base, off2 - off0
5608 //
5609 // BB2:
5610 //   %load1    = load i32, i32* %new_gep0
5611 //   %load2    = load i32, i32* %new_gep1
5612 //   %load3    = load i32, i32* %new_gep2
5613 //
// After the split, %new_gep1 and %new_gep2 can be sunk into BB2 because
// their offsets are small enough to fit into the addressing mode.
5616 bool CodeGenPrepare::splitLargeGEPOffsets() {
5617   bool Changed = false;
5618   for (auto &Entry : LargeOffsetGEPMap) {
5619     Value *OldBase = Entry.first;
5620     SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
5621         &LargeOffsetGEPs = Entry.second;
5622     auto compareGEPOffset =
5623         [&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
5624             const std::pair<GetElementPtrInst *, int64_t> &RHS) {
5625           if (LHS.first == RHS.first)
5626             return false;
5627           if (LHS.second != RHS.second)
5628             return LHS.second < RHS.second;
5629           return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
5630         };
    // Sort all the GEPs of the same data structure based on their offsets.
5632     llvm::sort(LargeOffsetGEPs, compareGEPOffset);
5633     LargeOffsetGEPs.erase(
5634         std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
5635         LargeOffsetGEPs.end());
5636     // Skip if all the GEPs have the same offsets.
5637     if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
5638       continue;
5639     GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
5640     int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
5641     Value *NewBaseGEP = nullptr;
5642 
5643     auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
5644     while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
5645       GetElementPtrInst *GEP = LargeOffsetGEP->first;
5646       int64_t Offset = LargeOffsetGEP->second;
5647       if (Offset != BaseOffset) {
5648         TargetLowering::AddrMode AddrMode;
5649         AddrMode.BaseOffs = Offset - BaseOffset;
5650         // The result type of the GEP might not be the type of the memory
5651         // access.
5652         if (!TLI->isLegalAddressingMode(*DL, AddrMode,
5653                                         GEP->getResultElementType(),
5654                                         GEP->getAddressSpace())) {
5655           // We need to create a new base if the offset to the current base is
5656           // too large to fit into the addressing mode. So, a very large struct
          // may be split into several parts.
5658           BaseGEP = GEP;
5659           BaseOffset = Offset;
5660           NewBaseGEP = nullptr;
5661         }
5662       }
5663 
5664       // Generate a new GEP to replace the current one.
5665       LLVMContext &Ctx = GEP->getContext();
5666       Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
5667       Type *I8PtrTy =
5668           Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
5669       Type *I8Ty = Type::getInt8Ty(Ctx);
5670 
5671       if (!NewBaseGEP) {
        // Create a new base if we don't have one yet.  Find the insertion
        // point for the new base first.
5674         BasicBlock::iterator NewBaseInsertPt;
5675         BasicBlock *NewBaseInsertBB;
5676         if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
5677           // If the base of the struct is an instruction, the new base will be
5678           // inserted close to it.
5679           NewBaseInsertBB = BaseI->getParent();
5680           if (isa<PHINode>(BaseI))
5681             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5682           else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
5683             NewBaseInsertBB =
5684                 SplitEdge(NewBaseInsertBB, Invoke->getNormalDest());
5685             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5686           } else
5687             NewBaseInsertPt = std::next(BaseI->getIterator());
5688         } else {
          // If the current base is an argument or global value, the new base
          // will be inserted into the entry block.
5691           NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
5692           NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5693         }
5694         IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
5695         // Create a new base.
5696         Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset);
5697         NewBaseGEP = OldBase;
5698         if (NewBaseGEP->getType() != I8PtrTy)
5699           NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
5700         NewBaseGEP =
5701             NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
5702         NewGEPBases.insert(NewBaseGEP);
5703       }
5704 
5705       IRBuilder<> Builder(GEP);
5706       Value *NewGEP = NewBaseGEP;
5707       if (Offset == BaseOffset) {
5708         if (GEP->getType() != I8PtrTy)
5709           NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
5710       } else {
5711         // Calculate the new offset for the new GEP.
5712         Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset);
5713         NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
5714 
5715         if (GEP->getType() != I8PtrTy)
5716           NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
5717       }
5718       GEP->replaceAllUsesWith(NewGEP);
5719       LargeOffsetGEPID.erase(GEP);
5720       LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
5721       GEP->eraseFromParent();
5722       Changed = true;
5723     }
5724   }
5725   return Changed;
5726 }
5727 
5728 bool CodeGenPrepare::optimizePhiType(
5729     PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,
5730     SmallPtrSetImpl<Instruction *> &DeletedInstrs) {
  // We are looking for a collection of interconnected phi nodes that together
  // only use loads/bitcasts and are used by stores/bitcasts, where the
  // bitcasts are all of the same type. Convert the whole set of nodes to the
  // type of the bitcast.
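  //
  // For example (an illustrative sketch, assuming the target asks for float
  // phis via shouldConvertPhiType):
  //   %la  = load i32, i32* %pa
  //   %lb  = load i32, i32* %pb
  //   %phi = phi i32 [ %la, %A ], [ %lb, %B ]
  //   %fp  = bitcast i32 %phi to float
  // becomes
  //   %la  = load i32, i32* %pa
  //   %lab = bitcast i32 %la to float
  //   %lb  = load i32, i32* %pb
  //   %lbb = bitcast i32 %lb to float
  //   %phi = phi float [ %lab, %A ], [ %lbb, %B ]
  // with the use of the old bitcast %fp replaced by the new float phi.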
5735   Type *PhiTy = I->getType();
5736   Type *ConvertTy = nullptr;
5737   if (Visited.count(I) ||
5738       (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))
5739     return false;
5740 
5741   SmallVector<Instruction *, 4> Worklist;
5742   Worklist.push_back(cast<Instruction>(I));
5743   SmallPtrSet<PHINode *, 4> PhiNodes;
5744   PhiNodes.insert(I);
5745   Visited.insert(I);
5746   SmallPtrSet<Instruction *, 4> Defs;
5747   SmallPtrSet<Instruction *, 4> Uses;
5748 
5749   while (!Worklist.empty()) {
5750     Instruction *II = Worklist.pop_back_val();
5751 
5752     if (auto *Phi = dyn_cast<PHINode>(II)) {
5753       // Handle Defs, which might also be PHI's
5754       for (Value *V : Phi->incoming_values()) {
5755         if (auto *OpPhi = dyn_cast<PHINode>(V)) {
5756           if (!PhiNodes.count(OpPhi)) {
5757             if (Visited.count(OpPhi))
5758               return false;
5759             PhiNodes.insert(OpPhi);
5760             Visited.insert(OpPhi);
5761             Worklist.push_back(OpPhi);
5762           }
5763         } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) {
5764           if (!Defs.count(OpLoad)) {
5765             Defs.insert(OpLoad);
5766             Worklist.push_back(OpLoad);
5767           }
5768         } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) {
5769           if (!Defs.count(OpEx)) {
5770             Defs.insert(OpEx);
5771             Worklist.push_back(OpEx);
5772           }
5773         } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
5774           if (!ConvertTy)
5775             ConvertTy = OpBC->getOperand(0)->getType();
5776           if (OpBC->getOperand(0)->getType() != ConvertTy)
5777             return false;
5778           if (!Defs.count(OpBC)) {
5779             Defs.insert(OpBC);
5780             Worklist.push_back(OpBC);
5781           }
5782         } else if (!isa<UndefValue>(V))
5783           return false;
5784       }
5785     }
5786 
5787     // Handle uses which might also be phi's
5788     for (User *V : II->users()) {
5789       if (auto *OpPhi = dyn_cast<PHINode>(V)) {
5790         if (!PhiNodes.count(OpPhi)) {
5791           if (Visited.count(OpPhi))
5792             return false;
5793           PhiNodes.insert(OpPhi);
5794           Visited.insert(OpPhi);
5795           Worklist.push_back(OpPhi);
5796         }
5797       } else if (auto *OpStore = dyn_cast<StoreInst>(V)) {
5798         if (OpStore->getOperand(0) != II)
5799           return false;
5800         Uses.insert(OpStore);
5801       } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
5802         if (!ConvertTy)
5803           ConvertTy = OpBC->getType();
5804         if (OpBC->getType() != ConvertTy)
5805           return false;
5806         Uses.insert(OpBC);
5807       } else
5808         return false;
5809     }
5810   }
5811 
5812   if (!ConvertTy || !TLI->shouldConvertPhiType(PhiTy, ConvertTy))
5813     return false;
5814 
5815   LLVM_DEBUG(dbgs() << "Converting " << *I << "\n  and connected nodes to "
5816                     << *ConvertTy << "\n");
5817 
5818   // Create all the new phi nodes of the new type, and bitcast any loads to the
5819   // correct type.
5820   ValueToValueMap ValMap;
5821   ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy);
5822   for (Instruction *D : Defs) {
5823     if (isa<BitCastInst>(D))
5824       ValMap[D] = D->getOperand(0);
5825     else
5826       ValMap[D] =
5827           new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
5828   }
5829   for (PHINode *Phi : PhiNodes)
5830     ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(),
5831                                   Phi->getName() + ".tc", Phi);
5832   // Pipe together all the PhiNodes.
5833   for (PHINode *Phi : PhiNodes) {
5834     PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
5835     for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)
5836       NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)],
5837                           Phi->getIncomingBlock(i));
5838   }
5839   // And finally pipe up the stores and bitcasts
5840   for (Instruction *U : Uses) {
5841     if (isa<BitCastInst>(U)) {
5842       DeletedInstrs.insert(U);
5843       U->replaceAllUsesWith(ValMap[U->getOperand(0)]);
5844     } else
5845       U->setOperand(0,
5846                     new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
5847   }
5848 
5849   // Save the removed phis to be deleted later.
5850   for (PHINode *Phi : PhiNodes)
5851     DeletedInstrs.insert(Phi);
5852   return true;
5853 }
5854 
5855 bool CodeGenPrepare::optimizePhiTypes(Function &F) {
5856   if (!OptimizePhiTypes)
5857     return false;
5858 
5859   bool Changed = false;
5860   SmallPtrSet<PHINode *, 4> Visited;
5861   SmallPtrSet<Instruction *, 4> DeletedInstrs;
5862 
  // Attempt to optimize all the phis in the function to the correct type.
5864   for (auto &BB : F)
5865     for (auto &Phi : BB.phis())
5866       Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs);
5867 
5868   // Remove any old phi's that have been converted.
5869   for (auto *I : DeletedInstrs) {
5870     I->replaceAllUsesWith(UndefValue::get(I->getType()));
5871     I->eraseFromParent();
5872   }
5873 
5874   return Changed;
5875 }
5876 
/// Return true if an ext(load) can be formed from an extension in
/// \p MovedExts.
5879 bool CodeGenPrepare::canFormExtLd(
5880     const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
5881     Instruction *&Inst, bool HasPromoted) {
5882   for (auto *MovedExtInst : MovedExts) {
5883     if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
5884       LI = cast<LoadInst>(MovedExtInst->getOperand(0));
5885       Inst = MovedExtInst;
5886       break;
5887     }
5888   }
5889   if (!LI)
5890     return false;
5891 
5892   // If they're already in the same block, there's nothing to do.
5893   // Make the cheap checks first if we did not promote.
5894   // If we promoted, we need to check if it is indeed profitable.
5895   if (!HasPromoted && LI->getParent() == Inst->getParent())
5896     return false;
5897 
5898   return TLI->isExtLoad(LI, Inst, *DL);
5899 }
5900 
5901 /// Move a zext or sext fed by a load into the same basic block as the load,
5902 /// unless conditions are unfavorable. This allows SelectionDAG to fold the
5903 /// extend into the load.
5904 ///
5905 /// E.g.,
5906 /// \code
5907 /// %ld = load i32* %addr
5908 /// %add = add nuw i32 %ld, 4
5909 /// %zext = zext i32 %add to i64
/// \endcode
5911 /// =>
5912 /// \code
5913 /// %ld = load i32* %addr
5914 /// %zext = zext i32 %ld to i64
5915 /// %add = add nuw i64 %zext, 4
/// \endcode
/// Note that the promotion of %add to i64 is done in tryToPromoteExts(),
/// which allows us to match zext(load i32*) to i64.
5919 ///
/// Also, try to promote the computations used to obtain a sign extended
/// value used in memory accesses.
5922 /// E.g.,
5923 /// \code
5924 /// a = add nsw i32 b, 3
5925 /// d = sext i32 a to i64
5926 /// e = getelementptr ..., i64 d
5927 /// \endcode
5928 /// =>
5929 /// \code
5930 /// f = sext i32 b to i64
5931 /// a = add nsw i64 f, 3
5932 /// e = getelementptr ..., i64 a
5933 /// \endcode
5934 ///
5935 /// \p Inst[in/out] the extension may be modified during the process if some
5936 /// promotions apply.
5937 bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
5938   bool AllowPromotionWithoutCommonHeader = false;
  /// See if it is an interesting sext operation for address type promotion
  /// before trying to promote it, e.g., one with the right type that is used
  /// in memory accesses.
5942   bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
5943       *Inst, AllowPromotionWithoutCommonHeader);
5944   TypePromotionTransaction TPT(RemovedInsts);
5945   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5946       TPT.getRestorationPoint();
5947   SmallVector<Instruction *, 1> Exts;
5948   SmallVector<Instruction *, 2> SpeculativelyMovedExts;
5949   Exts.push_back(Inst);
5950 
5951   bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
5952 
5953   // Look for a load being extended.
5954   LoadInst *LI = nullptr;
5955   Instruction *ExtFedByLoad;
5956 
  // Try to promote a chain of computation if it allows us to form an extended
  // load.
5959   if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
5960     assert(LI && ExtFedByLoad && "Expect a valid load and extension");
5961     TPT.commit();
5962     // Move the extend into the same block as the load.
5963     ExtFedByLoad->moveAfter(LI);
5964     ++NumExtsMoved;
5965     Inst = ExtFedByLoad;
5966     return true;
5967   }
5968 
5969   // Continue promoting SExts if known as considerable depending on targets.
5970   if (ATPConsiderable &&
5971       performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
5972                                   HasPromoted, TPT, SpeculativelyMovedExts))
5973     return true;
5974 
5975   TPT.rollback(LastKnownGood);
5976   return false;
5977 }
5978 
// Perform address type promotion if doing so is profitable.
// If AllowPromotionWithoutCommonHeader == false, we only promote the
// extension if we can find other sext instructions that sign extend the same
// initial value. However, if AllowPromotionWithoutCommonHeader == true, we
// consider promoting the extension profitable on its own.
5984 bool CodeGenPrepare::performAddressTypePromotion(
5985     Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
5986     bool HasPromoted, TypePromotionTransaction &TPT,
5987     SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
5988   bool Promoted = false;
5989   SmallPtrSet<Instruction *, 1> UnhandledExts;
5990   bool AllSeenFirst = true;
5991   for (auto *I : SpeculativelyMovedExts) {
5992     Value *HeadOfChain = I->getOperand(0);
5993     DenseMap<Value *, Instruction *>::iterator AlreadySeen =
5994         SeenChainsForSExt.find(HeadOfChain);
5995     // If there is an unhandled SExt which has the same header, try to promote
5996     // it as well.
5997     if (AlreadySeen != SeenChainsForSExt.end()) {
5998       if (AlreadySeen->second != nullptr)
5999         UnhandledExts.insert(AlreadySeen->second);
6000       AllSeenFirst = false;
6001     }
6002   }
6003 
6004   if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
6005                         SpeculativelyMovedExts.size() == 1)) {
6006     TPT.commit();
6007     if (HasPromoted)
6008       Promoted = true;
6009     for (auto *I : SpeculativelyMovedExts) {
6010       Value *HeadOfChain = I->getOperand(0);
6011       SeenChainsForSExt[HeadOfChain] = nullptr;
6012       ValToSExtendedUses[HeadOfChain].push_back(I);
6013     }
    // Update Inst as promotion happened.
6015     Inst = SpeculativelyMovedExts.pop_back_val();
6016   } else {
    // This is the first chain visited from the header, keep the current chain
    // as unhandled. Defer promoting it until we encounter another SExt chain
    // derived from the same header.
6020     for (auto *I : SpeculativelyMovedExts) {
6021       Value *HeadOfChain = I->getOperand(0);
6022       SeenChainsForSExt[HeadOfChain] = Inst;
6023     }
6024     return false;
6025   }
6026 
6027   if (!AllSeenFirst && !UnhandledExts.empty())
6028     for (auto *VisitedSExt : UnhandledExts) {
6029       if (RemovedInsts.count(VisitedSExt))
6030         continue;
6031       TypePromotionTransaction TPT(RemovedInsts);
6032       SmallVector<Instruction *, 1> Exts;
6033       SmallVector<Instruction *, 2> Chains;
6034       Exts.push_back(VisitedSExt);
6035       bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
6036       TPT.commit();
6037       if (HasPromoted)
6038         Promoted = true;
6039       for (auto *I : Chains) {
6040         Value *HeadOfChain = I->getOperand(0);
6041         // Mark this as handled.
6042         SeenChainsForSExt[HeadOfChain] = nullptr;
6043         ValToSExtendedUses[HeadOfChain].push_back(I);
6044       }
6045     }
6046   return Promoted;
6047 }
6048 
6049 bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
6050   BasicBlock *DefBB = I->getParent();
6051 
  // If the result of a {s|z}ext and its source are both live out, rewrite all
  // other uses of the source with the result of the extension.
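  //
  // For example (an illustrative sketch):
  //   bb0:
  //     %x = ...
  //     %z = zext i32 %x to i64
  //   bb1:
  //     use(%x)
  // becomes
  //   bb1:
  //     %t = trunc i64 %z to i32
  //     use(%t)
  // so that only %z, rather than both %x and %z, needs to be live out of bb0.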
6054   Value *Src = I->getOperand(0);
6055   if (Src->hasOneUse())
6056     return false;
6057 
6058   // Only do this xform if truncating is free.
6059   if (!TLI->isTruncateFree(I->getType(), Src->getType()))
6060     return false;
6061 
6062   // Only safe to perform the optimization if the source is also defined in
6063   // this block.
6064   if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
6065     return false;
6066 
6067   bool DefIsLiveOut = false;
6068   for (User *U : I->users()) {
6069     Instruction *UI = cast<Instruction>(U);
6070 
6071     // Figure out which BB this ext is used in.
6072     BasicBlock *UserBB = UI->getParent();
6073     if (UserBB == DefBB) continue;
6074     DefIsLiveOut = true;
6075     break;
6076   }
6077   if (!DefIsLiveOut)
6078     return false;
6079 
6080   // Make sure none of the uses are PHI nodes.
6081   for (User *U : Src->users()) {
6082     Instruction *UI = cast<Instruction>(U);
6083     BasicBlock *UserBB = UI->getParent();
6084     if (UserBB == DefBB) continue;
6085     // Be conservative. We don't want this xform to end up introducing
6086     // reloads just before load / store instructions.
6087     if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
6088       return false;
6089   }
6090 
6091   // InsertedTruncs - Only insert one trunc in each block once.
6092   DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
6093 
6094   bool MadeChange = false;
6095   for (Use &U : Src->uses()) {
6096     Instruction *User = cast<Instruction>(U.getUser());
6097 
6098     // Figure out which BB this ext is used in.
6099     BasicBlock *UserBB = User->getParent();
6100     if (UserBB == DefBB) continue;
6101 
6102     // Both src and def are live in this block. Rewrite the use.
6103     Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
6104 
6105     if (!InsertedTrunc) {
6106       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
6107       assert(InsertPt != UserBB->end());
6108       InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
6109       InsertedInsts.insert(InsertedTrunc);
6110     }
6111 
6112     // Replace a use of the {s|z}ext source with a use of the result.
6113     U = InsertedTrunc;
6114     ++NumExtUses;
6115     MadeChange = true;
6116   }
6117 
6118   return MadeChange;
6119 }
6120 
6121 // Find loads whose uses only use some of the loaded value's bits.  Add an "and"
6122 // just after the load if the target can fold this into one extload instruction,
6123 // with the hope of eliminating some of the other later "and" instructions using
6124 // the loaded value.  "and"s that are made trivially redundant by the insertion
6125 // of the new "and" are removed by this function, while others (e.g. those whose
6126 // path from the load goes through a phi) are left for isel to potentially
6127 // remove.
6128 //
6129 // For example:
6130 //
6131 // b0:
6132 //   x = load i32
6133 //   ...
6134 // b1:
6135 //   y = and x, 0xff
6136 //   z = use y
6137 //
6138 // becomes:
6139 //
6140 // b0:
6141 //   x = load i32
6142 //   x' = and x, 0xff
6143 //   ...
6144 // b1:
6145 //   z = use x'
6146 //
6147 // whereas:
6148 //
6149 // b0:
6150 //   x1 = load i32
6151 //   ...
6152 // b1:
6153 //   x2 = load i32
6154 //   ...
6155 // b2:
6156 //   x = phi x1, x2
6157 //   y = and x, 0xff
6158 //
6159 // becomes (after a call to optimizeLoadExt for each load):
6160 //
6161 // b0:
6162 //   x1 = load i32
6163 //   x1' = and x1, 0xff
6164 //   ...
6165 // b1:
6166 //   x2 = load i32
6167 //   x2' = and x2, 0xff
6168 //   ...
6169 // b2:
6170 //   x = phi x1', x2'
6171 //   y = and x, 0xff
6172 bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
6173   if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
6174     return false;
6175 
6176   // Skip loads we've already transformed.
6177   if (Load->hasOneUse() &&
6178       InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
6179     return false;
6180 
6181   // Look at all uses of Load, looking through phis, to determine how many bits
6182   // of the loaded value are needed.
6183   SmallVector<Instruction *, 8> WorkList;
6184   SmallPtrSet<Instruction *, 16> Visited;
6185   SmallVector<Instruction *, 8> AndsToMaybeRemove;
6186   for (auto *U : Load->users())
6187     WorkList.push_back(cast<Instruction>(U));
6188 
6189   EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
6190   unsigned BitWidth = LoadResultVT.getSizeInBits();
6191   APInt DemandBits(BitWidth, 0);
6192   APInt WidestAndBits(BitWidth, 0);
6193 
6194   while (!WorkList.empty()) {
6195     Instruction *I = WorkList.back();
6196     WorkList.pop_back();
6197 
6198     // Break use-def graph loops.
6199     if (!Visited.insert(I).second)
6200       continue;
6201 
6202     // For a PHI node, push all of its users.
6203     if (auto *Phi = dyn_cast<PHINode>(I)) {
6204       for (auto *U : Phi->users())
6205         WorkList.push_back(cast<Instruction>(U));
6206       continue;
6207     }
6208 
6209     switch (I->getOpcode()) {
6210     case Instruction::And: {
6211       auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
6212       if (!AndC)
6213         return false;
6214       APInt AndBits = AndC->getValue();
6215       DemandBits |= AndBits;
6216       // Keep track of the widest and mask we see.
6217       if (AndBits.ugt(WidestAndBits))
6218         WidestAndBits = AndBits;
6219       if (AndBits == WidestAndBits && I->getOperand(0) == Load)
6220         AndsToMaybeRemove.push_back(I);
6221       break;
6222     }
6223 
6224     case Instruction::Shl: {
6225       auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
6226       if (!ShlC)
6227         return false;
6228       uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
6229       DemandBits.setLowBits(BitWidth - ShiftAmt);
6230       break;
6231     }
6232 
6233     case Instruction::Trunc: {
6234       EVT TruncVT = TLI->getValueType(*DL, I->getType());
6235       unsigned TruncBitWidth = TruncVT.getSizeInBits();
6236       DemandBits.setLowBits(TruncBitWidth);
6237       break;
6238     }
6239 
6240     default:
6241       return false;
6242     }
6243   }
6244 
6245   uint32_t ActiveBits = DemandBits.getActiveBits();
6246   // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
6247   // target even if isLoadExtLegal says an i1 EXTLOAD is valid.  For example,
6248   // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
6249   // (and (load x) 1) is not matched as a single instruction, rather as a LDR
6250   // followed by an AND.
6251   // TODO: Look into removing this restriction by fixing backends to either
6252   // return false for isLoadExtLegal for i1 or have them select this pattern to
6253   // a single instruction.
6254   //
6255   // Also avoid hoisting if we didn't see any ands with the exact DemandBits
6256   // mask, since these are the only ands that will be removed by isel.
6257   if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
6258       WidestAndBits != DemandBits)
6259     return false;
6260 
6261   LLVMContext &Ctx = Load->getType()->getContext();
6262   Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
6263   EVT TruncVT = TLI->getValueType(*DL, TruncTy);
6264 
6265   // Reject cases that won't be matched as extloads.
6266   if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
6267       !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
6268     return false;
6269 
6270   IRBuilder<> Builder(Load->getNextNode());
6271   auto *NewAnd = cast<Instruction>(
6272       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
6273   // Mark this instruction as "inserted by CGP", so that other
6274   // optimizations don't touch it.
6275   InsertedInsts.insert(NewAnd);
6276 
6277   // Replace all uses of load with new and (except for the use of load in the
6278   // new and itself).
6279   Load->replaceAllUsesWith(NewAnd);
6280   NewAnd->setOperand(0, Load);
6281 
6282   // Remove any and instructions that are now redundant.
6283   for (auto *And : AndsToMaybeRemove)
6284     // Check that the and mask is the same as the one we decided to put on the
6285     // new and.
6286     if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
6287       And->replaceAllUsesWith(NewAnd);
6288       if (&*CurInstIterator == And)
6289         CurInstIterator = std::next(And->getIterator());
6290       And->eraseFromParent();
6291       ++NumAndUses;
6292     }
6293 
6294   ++NumAndsAdded;
6295   return true;
6296 }
6297 
6298 /// Check if V (an operand of a select instruction) is an expensive instruction
6299 /// that is only used once.
6300 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
6301   auto *I = dyn_cast<Instruction>(V);
6302   // If it's safe to speculatively execute, then it should not have side
6303   // effects; therefore, it's safe to sink and possibly *not* execute.
6304   return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
6305          TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
6306          TargetTransformInfo::TCC_Expensive;
6307 }
6308 
6309 /// Returns true if a SelectInst should be turned into an explicit branch.
6310 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
6311                                                 const TargetLowering *TLI,
6312                                                 SelectInst *SI) {
6313   // If even a predictable select is cheap, then a branch can't be cheaper.
6314   if (!TLI->isPredictableSelectExpensive())
6315     return false;
6316 
6317   // FIXME: This should use the same heuristics as IfConversion to determine
6318   // whether a select is better represented as a branch.
6319 
6320   // If metadata tells us that the select condition is obviously predictable,
6321   // then we want to replace the select with a branch.
6322   uint64_t TrueWeight, FalseWeight;
6323   if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
6324     uint64_t Max = std::max(TrueWeight, FalseWeight);
6325     uint64_t Sum = TrueWeight + FalseWeight;
6326     if (Sum != 0) {
6327       auto Probability = BranchProbability::getBranchProbability(Max, Sum);
6328       if (Probability > TLI->getPredictableBranchThreshold())
6329         return true;
6330     }
6331   }
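  // For example (illustrative numbers only), branch weights of 1000 and 1
  // give the hot side a probability of 1000/1001; if that exceeds the
  // target's predictable branch threshold, we prefer a branch.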
6332 
6333   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
6334 
6335   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
6336   // comparison condition. If the compare has more than one use, there's
6337   // probably another cmov or setcc around, so it's not worth emitting a branch.
6338   if (!Cmp || !Cmp->hasOneUse())
6339     return false;
6340 
6341   // If either operand of the select is expensive and only needed on one side
6342   // of the select, we should form a branch.
6343   if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
6344       sinkSelectOperand(TTI, SI->getFalseValue()))
6345     return true;
6346 
6347   return false;
6348 }
6349 
6350 /// If \p isTrue is true, return the true value of \p SI, otherwise return
6351 /// false value of \p SI. If the true/false value of \p SI is defined by any
6352 /// select instructions in \p Selects, look through the defining select
6353 /// instruction until the true/false value is not defined in \p Selects.
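///
/// For example (an illustrative sketch), with
/// \code
/// %s1 = select i1 %c, i32 %a, i32 %b
/// %s2 = select i1 %c, i32 %s1, i32 %d
/// \endcode
/// and both selects in \p Selects, asking for the true value of %s2 looks
/// through %s1 and returns %a.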
6354 static Value *getTrueOrFalseValue(
6355     SelectInst *SI, bool isTrue,
6356     const SmallPtrSet<const Instruction *, 2> &Selects) {
6357   Value *V = nullptr;
6358 
6359   for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
6360        DefSI = dyn_cast<SelectInst>(V)) {
6361     assert(DefSI->getCondition() == SI->getCondition() &&
6362            "The condition of DefSI does not match with SI");
6363     V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
6364   }
6365 
6366   assert(V && "Failed to get select true/false value");
6367   return V;
6368 }
6369 
6370 bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
6371   assert(Shift->isShift() && "Expected a shift");
6372 
6373   // If this is (1) a vector shift, (2) shifts by scalars are cheaper than
6374   // general vector shifts, and (3) the shift amount is a select-of-splatted
6375   // values, hoist the shifts before the select:
6376   //   shift Op0, (select Cond, TVal, FVal) -->
6377   //   select Cond, (shift Op0, TVal), (shift Op0, FVal)
6378   //
6379   // This is inverting a generic IR transform when we know that the cost of a
6380   // general vector shift is more than the cost of 2 shift-by-scalars.
6381   // We can't do this effectively in SDAG because we may not be able to
6382   // determine if the select operands are splats from within a basic block.
6383   Type *Ty = Shift->getType();
6384   if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
6385     return false;
6386   Value *Cond, *TVal, *FVal;
6387   if (!match(Shift->getOperand(1),
6388              m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
6389     return false;
6390   if (!isSplatValue(TVal) || !isSplatValue(FVal))
6391     return false;
6392 
6393   IRBuilder<> Builder(Shift);
6394   BinaryOperator::BinaryOps Opcode = Shift->getOpcode();
6395   Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);
6396   Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);
6397   Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
6398   Shift->replaceAllUsesWith(NewSel);
6399   Shift->eraseFromParent();
6400   return true;
6401 }
6402 
6403 bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
6404   Intrinsic::ID Opcode = Fsh->getIntrinsicID();
6405   assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
6406          "Expected a funnel shift");
6407 
6408   // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
6409   // than general vector shifts, and (3) the shift amount is select-of-splatted
6410   // values, hoist the funnel shifts before the select:
6411   //   fsh Op0, Op1, (select Cond, TVal, FVal) -->
6412   //   select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
6413   //
6414   // This is inverting a generic IR transform when we know that the cost of a
6415   // general vector shift is more than the cost of 2 shift-by-scalars.
6416   // We can't do this effectively in SDAG because we may not be able to
6417   // determine if the select operands are splats from within a basic block.
6418   Type *Ty = Fsh->getType();
6419   if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
6420     return false;
6421   Value *Cond, *TVal, *FVal;
6422   if (!match(Fsh->getOperand(2),
6423              m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
6424     return false;
6425   if (!isSplatValue(TVal) || !isSplatValue(FVal))
6426     return false;
6427 
6428   IRBuilder<> Builder(Fsh);
6429   Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);
6430   Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal });
6431   Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal });
6432   Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
6433   Fsh->replaceAllUsesWith(NewSel);
6434   Fsh->eraseFromParent();
6435   return true;
6436 }
6437 
6438 /// If we have a SelectInst that will likely profit from branch prediction,
6439 /// turn it into a branch.
6440 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
6441   // If branch conversion isn't desirable, exit early.
6442   if (DisableSelectToBranch || OptSize ||
6443       llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()))
6444     return false;
6445 
6446   // Find all consecutive select instructions that share the same condition.
6447   SmallVector<SelectInst *, 2> ASI;
6448   ASI.push_back(SI);
6449   for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
6450        It != SI->getParent()->end(); ++It) {
6451     SelectInst *I = dyn_cast<SelectInst>(&*It);
6452     if (I && SI->getCondition() == I->getCondition()) {
6453       ASI.push_back(I);
6454     } else {
6455       break;
6456     }
6457   }
6458 
6459   SelectInst *LastSI = ASI.back();
  // Increment the current iterator to skip all the rest of the select
  // instructions, because they will either all be lowered to branches or none
  // of them will be.
6462   CurInstIterator = std::next(LastSI->getIterator());
6463 
6464   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
6465 
  // Can we convert the 'select' to control flow?
6467   if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))
6468     return false;
6469 
6470   TargetLowering::SelectSupportKind SelectKind;
6471   if (VectorCond)
6472     SelectKind = TargetLowering::VectorMaskSelect;
6473   else if (SI->getType()->isVectorTy())
6474     SelectKind = TargetLowering::ScalarCondVectorVal;
6475   else
6476     SelectKind = TargetLowering::ScalarValSelect;
6477 
6478   if (TLI->isSelectSupported(SelectKind) &&
6479       !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
6480     return false;
6481 
6482   // The DominatorTree needs to be rebuilt by any consumers after this
6483   // transformation. We simply reset here rather than setting the ModifiedDT
6484   // flag to avoid restarting the function walk in runOnFunction for each
6485   // select optimized.
6486   DT.reset();
6487 
6488   // Transform a sequence like this:
6489   //    start:
6490   //       %cmp = cmp uge i32 %a, %b
6491   //       %sel = select i1 %cmp, i32 %c, i32 %d
6492   //
6493   // Into:
6494   //    start:
6495   //       %cmp = cmp uge i32 %a, %b
6496   //       %cmp.frozen = freeze %cmp
6497   //       br i1 %cmp.frozen, label %select.true, label %select.false
6498   //    select.true:
6499   //       br label %select.end
6500   //    select.false:
6501   //       br label %select.end
6502   //    select.end:
6503   //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
6504   //
6505   // %cmp should be frozen, otherwise it may introduce undefined behavior.
6506   // In addition, we may sink instructions that produce %c or %d from
6507   // the entry block into the destination(s) of the new branch.
6508   // If the true or false blocks do not contain a sunken instruction, that
6509   // block and its branch may be optimized away. In that case, one side of the
6510   // first branch will point directly to select.end, and the corresponding PHI
6511   // predecessor block will be the start block.
6512 
6513   // First, we split the block containing the select into 2 blocks.
6514   BasicBlock *StartBlock = SI->getParent();
6515   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
6516   BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
6517   BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
6518 
6519   // Delete the unconditional branch that was just created by the split.
6520   StartBlock->getTerminator()->eraseFromParent();
6521 
6522   // These are the new basic blocks for the conditional branch.
6523   // At least one will become an actual new basic block.
6524   BasicBlock *TrueBlock = nullptr;
6525   BasicBlock *FalseBlock = nullptr;
6526   BranchInst *TrueBranch = nullptr;
6527   BranchInst *FalseBranch = nullptr;
6528 
6529   // Sink expensive instructions into the conditional blocks to avoid executing
6530   // them speculatively.
6531   for (SelectInst *SI : ASI) {
6532     if (sinkSelectOperand(TTI, SI->getTrueValue())) {
6533       if (TrueBlock == nullptr) {
6534         TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
6535                                        EndBlock->getParent(), EndBlock);
6536         TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
6537         TrueBranch->setDebugLoc(SI->getDebugLoc());
6538       }
6539       auto *TrueInst = cast<Instruction>(SI->getTrueValue());
6540       TrueInst->moveBefore(TrueBranch);
6541     }
6542     if (sinkSelectOperand(TTI, SI->getFalseValue())) {
6543       if (FalseBlock == nullptr) {
6544         FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
6545                                         EndBlock->getParent(), EndBlock);
6546         FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6547         FalseBranch->setDebugLoc(SI->getDebugLoc());
6548       }
6549       auto *FalseInst = cast<Instruction>(SI->getFalseValue());
6550       FalseInst->moveBefore(FalseBranch);
6551     }
6552   }
6553 
6554   // If there was nothing to sink, then arbitrarily choose the 'false' side
6555   // for a new input value to the PHI.
6556   if (TrueBlock == FalseBlock) {
6557     assert(TrueBlock == nullptr &&
6558            "Unexpected basic block transform while optimizing select");
6559 
6560     FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
6561                                     EndBlock->getParent(), EndBlock);
6562     auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6563     FalseBranch->setDebugLoc(SI->getDebugLoc());
6564   }
6565 
6566   // Insert the real conditional branch based on the original condition.
6567   // If we did not create a new block for one of the 'true' or 'false' paths
6568   // of the condition, it means that side of the branch goes to the end block
6569   // directly and the path originates from the start block from the point of
6570   // view of the new PHI.
6571   BasicBlock *TT, *FT;
6572   if (TrueBlock == nullptr) {
6573     TT = EndBlock;
6574     FT = FalseBlock;
6575     TrueBlock = StartBlock;
6576   } else if (FalseBlock == nullptr) {
6577     TT = TrueBlock;
6578     FT = EndBlock;
6579     FalseBlock = StartBlock;
6580   } else {
6581     TT = TrueBlock;
6582     FT = FalseBlock;
6583   }
6584   IRBuilder<> IB(SI);
6585   auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
6586   IB.CreateCondBr(CondFr, TT, FT, SI);
6587 
6588   SmallPtrSet<const Instruction *, 2> INS;
6589   INS.insert(ASI.begin(), ASI.end());
  // Use a reverse iterator because a later select may use the value of an
  // earlier select, and we need to propagate the value through the earlier
  // select to get the PHI operand.
6593   for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
6594     SelectInst *SI = *It;
6595     // The select itself is replaced with a PHI Node.
6596     PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
6597     PN->takeName(SI);
6598     PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
6599     PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
6600     PN->setDebugLoc(SI->getDebugLoc());
6601 
6602     SI->replaceAllUsesWith(PN);
6603     SI->eraseFromParent();
6604     INS.erase(SI);
6605     ++NumSelectsExpanded;
6606   }
6607 
6608   // Instruct OptimizeBlock to skip to the next block.
6609   CurInstIterator = StartBlock->end();
6610   return true;
6611 }
6612 
/// Some targets only accept certain types for splat inputs. For example a VDUP
/// in MVE takes a GPR (integer) register, and instructions that incorporate a
/// VDUP (such as a VADD qd, qm, rm) also require a GPR register.
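///
/// For example (an illustrative sketch, assuming shouldConvertSplatType asks
/// for i32 on a <4 x float> splat):
/// \code
/// %i = insertelement <4 x float> undef, float %f, i32 0
/// %s = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
/// \endcode
/// becomes
/// \code
/// %b = bitcast float %f to i32
/// %i = insertelement <4 x i32> undef, i32 %b, i64 0
/// %s = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
/// %r = bitcast <4 x i32> %s to <4 x float>
/// \endcode
/// so that the splatted value can be kept in a GPR until the final bitcast.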
6616 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
6617   if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
6618                             m_Undef(), m_ZeroMask())))
6619     return false;
6620   Type *NewType = TLI->shouldConvertSplatType(SVI);
6621   if (!NewType)
6622     return false;
6623 
6624   VectorType *SVIVecType = cast<VectorType>(SVI->getType());
6625   assert(!NewType->isVectorTy() && "Expected a scalar type!");
6626   assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
6627          "Expected a type of the same size!");
6628   auto *NewVecType =
6629       FixedVectorType::get(NewType, SVIVecType->getNumElements());
6630 
6631   // Create a bitcast (shuffle (insert (bitcast(..))))
6632   IRBuilder<> Builder(SVI->getContext());
6633   Builder.SetInsertPoint(SVI);
6634   Value *BC1 = Builder.CreateBitCast(
6635       cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
6636   Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
6637                                               (uint64_t)0);
6638   Value *Shuffle = Builder.CreateShuffleVector(
6639       Insert, UndefValue::get(NewVecType), SVI->getShuffleMask());
6640   Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
6641 
6642   SVI->replaceAllUsesWith(BC2);
6643   RecursivelyDeleteTriviallyDeadInstructions(SVI);
6644 
  // Also hoist the bitcast up to its operand if they are not in the same
  // block.
6647   if (auto *BCI = dyn_cast<Instruction>(BC1))
6648     if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
6649       if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
6650           !Op->isTerminator() && !Op->isEHPad())
6651         BCI->moveAfter(Op);
6652 
6653   return true;
6654 }
6655 
6656 bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
6657   // If the operands of I can be folded into a target instruction together with
6658   // I, duplicate and sink them.
6659   SmallVector<Use *, 4> OpsToSink;
6660   if (!TLI->shouldSinkOperands(I, OpsToSink))
6661     return false;
6662 
6663   // OpsToSink can contain multiple uses in a use chain (e.g.
6664   // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
6665   // uses must come first, so we process the ops in reverse order so as to not
6666   // create invalid IR.
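  //
  // For illustration (a sketch; the exact patterns depend on the target's
  // shouldSinkOperands hook): with
  //   %s = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
  //   %z = zext <4 x i16> %s to <4 x i32>
  // and a user "mul <4 x i32> %a, %z" in another block, both %s and %z are
  // cloned next to the multiply so the backend can fold the whole chain into
  // a single instruction.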
6667   BasicBlock *TargetBB = I->getParent();
6668   bool Changed = false;
6669   SmallVector<Use *, 4> ToReplace;
6670   for (Use *U : reverse(OpsToSink)) {
6671     auto *UI = cast<Instruction>(U->get());
6672     if (UI->getParent() == TargetBB || isa<PHINode>(UI))
6673       continue;
6674     ToReplace.push_back(U);
6675   }
6676 
6677   SetVector<Instruction *> MaybeDead;
6678   DenseMap<Instruction *, Instruction *> NewInstructions;
6679   Instruction *InsertPoint = I;
6680   for (Use *U : ToReplace) {
6681     auto *UI = cast<Instruction>(U->get());
6682     Instruction *NI = UI->clone();
6683     NewInstructions[UI] = NI;
6684     MaybeDead.insert(UI);
6685     LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
6686     NI->insertBefore(InsertPoint);
6687     InsertPoint = NI;
6688     InsertedInsts.insert(NI);
6689 
6690     // Update the use for the new instruction, making sure that we update the
6691     // sunk instruction uses, if it is part of a chain that has already been
6692     // sunk.
6693     Instruction *OldI = cast<Instruction>(U->getUser());
6694     if (NewInstructions.count(OldI))
6695       NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
6696     else
6697       U->set(NI);
6698     Changed = true;
6699   }
6700 
6701   // Remove instructions that are dead after sinking.
6702   for (auto *I : MaybeDead) {
6703     if (!I->hasNUsesOrMore(1)) {
6704       LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
6705       I->eraseFromParent();
6706     }
6707   }
6708 
6709   return Changed;
6710 }
6711 
6712 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
6713   Value *Cond = SI->getCondition();
6714   Type *OldType = Cond->getType();
6715   LLVMContext &Context = Cond->getContext();
6716   MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
6717   unsigned RegWidth = RegType.getSizeInBits();
6718 
6719   if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
6720     return false;
6721 
6722   // If the register width is greater than the type width, expand the condition
6723   // of the switch instruction and each case constant to the width of the
6724   // register. By widening the type of the switch condition, subsequent
6725   // comparisons (for case comparisons) will not need to be extended to the
6726   // preferred register width, so we will potentially eliminate N-1 extends,
6727   // where N is the number of cases in the switch.
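  //
  // For illustration (a sketch): with 32-bit registers, a switch on an i8
  //   switch i8 %c, label %def [ i8 1, label %bb1
  //                              i8 2, label %bb2 ]
  // becomes
  //   %c.ext = zext i8 %c to i32
  //   switch i32 %c.ext, label %def [ i32 1, label %bb1
  //                                   i32 2, label %bb2 ]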
6728   auto *NewType = Type::getIntNTy(Context, RegWidth);
6729 
6730   // Zero-extend the switch condition and case constants unless the switch
6731   // condition is a function argument that is already being sign-extended.
6732   // In that case, we can avoid an unnecessary mask/extension by sign-extending
6733   // everything instead.
6734   Instruction::CastOps ExtType = Instruction::ZExt;
6735   if (auto *Arg = dyn_cast<Argument>(Cond))
6736     if (Arg->hasSExtAttr())
6737       ExtType = Instruction::SExt;
6738 
6739   auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
6740   ExtInst->insertBefore(SI);
6741   ExtInst->setDebugLoc(SI->getDebugLoc());
6742   SI->setCondition(ExtInst);
6743   for (auto Case : SI->cases()) {
6744     APInt NarrowConst = Case.getCaseValue()->getValue();
6745     APInt WideConst = (ExtType == Instruction::ZExt) ?
6746                       NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
6747     Case.setValue(ConstantInt::get(Context, WideConst));
6748   }
6749 
6750   return true;
6751 }
6752 
6753 
6754 namespace {
6755 
6756 /// Helper class to promote a scalar operation to a vector one.
/// This class is used to move an extractelement transition downward.
6758 /// E.g.,
6759 /// a = vector_op <2 x i32>
6760 /// b = extractelement <2 x i32> a, i32 0
6761 /// c = scalar_op b
6762 /// store c
6763 ///
6764 /// =>
6765 /// a = vector_op <2 x i32>
6766 /// c = vector_op a (equivalent to scalar_op on the related lane)
6767 /// * d = extractelement <2 x i32> c, i32 0
6768 /// * store d
/// Assuming both the extractelement and the store can be combined, we get rid
/// of the transition.
6771 class VectorPromoteHelper {
6772   /// DataLayout associated with the current module.
6773   const DataLayout &DL;
6774 
6775   /// Used to perform some checks on the legality of vector operations.
6776   const TargetLowering &TLI;
6777 
  /// Used to estimate the cost of the promoted chain.
6779   const TargetTransformInfo &TTI;
6780 
6781   /// The transition being moved downwards.
6782   Instruction *Transition;
6783 
6784   /// The sequence of instructions to be promoted.
6785   SmallVector<Instruction *, 4> InstsToBePromoted;
6786 
6787   /// Cost of combining a store and an extract.
6788   unsigned StoreExtractCombineCost;
6789 
6790   /// Instruction that will be combined with the transition.
6791   Instruction *CombineInst = nullptr;
6792 
6793   /// The instruction that represents the current end of the transition.
6794   /// Since we are faking the promotion until we reach the end of the chain
6795   /// of computation, we need a way to get the current end of the transition.
6796   Instruction *getEndOfTransition() const {
6797     if (InstsToBePromoted.empty())
6798       return Transition;
6799     return InstsToBePromoted.back();
6800   }
6801 
6802   /// Return the index of the original value in the transition.
6803   /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
6804   /// c, is at index 0.
6805   unsigned getTransitionOriginalValueIdx() const {
6806     assert(isa<ExtractElementInst>(Transition) &&
           "Other kinds of transitions are not supported yet");
6808     return 0;
6809   }
6810 
6811   /// Return the index of the index in the transition.
6812   /// E.g., for "extractelement <2 x i32> c, i32 0" the index
6813   /// is at index 1.
6814   unsigned getTransitionIdx() const {
6815     assert(isa<ExtractElementInst>(Transition) &&
           "Other kinds of transitions are not supported yet");
6817     return 1;
6818   }
6819 
6820   /// Get the type of the transition.
6821   /// This is the type of the original value.
6822   /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
6823   /// transition is <2 x i32>.
6824   Type *getTransitionType() const {
6825     return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
6826   }
6827 
  /// Promote \p ToBePromoted by moving \p Def downward through it.
6829   /// I.e., we have the following sequence:
6830   /// Def = Transition <ty1> a to <ty2>
6831   /// b = ToBePromoted <ty2> Def, ...
6832   /// =>
6833   /// b = ToBePromoted <ty1> a, ...
6834   /// Def = Transition <ty1> ToBePromoted to <ty2>
6835   void promoteImpl(Instruction *ToBePromoted);
6836 
6837   /// Check whether or not it is profitable to promote all the
6838   /// instructions enqueued to be promoted.
6839   bool isProfitableToPromote() {
6840     Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
6841     unsigned Index = isa<ConstantInt>(ValIdx)
6842                          ? cast<ConstantInt>(ValIdx)->getZExtValue()
6843                          : -1;
6844     Type *PromotedType = getTransitionType();
6845 
6846     StoreInst *ST = cast<StoreInst>(CombineInst);
6847     unsigned AS = ST->getPointerAddressSpace();
6848     unsigned Align = ST->getAlignment();
6849     // Check if this store is supported.
6850     if (!TLI.allowsMisalignedMemoryAccesses(
6851             TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
6852             Align)) {
6853       // If this is not supported, there is no way we can combine
6854       // the extract with the store.
6855       return false;
6856     }
6857 
    // The scalar chain of computation has to pay for the transition
    // from scalar to vector.
    // The vector chain has to account for the combining cost.
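    // Roughly:
    //   ScalarCost = cost(extract) + sum(scalar op costs)
    //   VectorCost = StoreExtractCombineCost + sum(vector op costs)
    // and promotion is considered profitable only when ScalarCost > VectorCost.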
6861     uint64_t ScalarCost =
6862         TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
6863     uint64_t VectorCost = StoreExtractCombineCost;
6864     enum TargetTransformInfo::TargetCostKind CostKind =
6865       TargetTransformInfo::TCK_RecipThroughput;
6866     for (const auto &Inst : InstsToBePromoted) {
6867       // Compute the cost.
6868       // By construction, all instructions being promoted are arithmetic ones.
6869       // Moreover, one argument is a constant that can be viewed as a splat
6870       // constant.
6871       Value *Arg0 = Inst->getOperand(0);
6872       bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
6873                             isa<ConstantFP>(Arg0);
6874       TargetTransformInfo::OperandValueKind Arg0OVK =
6875           IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
6876                          : TargetTransformInfo::OK_AnyValue;
6877       TargetTransformInfo::OperandValueKind Arg1OVK =
6878           !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
6879                           : TargetTransformInfo::OK_AnyValue;
6880       ScalarCost += TTI.getArithmeticInstrCost(
6881           Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK);
6882       VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
6883                                                CostKind,
6884                                                Arg0OVK, Arg1OVK);
6885     }
6886     LLVM_DEBUG(
6887         dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
6888                << ScalarCost << "\nVector: " << VectorCost << '\n');
6889     return ScalarCost > VectorCost;
6890   }
6891 
6892   /// Generate a constant vector with \p Val with the same
6893   /// number of elements as the transition.
6894   /// \p UseSplat defines whether or not \p Val should be replicated
6895   /// across the whole vector.
6896   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
  /// otherwise we generate a vector with as many undef elements as possible:
6898   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
6899   /// used at the index of the extract.
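  /// E.g., for a <4 x i32> transition extracting index 2 and Val == 7:
  ///   UseSplat == true  -> <i32 7, i32 7, i32 7, i32 7>
  ///   UseSplat == false -> <i32 undef, i32 undef, i32 7, i32 undef>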
6900   Value *getConstantVector(Constant *Val, bool UseSplat) const {
6901     unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
6902     if (!UseSplat) {
6903       // If we cannot determine where the constant must be, we have to
6904       // use a splat constant.
6905       Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
6906       if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
6907         ExtractIdx = CstVal->getSExtValue();
6908       else
6909         UseSplat = true;
6910     }
6911 
6912     ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount();
6913     if (UseSplat)
6914       return ConstantVector::getSplat(EC, Val);
6915 
6916     if (!EC.Scalable) {
6917       SmallVector<Constant *, 4> ConstVec;
6918       UndefValue *UndefVal = UndefValue::get(Val->getType());
6919       for (unsigned Idx = 0; Idx != EC.Min; ++Idx) {
6920         if (Idx == ExtractIdx)
6921           ConstVec.push_back(Val);
6922         else
6923           ConstVec.push_back(UndefVal);
6924       }
6925       return ConstantVector::get(ConstVec);
6926     } else
6927       llvm_unreachable(
6928           "Generate scalable vector for non-splat is unimplemented");
6929   }
6930 
  /// Check if promoting the operand at \p OperandIdx in \p Use to a vector
  /// type can trigger undefined behavior.
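  /// E.g., promoting the divisor of "udiv i32 %a, %b" would fill the unused
  /// lanes with undef and could introduce a division by zero.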
6933   static bool canCauseUndefinedBehavior(const Instruction *Use,
6934                                         unsigned OperandIdx) {
    // It is not safe to introduce undef when the operand is on
    // the right-hand side of a division-like instruction.
6937     if (OperandIdx != 1)
6938       return false;
6939     switch (Use->getOpcode()) {
6940     default:
6941       return false;
6942     case Instruction::SDiv:
6943     case Instruction::UDiv:
6944     case Instruction::SRem:
6945     case Instruction::URem:
6946       return true;
6947     case Instruction::FDiv:
6948     case Instruction::FRem:
6949       return !Use->hasNoNaNs();
6950     }
6951     llvm_unreachable(nullptr);
6952   }
6953 
6954 public:
6955   VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
6956                       const TargetTransformInfo &TTI, Instruction *Transition,
6957                       unsigned CombineCost)
6958       : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
6959         StoreExtractCombineCost(CombineCost) {
6960     assert(Transition && "Do not know how to promote null");
6961   }
6962 
6963   /// Check if we can promote \p ToBePromoted to \p Type.
6964   bool canPromote(const Instruction *ToBePromoted) const {
6965     // We could support CastInst too.
6966     return isa<BinaryOperator>(ToBePromoted);
6967   }
6968 
  /// Check if it is profitable to promote \p ToBePromoted
  /// by moving the transition downward through it.
6971   bool shouldPromote(const Instruction *ToBePromoted) const {
6972     // Promote only if all the operands can be statically expanded.
6973     // Indeed, we do not want to introduce any new kind of transitions.
6974     for (const Use &U : ToBePromoted->operands()) {
6975       const Value *Val = U.get();
6976       if (Val == getEndOfTransition()) {
6977         // If the use is a division and the transition is on the rhs,
6978         // we cannot promote the operation, otherwise we may create a
6979         // division by zero.
6980         if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
6981           return false;
6982         continue;
6983       }
6984       if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
6985           !isa<ConstantFP>(Val))
6986         return false;
6987     }
6988     // Check that the resulting operation is legal.
6989     int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
6990     if (!ISDOpcode)
6991       return false;
6992     return StressStoreExtract ||
6993            TLI.isOperationLegalOrCustom(
6994                ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
6995   }
6996 
6997   /// Check whether or not \p Use can be combined
6998   /// with the transition.
6999   /// I.e., is it possible to do Use(Transition) => AnotherUse?
7000   bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
7001 
7002   /// Record \p ToBePromoted as part of the chain to be promoted.
7003   void enqueueForPromotion(Instruction *ToBePromoted) {
7004     InstsToBePromoted.push_back(ToBePromoted);
7005   }
7006 
7007   /// Set the instruction that will be combined with the transition.
7008   void recordCombineInstruction(Instruction *ToBeCombined) {
7009     assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
7010     CombineInst = ToBeCombined;
7011   }
7012 
  /// Promote all the instructions enqueued for promotion if it is
  /// profitable.
7015   /// \return True if the promotion happened, false otherwise.
7016   bool promote() {
7017     // Check if there is something to promote.
7018     // Right now, if we do not have anything to combine with,
7019     // we assume the promotion is not profitable.
7020     if (InstsToBePromoted.empty() || !CombineInst)
7021       return false;
7022 
7023     // Check cost.
7024     if (!StressStoreExtract && !isProfitableToPromote())
7025       return false;
7026 
7027     // Promote.
7028     for (auto &ToBePromoted : InstsToBePromoted)
7029       promoteImpl(ToBePromoted);
7030     InstsToBePromoted.clear();
7031     return true;
7032   }
7033 };
7034 
7035 } // end anonymous namespace
7036 
7037 void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
7038   // At this point, we know that all the operands of ToBePromoted but Def
7039   // can be statically promoted.
7040   // For Def, we need to use its parameter in ToBePromoted:
7041   // b = ToBePromoted ty1 a
7042   // Def = Transition ty1 b to ty2
7043   // Move the transition down.
7044   // 1. Replace all uses of the promoted operation by the transition.
7045   // = ... b => = ... Def.
7046   assert(ToBePromoted->getType() == Transition->getType() &&
7047          "The type of the result of the transition does not match "
7048          "the final type");
7049   ToBePromoted->replaceAllUsesWith(Transition);
7050   // 2. Update the type of the uses.
7051   // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
7052   Type *TransitionTy = getTransitionType();
7053   ToBePromoted->mutateType(TransitionTy);
7054   // 3. Update all the operands of the promoted operation with promoted
7055   // operands.
7056   // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
7057   for (Use &U : ToBePromoted->operands()) {
7058     Value *Val = U.get();
7059     Value *NewVal = nullptr;
7060     if (Val == Transition)
7061       NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
7062     else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
7063              isa<ConstantFP>(Val)) {
7064       // Use a splat constant if it is not safe to use undef.
7065       NewVal = getConstantVector(
7066           cast<Constant>(Val),
7067           isa<UndefValue>(Val) ||
7068               canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
7069     } else
      llvm_unreachable("Did you modify shouldPromote and forget to update "
                       "this?");
7072     ToBePromoted->setOperand(U.getOperandNo(), NewVal);
7073   }
7074   Transition->moveAfter(ToBePromoted);
7075   Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
7076 }
7077 
7078 /// Some targets can do store(extractelement) with one instruction.
7079 /// Try to push the extractelement towards the stores when the target
7080 /// has this feature and this is profitable.
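///
/// For illustration (a sketch of the intended rewrite):
///   %e = extractelement <2 x i32> %v, i32 1
///   %a = add i32 %e, 1
///   store i32 %a, i32* %p
/// becomes
///   %a.v = add <2 x i32> %v, <i32 undef, i32 1>
///   %e2  = extractelement <2 x i32> %a.v, i32 1
///   store i32 %e2, i32* %p
/// so that the final extract can be folded into the store.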
7081 bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
7082   unsigned CombineCost = std::numeric_limits<unsigned>::max();
7083   if (DisableStoreExtract ||
7084       (!StressStoreExtract &&
7085        !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
7086                                        Inst->getOperand(1), CombineCost)))
7087     return false;
7088 
7089   // At this point we know that Inst is a vector to scalar transition.
7090   // Try to move it down the def-use chain, until:
7091   // - We can combine the transition with its single use
7092   //   => we got rid of the transition.
7093   // - We escape the current basic block
  //   => we would need to check that we are moving it to a cheaper place and
7095   //      we do not do that for now.
7096   BasicBlock *Parent = Inst->getParent();
7097   LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
7098   VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
7099   // If the transition has more than one use, assume this is not going to be
7100   // beneficial.
7101   while (Inst->hasOneUse()) {
7102     Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
7103     LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
7104 
7105     if (ToBePromoted->getParent() != Parent) {
7106       LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
7107                         << ToBePromoted->getParent()->getName()
7108                         << ") than the transition (" << Parent->getName()
7109                         << ").\n");
7110       return false;
7111     }
7112 
7113     if (VPH.canCombine(ToBePromoted)) {
7114       LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
7115                         << "will be combined with: " << *ToBePromoted << '\n');
7116       VPH.recordCombineInstruction(ToBePromoted);
7117       bool Changed = VPH.promote();
7118       NumStoreExtractExposed += Changed;
7119       return Changed;
7120     }
7121 
7122     LLVM_DEBUG(dbgs() << "Try promoting.\n");
7123     if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
7124       return false;
7125 
7126     LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
7127 
7128     VPH.enqueueForPromotion(ToBePromoted);
7129     Inst = ToBePromoted;
7130   }
7131   return false;
7132 }
7133 
/// For the store instruction sequence below, the F and I values
/// are bundled together as an i64 value before being stored into memory.
7136 /// Sometimes it is more efficient to generate separate stores for F and I,
7137 /// which can remove the bitwise instructions or sink them to colder places.
7138 ///
7139 ///   (store (or (zext (bitcast F to i32) to i64),
7140 ///              (shl (zext I to i64), 32)), addr)  -->
7141 ///   (store F, addr) and (store I, addr+4)
7142 ///
/// Similarly, splitting for other merged stores can also be beneficial, like:
7144 /// For pair of {i32, i32}, i64 store --> two i32 stores.
7145 /// For pair of {i32, i16}, i64 store --> two i32 stores.
7146 /// For pair of {i16, i16}, i32 store --> two i16 stores.
7147 /// For pair of {i16, i8},  i32 store --> two i16 stores.
7148 /// For pair of {i8, i8},   i16 store --> two i8 stores.
7149 ///
7150 /// We allow each target to determine specifically which kind of splitting is
7151 /// supported.
7152 ///
/// The store patterns are commonly seen from the simple code snippet below
/// if only std::make_pair(...) is SROA transformed before being inlined into
/// hoo.
7155 ///   void goo(const std::pair<int, float> &);
7156 ///   hoo() {
7157 ///     ...
7158 ///     goo(std::make_pair(tmp, ftmp));
7159 ///     ...
7160 ///   }
7161 ///
/// Although we already have similar splitting in DAG Combine, we duplicate
/// it in CodeGenPrepare to catch the case in which the pattern spans
/// multiple BBs. The logic in DAG Combine is kept to catch cases generated
/// during code expansion.
7166 static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
7167                                 const TargetLowering &TLI) {
7168   // Handle simple but common cases only.
7169   Type *StoreType = SI.getValueOperand()->getType();
7170 
7171   // The code below assumes shifting a value by <number of bits>,
7172   // whereas scalable vectors would have to be shifted by
7173   // <2log(vscale) + number of bits> in order to store the
7174   // low/high parts. Bailing out for now.
7175   if (isa<ScalableVectorType>(StoreType))
7176     return false;
7177 
7178   if (!DL.typeSizeEqualsStoreSize(StoreType) ||
7179       DL.getTypeSizeInBits(StoreType) == 0)
7180     return false;
7181 
7182   unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
7183   Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
7184   if (!DL.typeSizeEqualsStoreSize(SplitStoreType))
7185     return false;
7186 
7187   // Don't split the store if it is volatile.
7188   if (SI.isVolatile())
7189     return false;
7190 
  // Match the following patterns:
  // (store (or (zext LValue to i64),
  //            (shl (zext HValue to i64), HalfValBitSize)), addr)
  //  or
  // (store (or (shl (zext HValue to i64), HalfValBitSize),
  //            (zext LValue to i64)), addr)
  // Expect both operands of the OR and the first operand of the SHL to have
  // only one use.
7199   Value *LValue, *HValue;
7200   if (!match(SI.getValueOperand(),
7201              m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
7202                     m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
7203                                    m_SpecificInt(HalfValBitSize))))))
7204     return false;
7205 
  // Check LValue and HValue are integers with size less than or equal to
  // HalfValBitSize.
7207   if (!LValue->getType()->isIntegerTy() ||
7208       DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
7209       !HValue->getType()->isIntegerTy() ||
7210       DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
7211     return false;
7212 
7213   // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
7214   // as the input of target query.
7215   auto *LBC = dyn_cast<BitCastInst>(LValue);
7216   auto *HBC = dyn_cast<BitCastInst>(HValue);
7217   EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
7218                   : EVT::getEVT(LValue->getType());
7219   EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
7220                    : EVT::getEVT(HValue->getType());
7221   if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
7222     return false;
7223 
7224   // Start to split store.
7225   IRBuilder<> Builder(SI.getContext());
7226   Builder.SetInsertPoint(&SI);
7227 
  // If LValue/HValue is a bitcast in another BB, create a new one in the
  // current BB so it may be merged with the split stores by the DAG combiner.
7230   if (LBC && LBC->getParent() != SI.getParent())
7231     LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
7232   if (HBC && HBC->getParent() != SI.getParent())
7233     HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
7234 
7235   bool IsLE = SI.getModule()->getDataLayout().isLittleEndian();
7236   auto CreateSplitStore = [&](Value *V, bool Upper) {
7237     V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
7238     Value *Addr = Builder.CreateBitCast(
7239         SI.getOperand(1),
7240         SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
7241     Align Alignment = SI.getAlign();
7242     const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
7243     if (IsOffsetStore) {
7244       Addr = Builder.CreateGEP(
7245           SplitStoreType, Addr,
7246           ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
7247 
7248       // When splitting the store in half, naturally one half will retain the
7249       // alignment of the original wider store, regardless of whether it was
7250       // over-aligned or not, while the other will require adjustment.
7251       Alignment = commonAlignment(Alignment, HalfValBitSize / 8);
7252     }
7253     Builder.CreateAlignedStore(V, Addr, Alignment);
7254   };
7255 
7256   CreateSplitStore(LValue, false);
7257   CreateSplitStore(HValue, true);
7258 
7259   // Delete the old store.
7260   SI.eraseFromParent();
7261   return true;
7262 }
7263 
7264 // Return true if the GEP has two operands, the first operand is of a sequential
7265 // type, and the second operand is a constant.
7266 static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
7267   gep_type_iterator I = gep_type_begin(*GEP);
7268   return GEP->getNumOperands() == 2 &&
7269       I.isSequential() &&
7270       isa<ConstantInt>(GEP->getOperand(1));
7271 }
7272 
7273 // Try unmerging GEPs to reduce liveness interference (register pressure) across
7274 // IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
7275 // reducing liveness interference across those edges benefits global register
7276 // allocation. Currently handles only certain cases.
7277 //
7278 // For example, unmerge %GEPI and %UGEPI as below.
7279 //
7280 // ---------- BEFORE ----------
7281 // SrcBlock:
7282 //   ...
7283 //   %GEPIOp = ...
7284 //   ...
7285 //   %GEPI = gep %GEPIOp, Idx
7286 //   ...
7287 //   indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
7288 //   (* %GEPI is alive on the indirectbr edges due to other uses ahead)
//   (* %GEPIOp is alive on the indirectbr edges only because it is used by
//   %UGEPI)
7291 //
7292 // DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
7293 // DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
7294 // ...
7295 //
7296 // DstBi:
7297 //   ...
7298 //   %UGEPI = gep %GEPIOp, UIdx
7299 // ...
7300 // ---------------------------
7301 //
7302 // ---------- AFTER ----------
7303 // SrcBlock:
7304 //   ... (same as above)
7305 //    (* %GEPI is still alive on the indirectbr edges)
7306 //    (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
7307 //    unmerging)
7308 // ...
7309 //
7310 // DstBi:
7311 //   ...
7312 //   %UGEPI = gep %GEPI, (UIdx-Idx)
7313 //   ...
7314 // ---------------------------
7315 //
7316 // The register pressure on the IndirectBr edges is reduced because %GEPIOp is
7317 // no longer alive on them.
7318 //
// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
// of GEPs in the first place in InstCombiner::visitGetElementPtrInst(), so as
// not to disable further simplifications and optimizations as a result of GEP
// merging.
7323 //
7324 // Note this unmerging may increase the length of the data flow critical path
7325 // (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
7326 // between the register pressure and the length of data-flow critical
7327 // path. Restricting this to the uncommon IndirectBr case would minimize the
7328 // impact of potentially longer critical path, if any, and the impact on compile
7329 // time.
7330 static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
7331                                              const TargetTransformInfo *TTI) {
7332   BasicBlock *SrcBlock = GEPI->getParent();
7333   // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
7334   // (non-IndirectBr) cases exit early here.
7335   if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
7336     return false;
7337   // Check that GEPI is a simple gep with a single constant index.
7338   if (!GEPSequentialConstIndexed(GEPI))
7339     return false;
7340   ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
7341   // Check that GEPI is a cheap one.
7342   if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
7343                          TargetTransformInfo::TCK_SizeAndLatency)
7344       > TargetTransformInfo::TCC_Basic)
7345     return false;
7346   Value *GEPIOp = GEPI->getOperand(0);
7347   // Check that GEPIOp is an instruction that's also defined in SrcBlock.
7348   if (!isa<Instruction>(GEPIOp))
7349     return false;
7350   auto *GEPIOpI = cast<Instruction>(GEPIOp);
7351   if (GEPIOpI->getParent() != SrcBlock)
7352     return false;
7353   // Check that GEP is used outside the block, meaning it's alive on the
7354   // IndirectBr edge(s).
7355   if (find_if(GEPI->users(), [&](User *Usr) {
7356         if (auto *I = dyn_cast<Instruction>(Usr)) {
7357           if (I->getParent() != SrcBlock) {
7358             return true;
7359           }
7360         }
7361         return false;
7362       }) == GEPI->users().end())
7363     return false;
7364   // The second elements of the GEP chains to be unmerged.
7365   std::vector<GetElementPtrInst *> UGEPIs;
  // Check each user of GEPIOp to see if unmerging would make GEPIOp not alive
  // on IndirectBr edges.
7368   for (User *Usr : GEPIOp->users()) {
7369     if (Usr == GEPI) continue;
7370     // Check if Usr is an Instruction. If not, give up.
7371     if (!isa<Instruction>(Usr))
7372       return false;
7373     auto *UI = cast<Instruction>(Usr);
    // If Usr is in the same block as GEPIOp, that is fine; skip it.
7375     if (UI->getParent() == SrcBlock)
7376       continue;
7377     // Check if Usr is a GEP. If not, give up.
7378     if (!isa<GetElementPtrInst>(Usr))
7379       return false;
7380     auto *UGEPI = cast<GetElementPtrInst>(Usr);
7381     // Check if UGEPI is a simple gep with a single constant index and GEPIOp is
7382     // the pointer operand to it. If so, record it in the vector. If not, give
7383     // up.
7384     if (!GEPSequentialConstIndexed(UGEPI))
7385       return false;
7386     if (UGEPI->getOperand(0) != GEPIOp)
7387       return false;
7388     if (GEPIIdx->getType() !=
7389         cast<ConstantInt>(UGEPI->getOperand(1))->getType())
7390       return false;
7391     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7392     if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
7393                            TargetTransformInfo::TCK_SizeAndLatency)
7394         > TargetTransformInfo::TCC_Basic)
7395       return false;
7396     UGEPIs.push_back(UGEPI);
7397   }
7398   if (UGEPIs.size() == 0)
7399     return false;
7400   // Check the materializing cost of (Uidx-Idx).
7401   for (GetElementPtrInst *UGEPI : UGEPIs) {
7402     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7403     APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
7404     unsigned ImmCost =
7405       TTI->getIntImmCost(NewIdx, GEPIIdx->getType(),
7406                          TargetTransformInfo::TCK_SizeAndLatency);
7407     if (ImmCost > TargetTransformInfo::TCC_Basic)
7408       return false;
7409   }
7410   // Now unmerge between GEPI and UGEPIs.
7411   for (GetElementPtrInst *UGEPI : UGEPIs) {
7412     UGEPI->setOperand(0, GEPI);
7413     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7414     Constant *NewUGEPIIdx =
7415         ConstantInt::get(GEPIIdx->getType(),
7416                          UGEPIIdx->getValue() - GEPIIdx->getValue());
7417     UGEPI->setOperand(1, NewUGEPIIdx);
7418     // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
7419     // inbounds to avoid UB.
7420     if (!GEPI->isInBounds()) {
7421       UGEPI->setIsInBounds(false);
7422     }
7423   }
7424   // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
7425   // alive on IndirectBr edges).
7426   assert(find_if(GEPIOp->users(), [&](User *Usr) {
7427         return cast<Instruction>(Usr)->getParent() != SrcBlock;
7428       }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
7429   return true;
7430 }
7431 
7432 bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
7433   // Bail out if we inserted the instruction to prevent optimizations from
7434   // stepping on each other's toes.
7435   if (InsertedInsts.count(I))
7436     return false;
7437 
7438   // TODO: Move into the switch on opcode below here.
7439   if (PHINode *P = dyn_cast<PHINode>(I)) {
7440     // It is possible for very late stage optimizations (such as SimplifyCFG)
7441     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
7442     // trivial PHI, go ahead and zap it here.
7443     if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
7444       LargeOffsetGEPMap.erase(P);
7445       P->replaceAllUsesWith(V);
7446       P->eraseFromParent();
7447       ++NumPHIsElim;
7448       return true;
7449     }
7450     return false;
7451   }
7452 
7453   if (CastInst *CI = dyn_cast<CastInst>(I)) {
7454     // If the source of the cast is a constant, then this should have
7455     // already been constant folded.  The only reason NOT to constant fold
7456     // it is if something (e.g. LSR) was careful to place the constant
    // evaluation in a block other than the one that uses it (e.g. to hoist
7458     // the address of globals out of a loop).  If this is the case, we don't
7459     // want to forward-subst the cast.
7460     if (isa<Constant>(CI->getOperand(0)))
7461       return false;
7462 
7463     if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
7464       return true;
7465 
7466     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
7467       /// Sink a zext or sext into its user blocks if the target type doesn't
7468       /// fit in one register
7469       if (TLI->getTypeAction(CI->getContext(),
7470                              TLI->getValueType(*DL, CI->getType())) ==
7471           TargetLowering::TypeExpandInteger) {
7472         return SinkCast(CI);
7473       } else {
7474         bool MadeChange = optimizeExt(I);
7475         return MadeChange | optimizeExtUses(I);
7476       }
7477     }
7478     return false;
7479   }
7480 
7481   if (auto *Cmp = dyn_cast<CmpInst>(I))
7482     if (optimizeCmp(Cmp, ModifiedDT))
7483       return true;
7484 
7485   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
7486     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
7487     bool Modified = optimizeLoadExt(LI);
7488     unsigned AS = LI->getPointerAddressSpace();
7489     Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
7490     return Modified;
7491   }
7492 
7493   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
7494     if (splitMergedValStore(*SI, *DL, *TLI))
7495       return true;
7496     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
7497     unsigned AS = SI->getPointerAddressSpace();
7498     return optimizeMemoryInst(I, SI->getOperand(1),
7499                               SI->getOperand(0)->getType(), AS);
7500   }
7501 
7502   if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
7503       unsigned AS = RMW->getPointerAddressSpace();
7504       return optimizeMemoryInst(I, RMW->getPointerOperand(),
7505                                 RMW->getType(), AS);
7506   }
7507 
7508   if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
7509       unsigned AS = CmpX->getPointerAddressSpace();
7510       return optimizeMemoryInst(I, CmpX->getPointerOperand(),
7511                                 CmpX->getCompareOperand()->getType(), AS);
7512   }
7513 
7514   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
7515 
7516   if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking)
7517     return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
7518 
7519   // TODO: Move this into the switch on opcode - it handles shifts already.
7520   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
7521                 BinOp->getOpcode() == Instruction::LShr)) {
7522     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
7523     if (CI && TLI->hasExtractBitsInsn())
7524       if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
7525         return true;
7526   }
7527 
7528   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
7529     if (GEPI->hasAllZeroIndices()) {
7530       /// The GEP operand must be a pointer, so must its result -> BitCast
7531       Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
7532                                         GEPI->getName(), GEPI);
7533       NC->setDebugLoc(GEPI->getDebugLoc());
7534       GEPI->replaceAllUsesWith(NC);
7535       GEPI->eraseFromParent();
7536       ++NumGEPsElim;
7537       optimizeInst(NC, ModifiedDT);
7538       return true;
7539     }
7540     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
7541       return true;
7542     }
7543     return false;
7544   }
7545 
7546   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
    // freeze(icmp a, const) -> icmp (freeze a), const
7548     // This helps generate efficient conditional jumps.
7549     Instruction *CmpI = nullptr;
7550     if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
7551       CmpI = II;
7552     else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
7553       CmpI = F->getFastMathFlags().none() ? F : nullptr;
7554 
7555     if (CmpI && CmpI->hasOneUse()) {
7556       auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
7557       bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
7558                     isa<ConstantPointerNull>(Op0);
7559       bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
7560                     isa<ConstantPointerNull>(Op1);
7561       if (Const0 || Const1) {
7562         if (!Const0 || !Const1) {
7563           auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
7564           F->takeName(FI);
7565           CmpI->setOperand(Const0 ? 1 : 0, F);
7566         }
7567         FI->replaceAllUsesWith(CmpI);
7568         FI->eraseFromParent();
7569         return true;
7570       }
7571     }
7572     return false;
7573   }
7574 
7575   if (tryToSinkFreeOperands(I))
7576     return true;
7577 
7578   switch (I->getOpcode()) {
7579   case Instruction::Shl:
7580   case Instruction::LShr:
7581   case Instruction::AShr:
7582     return optimizeShiftInst(cast<BinaryOperator>(I));
7583   case Instruction::Call:
7584     return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
7585   case Instruction::Select:
7586     return optimizeSelectInst(cast<SelectInst>(I));
7587   case Instruction::ShuffleVector:
7588     return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
7589   case Instruction::Switch:
7590     return optimizeSwitchInst(cast<SwitchInst>(I));
7591   case Instruction::ExtractElement:
7592     return optimizeExtractElementInst(cast<ExtractElementInst>(I));
7593   }
7594 
7595   return false;
7596 }
7597 
7598 /// Given an OR instruction, check to see if this is a bitreverse
7599 /// idiom. If so, insert the new intrinsic and return true.
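///
/// For illustration (a sketch): a chain of shifts, ands and ors that reverses
/// the bits of an i32 value %x is collapsed into a single
///   %rev = call i32 @llvm.bitreverse.i32(i32 %x)
/// provided the target handles ISD::BITREVERSE legally or with custom lowering.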
7600 static bool makeBitReverse(Instruction &I, const DataLayout &DL,
7601                            const TargetLowering &TLI) {
7602   if (!I.getType()->isIntegerTy() ||
7603       !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
7604                                     TLI.getValueType(DL, I.getType(), true)))
7605     return false;
7606 
7607   SmallVector<Instruction*, 4> Insts;
7608   if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
7609     return false;
7610   Instruction *LastInst = Insts.back();
7611   I.replaceAllUsesWith(LastInst);
7612   RecursivelyDeleteTriviallyDeadInstructions(&I);
7613   return true;
7614 }
7615 
7616 // In this pass we look for GEP and cast instructions that are used
7617 // across basic blocks and rewrite them to improve basic-block-at-a-time
7618 // selection.
7619 bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
7620   SunkAddrs.clear();
7621   bool MadeChange = false;
7622 
7623   CurInstIterator = BB.begin();
7624   while (CurInstIterator != BB.end()) {
7625     MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
7626     if (ModifiedDT)
7627       return true;
7628   }
7629 
7630   bool MadeBitReverse = true;
7631   while (MadeBitReverse) {
7632     MadeBitReverse = false;
7633     for (auto &I : reverse(BB)) {
7634       if (makeBitReverse(I, *DL, *TLI)) {
7635         MadeBitReverse = MadeChange = true;
7636         break;
7637       }
7638     }
7639   }
7640   MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);
7641 
7642   return MadeChange;
7643 }
7644 
7645 // Some CGP optimizations may move or alter what's computed in a block. Check
7646 // whether a dbg.value intrinsic could be pointed at a more appropriate operand.
7647 bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
7648   assert(isa<DbgValueInst>(I));
7649   DbgValueInst &DVI = *cast<DbgValueInst>(I);
7650 
7651   // Does this dbg.value refer to a sunk address calculation?
7652   Value *Location = DVI.getVariableLocation();
7653   WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
7654   Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
7655   if (SunkAddr) {
7656     // Point dbg.value at locally computed address, which should give the best
7657     // opportunity to be accurately lowered. This update may change the type of
7658     // pointer being referred to; however this makes no difference to debugging
7659     // information, and we can't generate bitcasts that may affect codegen.
7660     DVI.setOperand(0, MetadataAsValue::get(DVI.getContext(),
7661                                            ValueAsMetadata::get(SunkAddr)));
7662     return true;
7663   }
7664   return false;
7665 }
7666 
7667 // A llvm.dbg.value may be using a value before its definition, due to
7668 // optimizations in this pass and others. Scan for such dbg.values, and rescue
7669 // them by moving the dbg.value to immediately after the value definition.
7670 // FIXME: Ideally this should never be necessary, and this has the potential
7671 // to re-order dbg.value intrinsics.
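//
// For illustration (a sketch):
//   call void @llvm.dbg.value(metadata i32 %x, ...)
//   %x = add i32 %a, %b
// is rewritten so that the dbg.value immediately follows the definition:
//   %x = add i32 %a, %b
//   call void @llvm.dbg.value(metadata i32 %x, ...)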
7672 bool CodeGenPrepare::placeDbgValues(Function &F) {
7673   bool MadeChange = false;
7674   DominatorTree DT(F);
7675 
7676   for (BasicBlock &BB : F) {
7677     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
7678       Instruction *Insn = &*BI++;
7679       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
7680       if (!DVI)
7681         continue;
7682 
7683       Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
7684 
7685       if (!VI || VI->isTerminator())
7686         continue;
7687 
7688       // If VI is a phi in a block with an EHPad terminator, we can't insert
7689       // after it.
7690       if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
7691         continue;
7692 
7693       // If the defining instruction dominates the dbg.value, we do not need
7694       // to move the dbg.value.
7695       if (DT.dominates(VI, DVI))
7696         continue;
7697 
7698       LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
7699                         << *DVI << ' ' << *VI);
7700       DVI->removeFromParent();
7701       if (isa<PHINode>(VI))
7702         DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
7703       else
7704         DVI->insertAfter(VI);
7705       MadeChange = true;
7706       ++NumDbgValueMoved;
7707     }
7708   }
7709   return MadeChange;
7710 }
7711 
7712 /// Scale down both weights to fit into uint32_t.
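/// E.g., NewTrue = 2^33 and NewFalse = 2^32 give Scale = (2^33 / (2^32-1)) + 1
/// = 3, so both weights are divided by 3 and their ratio is roughly preserved.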
7713 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
7714   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
7715   uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
7716   NewTrue = NewTrue / Scale;
7717   NewFalse = NewFalse / Scale;
7718 }
7719 
7720 /// Some targets prefer to split a conditional branch like:
7721 /// \code
7722 ///   %0 = icmp ne i32 %a, 0
7723 ///   %1 = icmp ne i32 %b, 0
7724 ///   %or.cond = or i1 %0, %1
7725 ///   br i1 %or.cond, label %TrueBB, label %FalseBB
7726 /// \endcode
7727 /// into multiple branch instructions like:
7728 /// \code
7729 ///   bb1:
7730 ///     %0 = icmp ne i32 %a, 0
7731 ///     br i1 %0, label %TrueBB, label %bb2
7732 ///   bb2:
7733 ///     %1 = icmp ne i32 %b, 0
7734 ///     br i1 %1, label %TrueBB, label %FalseBB
7735 /// \endcode
7736 /// This usually allows instruction selection to do even further optimizations
7737 /// and combine the compare with the branch instruction. Currently this is
7738 /// applied for targets which have "cheap" jump instructions.
7739 ///
7740 /// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
7741 ///
7742 bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
7743   if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
7744     return false;
7745 
7746   bool MadeChange = false;
7747   for (auto &BB : F) {
7748     // Does this BB end with the following?
7749     //   %cond1 = icmp|fcmp|binary instruction ...
7750     //   %cond2 = icmp|fcmp|binary instruction ...
    //   %cond.or = or|and i1 %cond1, %cond2
    //   br i1 %cond.or, label %dest1, label %dest2
7753     BinaryOperator *LogicOp;
7754     BasicBlock *TBB, *FBB;
7755     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
7756       continue;
7757 
7758     auto *Br1 = cast<BranchInst>(BB.getTerminator());
7759     if (Br1->getMetadata(LLVMContext::MD_unpredictable))
7760       continue;
7761 
    // The merging of mostly empty BBs can cause a degenerate branch.
7763     if (TBB == FBB)
7764       continue;
7765 
7766     unsigned Opc;
7767     Value *Cond1, *Cond2;
7768     if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
7769                              m_OneUse(m_Value(Cond2)))))
7770       Opc = Instruction::And;
7771     else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
7772                                  m_OneUse(m_Value(Cond2)))))
7773       Opc = Instruction::Or;
7774     else
7775       continue;
7776 
7777     if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
7778         !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp()))   )
7779       continue;
7780 
7781     LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
7782 
7783     // Create a new BB.
7784     auto *TmpBB =
7785         BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
7786                            BB.getParent(), BB.getNextNode());
7787 
    // Update the original basic block: make the branch instruction use the
    // first condition directly and remove the no longer needed and/or
    // instruction.
7790     Br1->setCondition(Cond1);
7791     LogicOp->eraseFromParent();
7792 
7793     // Depending on the condition we have to either replace the true or the
7794     // false successor of the original branch instruction.
7795     if (Opc == Instruction::And)
7796       Br1->setSuccessor(0, TmpBB);
7797     else
7798       Br1->setSuccessor(1, TmpBB);
7799 
7800     // Fill in the new basic block.
7801     auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
7802     if (auto *I = dyn_cast<Instruction>(Cond2)) {
7803       I->removeFromParent();
7804       I->insertBefore(Br2);
7805     }
7806 
    // Update PHI nodes in both successors. The original BB needs to be
    // replaced in one successor's PHI nodes, because the branch now comes from
    // the newly generated BB (TmpBB). In the other successor we need to add one
    // incoming edge to the PHI nodes, because both branch instructions now
    // target the same successor. Depending on the original branch condition
7812     // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
7813     // we perform the correct update for the PHI nodes.
7814     // This doesn't change the successor order of the just created branch
7815     // instruction (or any other instruction).
7816     if (Opc == Instruction::Or)
7817       std::swap(TBB, FBB);
7818 
7819     // Replace the old BB with the new BB.
7820     TBB->replacePhiUsesWith(&BB, TmpBB);
7821 
    // Add another incoming edge from the new BB (TmpBB).
7823     for (PHINode &PN : FBB->phis()) {
7824       auto *Val = PN.getIncomingValueForBlock(&BB);
7825       PN.addIncoming(Val, TmpBB);
7826     }
7827 
7828     // Update the branch weights (from SelectionDAGBuilder::
7829     // FindMergedConditions).
7830     if (Opc == Instruction::Or) {
7831       // Codegen X | Y as:
7832       // BB1:
7833       //   jmp_if_X TBB
7834       //   jmp TmpBB
7835       // TmpBB:
7836       //   jmp_if_Y TBB
7837       //   jmp FBB
7838       //
7839 
7840       // We have flexibility in setting Prob for BB1 and Prob for NewBB.
7841       // The requirement is that
7842       //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
7843       //     = TrueProb for original BB.
7844       // Assuming the original weights are A and B, one choice is to set BB1's
7845       // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
7846       // assumes that
7847       //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
7848       // Another choice is to assume TrueProb for BB1 equals to TrueProb for
7849       // TmpBB, but the math is more complicated.
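      //
      // For illustration, with A = 1 and B = 3 the original TrueProb is 1/4;
      // BB1 gets weights {1, 7} (TrueProb 1/8), TmpBB gets weights {1, 6}
      // (TrueProb 1/7), and 1/8 + 7/8 * 1/7 = 1/4 as required.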
7850       uint64_t TrueWeight, FalseWeight;
7851       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
7852         uint64_t NewTrueWeight = TrueWeight;
7853         uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
7854         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7857 
7858         NewTrueWeight = TrueWeight;
7859         NewFalseWeight = 2 * FalseWeight;
7860         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7863       }
7864     } else {
7865       // Codegen X & Y as:
7866       // BB1:
7867       //   jmp_if_X TmpBB
7868       //   jmp FBB
7869       // TmpBB:
7870       //   jmp_if_Y TBB
7871       //   jmp FBB
7872       //
7873       //  This requires creation of TmpBB after CurBB.
7874 
7875       // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
7876       // The requirement is that
7877       //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
7878       //     = FalseProb for original BB.
7879       // Assuming the original weights are A and B, one choice is to set BB1's
7880       // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
7881       // assumes that
7882       //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
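      //
      // For illustration, with A = 3 and B = 1 the original FalseProb is 1/4;
      // BB1 gets weights {7, 1} (FalseProb 1/8), TmpBB gets weights {6, 1}
      // (FalseProb 1/7), and 1/8 + 7/8 * 1/7 = 1/4 as required.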
7883       uint64_t TrueWeight, FalseWeight;
7884       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
7885         uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
7886         uint64_t NewFalseWeight = FalseWeight;
7887         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7890 
7891         NewTrueWeight = 2 * TrueWeight;
7892         NewFalseWeight = FalseWeight;
7893         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7896       }
7897     }
7898 
7899     ModifiedDT = true;
7900     MadeChange = true;
7901 
7902     LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
7903                TmpBB->dump());
7904   }
7905   return MadeChange;
7906 }
7907