1 //===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
11 // basic-block-at-a-time approach. It should eventually be removed.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/APInt.h"
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/MapVector.h"
19 #include "llvm/ADT/PointerIntPair.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/SmallPtrSet.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/BlockFrequencyInfo.h"
25 #include "llvm/Analysis/BranchProbabilityInfo.h"
26 #include "llvm/Analysis/ConstantFolding.h"
27 #include "llvm/Analysis/InstructionSimplify.h"
28 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/Analysis/MemoryBuiltins.h"
30 #include "llvm/Analysis/ProfileSummaryInfo.h"
31 #include "llvm/Analysis/TargetLibraryInfo.h"
32 #include "llvm/Analysis/TargetTransformInfo.h"
33 #include "llvm/Analysis/ValueTracking.h"
34 #include "llvm/Analysis/VectorUtils.h"
35 #include "llvm/CodeGen/Analysis.h"
36 #include "llvm/CodeGen/ISDOpcodes.h"
37 #include "llvm/CodeGen/SelectionDAGNodes.h"
38 #include "llvm/CodeGen/TargetLowering.h"
39 #include "llvm/CodeGen/TargetPassConfig.h"
40 #include "llvm/CodeGen/TargetSubtargetInfo.h"
41 #include "llvm/CodeGen/ValueTypes.h"
42 #include "llvm/Config/llvm-config.h"
43 #include "llvm/IR/Argument.h"
44 #include "llvm/IR/Attributes.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/Constants.h"
48 #include "llvm/IR/DataLayout.h"
49 #include "llvm/IR/DerivedTypes.h"
50 #include "llvm/IR/Dominators.h"
51 #include "llvm/IR/Function.h"
52 #include "llvm/IR/GetElementPtrTypeIterator.h"
53 #include "llvm/IR/GlobalValue.h"
54 #include "llvm/IR/GlobalVariable.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/InlineAsm.h"
57 #include "llvm/IR/InstrTypes.h"
58 #include "llvm/IR/Instruction.h"
59 #include "llvm/IR/Instructions.h"
60 #include "llvm/IR/IntrinsicInst.h"
61 #include "llvm/IR/Intrinsics.h"
62 #include "llvm/IR/IntrinsicsAArch64.h"
63 #include "llvm/IR/LLVMContext.h"
64 #include "llvm/IR/MDBuilder.h"
65 #include "llvm/IR/Module.h"
66 #include "llvm/IR/Operator.h"
67 #include "llvm/IR/PatternMatch.h"
68 #include "llvm/IR/Statepoint.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/IR/Use.h"
71 #include "llvm/IR/User.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/IR/ValueHandle.h"
74 #include "llvm/IR/ValueMap.h"
75 #include "llvm/InitializePasses.h"
76 #include "llvm/Pass.h"
77 #include "llvm/Support/BlockFrequency.h"
78 #include "llvm/Support/BranchProbability.h"
79 #include "llvm/Support/Casting.h"
80 #include "llvm/Support/CommandLine.h"
81 #include "llvm/Support/Compiler.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/ErrorHandling.h"
84 #include "llvm/Support/MachineValueType.h"
85 #include "llvm/Support/MathExtras.h"
86 #include "llvm/Support/raw_ostream.h"
87 #include "llvm/Target/TargetMachine.h"
88 #include "llvm/Target/TargetOptions.h"
89 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
90 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
91 #include "llvm/Transforms/Utils/Local.h"
92 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
93 #include "llvm/Transforms/Utils/SizeOpts.h"
94 #include <algorithm>
95 #include <cassert>
96 #include <cstdint>
97 #include <iterator>
98 #include <limits>
99 #include <memory>
100 #include <utility>
101 #include <vector>
102 
103 using namespace llvm;
104 using namespace llvm::PatternMatch;
105 
106 #define DEBUG_TYPE "codegenprepare"
107 
108 STATISTIC(NumBlocksElim, "Number of blocks eliminated");
109 STATISTIC(NumPHIsElim,   "Number of trivial PHIs eliminated");
110 STATISTIC(NumGEPsElim,   "Number of GEPs converted to casts");
111 STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
112                       "sunken Cmps");
113 STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
114                        "of sunken Casts");
115 STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
116                           "computations were sunk");
117 STATISTIC(NumMemoryInstsPhiCreated,
118           "Number of phis created when address "
119           "computations were sunk to memory instructions");
120 STATISTIC(NumMemoryInstsSelectCreated,
121           "Number of select created when address "
122           "computations were sunk to memory instructions");
123 STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
124 STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
125 STATISTIC(NumAndsAdded,
126           "Number of and mask instructions added to form ext loads");
127 STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
128 STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
129 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
130 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
131 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
132 
133 static cl::opt<bool> DisableBranchOpts(
134   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
135   cl::desc("Disable branch optimizations in CodeGenPrepare"));
136 
137 static cl::opt<bool>
138     DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
139                   cl::desc("Disable GC optimizations in CodeGenPrepare"));
140 
141 static cl::opt<bool> DisableSelectToBranch(
142   "disable-cgp-select2branch", cl::Hidden, cl::init(false),
143   cl::desc("Disable select to branch conversion."));
144 
145 static cl::opt<bool> AddrSinkUsingGEPs(
146   "addr-sink-using-gep", cl::Hidden, cl::init(true),
147   cl::desc("Address sinking in CGP using GEPs."));
148 
149 static cl::opt<bool> EnableAndCmpSinking(
150    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
151    cl::desc("Enable sinkinig and/cmp into branches."));
152 
153 static cl::opt<bool> DisableStoreExtract(
154     "disable-cgp-store-extract", cl::Hidden, cl::init(false),
155     cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
156 
157 static cl::opt<bool> StressStoreExtract(
158     "stress-cgp-store-extract", cl::Hidden, cl::init(false),
159     cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
160 
161 static cl::opt<bool> DisableExtLdPromotion(
162     "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
163     cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
164              "CodeGenPrepare"));
165 
166 static cl::opt<bool> StressExtLdPromotion(
167     "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
168     cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
169              "optimization in CodeGenPrepare"));
170 
171 static cl::opt<bool> DisablePreheaderProtect(
172     "disable-preheader-prot", cl::Hidden, cl::init(false),
173     cl::desc("Disable protection against removing loop preheaders"));
174 
175 static cl::opt<bool> ProfileGuidedSectionPrefix(
176     "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
177     cl::desc("Use profile info to add section prefix for hot/cold functions"));
178 
179 static cl::opt<bool> ProfileUnknownInSpecialSection(
180     "profile-unknown-in-special-section", cl::Hidden, cl::init(false),
181     cl::ZeroOrMore,
182     cl::desc("In profiling mode like sampleFDO, if a function doesn't have "
183              "profile, we cannot tell the function is cold for sure because "
184              "it may be a function newly added without ever being sampled. "
185              "With the flag enabled, compiler can put such profile unknown "
186              "functions into a special section, so runtime system can choose "
187              "to handle it in a different way than .text section, to save "
188              "RAM for example. "));
189 
190 static cl::opt<unsigned> FreqRatioToSkipMerge(
191     "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
192     cl::desc("Skip merging empty blocks if (frequency of empty block) / "
193              "(frequency of destination block) is greater than this ratio"));
194 
195 static cl::opt<bool> ForceSplitStore(
196     "force-split-store", cl::Hidden, cl::init(false),
197     cl::desc("Force store splitting no matter what the target query says."));
198 
199 static cl::opt<bool>
200 EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
201     cl::desc("Enable merging of redundant sexts when one is dominating"
202     " the other."), cl::init(true));
203 
204 static cl::opt<bool> DisableComplexAddrModes(
205     "disable-complex-addr-modes", cl::Hidden, cl::init(false),
206     cl::desc("Disables combining addressing modes with different parts "
207              "in optimizeMemoryInst."));
208 
209 static cl::opt<bool>
210 AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
211                 cl::desc("Allow creation of Phis in Address sinking."));
212 
213 static cl::opt<bool>
214 AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true),
215                    cl::desc("Allow creation of selects in Address sinking."));
216 
217 static cl::opt<bool> AddrSinkCombineBaseReg(
218     "addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
219     cl::desc("Allow combining of BaseReg field in Address sinking."));
220 
221 static cl::opt<bool> AddrSinkCombineBaseGV(
222     "addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
223     cl::desc("Allow combining of BaseGV field in Address sinking."));
224 
225 static cl::opt<bool> AddrSinkCombineBaseOffs(
226     "addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
227     cl::desc("Allow combining of BaseOffs field in Address sinking."));
228 
229 static cl::opt<bool> AddrSinkCombineScaledReg(
230     "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
231     cl::desc("Allow combining of ScaledReg field in Address sinking."));
232 
233 static cl::opt<bool>
234     EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
235                          cl::init(true),
236                          cl::desc("Enable splitting large offset of GEP."));
237 
238 static cl::opt<bool> EnableICMP_EQToICMP_ST(
239     "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
240     cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));
241 
242 static cl::opt<bool>
243     VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
244                      cl::desc("Enable BFI update verification for "
245                               "CodeGenPrepare."));
246 
247 static cl::opt<bool> OptimizePhiTypes(
248     "cgp-optimize-phi-types", cl::Hidden, cl::init(false),
249     cl::desc("Enable converting phi types in CodeGenPrepare"));
250 
251 namespace {
252 
253 enum ExtType {
254   ZeroExtension,   // Zero extension has been seen.
255   SignExtension,   // Sign extension has been seen.
256   BothExtension    // This extension type is used if we saw sext after
257                    // ZeroExtension had been set, or if we saw zext after
258                    // SignExtension had been set. It makes the type
259                    // information of a promoted instruction invalid.
260 };
261 
262 using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
263 using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
264 using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
265 using SExts = SmallVector<Instruction *, 16>;
266 using ValueToSExts = DenseMap<Value *, SExts>;
267 
268 class TypePromotionTransaction;
269 
270   class CodeGenPrepare : public FunctionPass {
271     const TargetMachine *TM = nullptr;
272     const TargetSubtargetInfo *SubtargetInfo;
273     const TargetLowering *TLI = nullptr;
274     const TargetRegisterInfo *TRI;
275     const TargetTransformInfo *TTI = nullptr;
276     const TargetLibraryInfo *TLInfo;
277     const LoopInfo *LI;
278     std::unique_ptr<BlockFrequencyInfo> BFI;
279     std::unique_ptr<BranchProbabilityInfo> BPI;
280     ProfileSummaryInfo *PSI;
281 
282     /// As we scan instructions optimizing them, this is the next instruction
283     /// to optimize. Transforms that can invalidate this should update it.
284     BasicBlock::iterator CurInstIterator;
285 
286     /// Keeps track of non-local addresses that have been sunk into a block.
287     /// This allows us to avoid inserting duplicate code for blocks with
288     /// multiple load/stores of the same address. The usage of WeakTrackingVH
289     /// enables SunkAddrs to be treated as a cache whose entries can be
290     /// invalidated if a sunken address computation has been erased.
291     ValueMap<Value*, WeakTrackingVH> SunkAddrs;
292 
293     /// Keeps track of all instructions inserted for the current function.
294     SetOfInstrs InsertedInsts;
295 
    /// Keeps track of the type each instruction had before its promotion,
    /// for the current function.
298     InstrToOrigTy PromotedInsts;
299 
300     /// Keep track of instructions removed during promotion.
301     SetOfInstrs RemovedInsts;
302 
303     /// Keep track of sext chains based on their initial value.
304     DenseMap<Value *, Instruction *> SeenChainsForSExt;
305 
306     /// Keep track of GEPs accessing the same data structures such as structs or
307     /// arrays that are candidates to be split later because of their large
308     /// size.
309     MapVector<
310         AssertingVH<Value>,
311         SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
312         LargeOffsetGEPMap;
313 
    /// Keep track of the new GEP bases created after splitting GEPs with
    /// large offsets.
315     SmallSet<AssertingVH<Value>, 2> NewGEPBases;
316 
    /// Map each large-offset GEP to its serial number.
318     DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;
319 
    /// Keep track of the sext instructions that have been promoted, keyed by
    /// the value they extend.
321     ValueToSExts ValToSExtendedUses;
322 
323     /// True if the function has the OptSize attribute.
324     bool OptSize;
325 
326     /// DataLayout for the Function being processed.
327     const DataLayout *DL = nullptr;
328 
329     /// Building the dominator tree can be expensive, so we only build it
330     /// lazily and update it when required.
331     std::unique_ptr<DominatorTree> DT;
332 
333   public:
334     static char ID; // Pass identification, replacement for typeid
335 
336     CodeGenPrepare() : FunctionPass(ID) {
337       initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
338     }
339 
340     bool runOnFunction(Function &F) override;
341 
342     StringRef getPassName() const override { return "CodeGen Prepare"; }
343 
344     void getAnalysisUsage(AnalysisUsage &AU) const override {
345       // FIXME: When we can selectively preserve passes, preserve the domtree.
346       AU.addRequired<ProfileSummaryInfoWrapperPass>();
347       AU.addRequired<TargetLibraryInfoWrapperPass>();
348       AU.addRequired<TargetPassConfig>();
349       AU.addRequired<TargetTransformInfoWrapperPass>();
350       AU.addRequired<LoopInfoWrapperPass>();
351     }
352 
353   private:
354     template <typename F>
355     void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
356       // Substituting can cause recursive simplifications, which can invalidate
357       // our iterator.  Use a WeakTrackingVH to hold onto it in case this
358       // happens.
359       Value *CurValue = &*CurInstIterator;
360       WeakTrackingVH IterHandle(CurValue);
361 
362       f();
363 
364       // If the iterator instruction was recursively deleted, start over at the
365       // start of the block.
366       if (IterHandle != CurValue) {
367         CurInstIterator = BB->begin();
368         SunkAddrs.clear();
369       }
370     }
371 
372     // Get the DominatorTree, building if necessary.
373     DominatorTree &getDT(Function &F) {
374       if (!DT)
375         DT = std::make_unique<DominatorTree>(F);
376       return *DT;
377     }
378 
379     void removeAllAssertingVHReferences(Value *V);
380     bool eliminateFallThrough(Function &F);
381     bool eliminateMostlyEmptyBlocks(Function &F);
382     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
383     bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
384     void eliminateMostlyEmptyBlock(BasicBlock *BB);
385     bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
386                                        bool isPreheader);
387     bool makeBitReverse(Instruction &I);
388     bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
389     bool optimizeInst(Instruction *I, bool &ModifiedDT);
390     bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
391                             Type *AccessTy, unsigned AddrSpace);
392     bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
393     bool optimizeInlineAsmInst(CallInst *CS);
394     bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
395     bool optimizeExt(Instruction *&I);
396     bool optimizeExtUses(Instruction *I);
397     bool optimizeLoadExt(LoadInst *Load);
398     bool optimizeShiftInst(BinaryOperator *BO);
399     bool optimizeFunnelShift(IntrinsicInst *Fsh);
400     bool optimizeSelectInst(SelectInst *SI);
401     bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
402     bool optimizeSwitchInst(SwitchInst *SI);
403     bool optimizeExtractElementInst(Instruction *Inst);
404     bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
405     bool fixupDbgValue(Instruction *I);
406     bool placeDbgValues(Function &F);
407     bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
408                       LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
409     bool tryToPromoteExts(TypePromotionTransaction &TPT,
410                           const SmallVectorImpl<Instruction *> &Exts,
411                           SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
412                           unsigned CreatedInstsCost = 0);
413     bool mergeSExts(Function &F);
414     bool splitLargeGEPOffsets();
415     bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
416                          SmallPtrSetImpl<Instruction *> &DeletedInstrs);
417     bool optimizePhiTypes(Function &F);
418     bool performAddressTypePromotion(
419         Instruction *&Inst,
420         bool AllowPromotionWithoutCommonHeader,
421         bool HasPromoted, TypePromotionTransaction &TPT,
422         SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
423     bool splitBranchCondition(Function &F, bool &ModifiedDT);
424     bool simplifyOffsetableRelocate(GCStatepointInst &I);
425 
426     bool tryToSinkFreeOperands(Instruction *I);
427     bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0,
428                                      Value *Arg1, CmpInst *Cmp,
429                                      Intrinsic::ID IID);
430     bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
431     bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
432     bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
433     void verifyBFIUpdates(Function &F);
434   };
435 
436 } // end anonymous namespace
437 
438 char CodeGenPrepare::ID = 0;
439 
440 INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
441                       "Optimize for code generation", false, false)
442 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
443 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
444 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
445 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
446 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
447 INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
448                     "Optimize for code generation", false, false)
449 
450 FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
451 
452 bool CodeGenPrepare::runOnFunction(Function &F) {
453   if (skipFunction(F))
454     return false;
455 
456   DL = &F.getParent()->getDataLayout();
457 
458   bool EverMadeChange = false;
459   // Clear per function information.
460   InsertedInsts.clear();
461   PromotedInsts.clear();
462 
463   TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
464   SubtargetInfo = TM->getSubtargetImpl(F);
465   TLI = SubtargetInfo->getTargetLowering();
466   TRI = SubtargetInfo->getRegisterInfo();
467   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
468   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
469   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
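  // Build branch probability and block frequency info locally; the
  // profitability heuristics below (e.g. empty-block merging) query BFI for
  // block frequencies of this function.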
470   BPI.reset(new BranchProbabilityInfo(F, *LI));
471   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
472   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
473   OptSize = F.hasOptSize();
474   if (ProfileGuidedSectionPrefix) {
475     if (PSI->isFunctionHotInCallGraph(&F, *BFI))
476       F.setSectionPrefix(".hot");
477     else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
478       F.setSectionPrefix(".unlikely");
479     else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
480              PSI->isFunctionHotnessUnknown(F))
481       F.setSectionPrefix(".unknown");
482   }
483 
  // This optimization identifies DIV instructions that can be profitably
  // bypassed and carried out with a shorter, faster divide.
486   if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
487     const DenseMap<unsigned int, unsigned int> &BypassWidths =
488         TLI->getBypassSlowDivWidths();
489     BasicBlock* BB = &*F.begin();
490     while (BB != nullptr) {
491       // bypassSlowDivision may create new BBs, but we don't want to reapply the
492       // optimization to those blocks.
493       BasicBlock* Next = BB->getNextNode();
494       // F.hasOptSize is already checked in the outer if statement.
495       if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
496         EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
497       BB = Next;
498     }
499   }
500 
501   // Eliminate blocks that contain only PHI nodes and an
502   // unconditional branch.
503   EverMadeChange |= eliminateMostlyEmptyBlocks(F);
504 
505   bool ModifiedDT = false;
506   if (!DisableBranchOpts)
507     EverMadeChange |= splitBranchCondition(F, ModifiedDT);
508 
509   // Split some critical edges where one of the sources is an indirect branch,
510   // to help generate sane code for PHIs involving such edges.
511   EverMadeChange |= SplitIndirectBrCriticalEdges(F);
512 
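  // Iterate until no further changes are made; optimizing one block can
  // expose new opportunities in blocks that were already visited.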
513   bool MadeChange = true;
514   while (MadeChange) {
515     MadeChange = false;
516     DT.reset();
517     for (Function::iterator I = F.begin(); I != F.end(); ) {
518       BasicBlock *BB = &*I++;
519       bool ModifiedDTOnIteration = false;
520       MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
521 
522       // Restart BB iteration if the dominator tree of the Function was changed
523       if (ModifiedDTOnIteration)
524         break;
525     }
526     if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
527       MadeChange |= mergeSExts(F);
528     if (!LargeOffsetGEPMap.empty())
529       MadeChange |= splitLargeGEPOffsets();
530     MadeChange |= optimizePhiTypes(F);
531 
532     if (MadeChange)
533       eliminateFallThrough(F);
534 
535     // Really free removed instructions during promotion.
536     for (Instruction *I : RemovedInsts)
537       I->deleteValue();
538 
539     EverMadeChange |= MadeChange;
540     SeenChainsForSExt.clear();
541     ValToSExtendedUses.clear();
542     RemovedInsts.clear();
543     LargeOffsetGEPMap.clear();
544     LargeOffsetGEPID.clear();
545   }
546 
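  // The main optimization loop is done; drop the cached sunk address
  // computations.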
547   SunkAddrs.clear();
548 
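  // Fold trivially constant terminators and delete any blocks that become
  // unreachable as a result.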
549   if (!DisableBranchOpts) {
550     MadeChange = false;
551     // Use a set vector to get deterministic iteration order. The order the
552     // blocks are removed may affect whether or not PHI nodes in successors
553     // are removed.
554     SmallSetVector<BasicBlock*, 8> WorkList;
555     for (BasicBlock &BB : F) {
556       SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
557       MadeChange |= ConstantFoldTerminator(&BB, true);
558       if (!MadeChange) continue;
559 
      for (BasicBlock *Succ : Successors)
        if (pred_empty(Succ))
          WorkList.insert(Succ);
564     }
565 
566     // Delete the dead blocks and any of their dead successors.
567     MadeChange |= !WorkList.empty();
568     while (!WorkList.empty()) {
569       BasicBlock *BB = WorkList.pop_back_val();
570       SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
571 
572       DeleteDeadBlock(BB);
573 
      for (BasicBlock *Succ : Successors)
        if (pred_empty(Succ))
          WorkList.insert(Succ);
578     }
579 
580     // Merge pairs of basic blocks with unconditional branches, connected by
581     // a single edge.
582     if (EverMadeChange || MadeChange)
583       MadeChange |= eliminateFallThrough(F);
584 
585     EverMadeChange |= MadeChange;
586   }
587 
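  // Rewrite gc.relocates of derived pointers as GEPs off the relocated base
  // pointer where possible (see simplifyOffsetableRelocate).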
588   if (!DisableGCOpts) {
589     SmallVector<GCStatepointInst *, 2> Statepoints;
590     for (BasicBlock &BB : F)
591       for (Instruction &I : BB)
592         if (auto *SP = dyn_cast<GCStatepointInst>(&I))
593           Statepoints.push_back(SP);
594     for (auto &I : Statepoints)
595       EverMadeChange |= simplifyOffsetableRelocate(*I);
596   }
597 
598   // Do this last to clean up use-before-def scenarios introduced by other
599   // preparatory transforms.
600   EverMadeChange |= placeDbgValues(F);
601 
602 #ifndef NDEBUG
603   if (VerifyBFIUpdates)
604     verifyBFIUpdates(F);
605 #endif
606 
607   return EverMadeChange;
608 }
609 
610 /// An instruction is about to be deleted, so remove all references to it in our
/// GEP-tracking data structures.
612 void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {
613   LargeOffsetGEPMap.erase(V);
614   NewGEPBases.erase(V);
615 
616   auto GEP = dyn_cast<GetElementPtrInst>(V);
617   if (!GEP)
618     return;
619 
620   LargeOffsetGEPID.erase(GEP);
621 
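  // The GEP may also be recorded in the vector keyed by its pointer operand;
  // remove that entry as well so no dangling AssertingVH is left behind.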
622   auto VecI = LargeOffsetGEPMap.find(GEP->getPointerOperand());
623   if (VecI == LargeOffsetGEPMap.end())
624     return;
625 
626   auto &GEPVector = VecI->second;
  auto I = llvm::find_if(GEPVector,
                         [=](auto &Elt) { return Elt.first == GEP; });
629   if (I == GEPVector.end())
630     return;
631 
632   GEPVector.erase(I);
633   if (GEPVector.empty())
634     LargeOffsetGEPMap.erase(VecI);
635 }
636 
// Verify that BFI has been updated correctly by recomputing it from scratch
// and comparing it against the incrementally maintained BFI.
638 void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
639   DominatorTree NewDT(F);
640   LoopInfo NewLI(NewDT);
641   BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
642   BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
643   NewBFI.verifyMatch(*BFI);
644 }
645 
646 /// Merge basic blocks which are connected by a single edge, where one of the
647 /// basic blocks has a single successor pointing to the other basic block,
648 /// which has a single predecessor.
649 bool CodeGenPrepare::eliminateFallThrough(Function &F) {
650   bool Changed = false;
651   // Scan all of the blocks in the function, except for the entry block.
652   // Use a temporary array to avoid iterator being invalidated when
653   // deleting blocks.
654   SmallVector<WeakTrackingVH, 16> Blocks;
655   for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
656     Blocks.push_back(&Block);
657 
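  // Remember the predecessors we merge into so that redundant debug
  // intrinsics introduced by the merging can be removed afterwards.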
658   SmallSet<WeakTrackingVH, 16> Preds;
659   for (auto &Block : Blocks) {
660     auto *BB = cast_or_null<BasicBlock>(Block);
661     if (!BB)
662       continue;
663     // If the destination block has a single pred, then this is a trivial
664     // edge, just collapse it.
665     BasicBlock *SinglePred = BB->getSinglePredecessor();
666 
667     // Don't merge if BB's address is taken.
668     if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
669 
670     BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
671     if (Term && !Term->isConditional()) {
672       Changed = true;
673       LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");
674 
675       // Merge BB into SinglePred and delete it.
676       MergeBlockIntoPredecessor(BB);
677       Preds.insert(SinglePred);
678     }
679   }
680 
681   // (Repeatedly) merging blocks into their predecessors can create redundant
682   // debug intrinsics.
683   for (auto &Pred : Preds)
684     if (auto *BB = cast_or_null<BasicBlock>(Pred))
685       RemoveRedundantDbgInstrs(BB);
686 
687   return Changed;
688 }
689 
/// Find the destination block for BB if BB is a mergeable empty block.
691 BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
692   // If this block doesn't end with an uncond branch, ignore it.
693   BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
694   if (!BI || !BI->isUnconditional())
695     return nullptr;
696 
697   // If the instruction before the branch (skipping debug info) isn't a phi
698   // node, then other stuff is happening here.
699   BasicBlock::iterator BBI = BI->getIterator();
700   if (BBI != BB->begin()) {
701     --BBI;
702     while (isa<DbgInfoIntrinsic>(BBI)) {
703       if (BBI == BB->begin())
704         break;
705       --BBI;
706     }
707     if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
708       return nullptr;
709   }
710 
711   // Do not break infinite loops.
712   BasicBlock *DestBB = BI->getSuccessor(0);
713   if (DestBB == BB)
714     return nullptr;
715 
716   if (!canMergeBlocks(BB, DestBB))
717     DestBB = nullptr;
718 
719   return DestBB;
720 }
721 
722 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
723 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
724 /// edges in ways that are non-optimal for isel. Start by eliminating these
725 /// blocks so we can split them the way we want them.
726 bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
727   SmallPtrSet<BasicBlock *, 16> Preheaders;
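  // Record all loop preheaders up front; merging a preheader away can be
  // unprofitable (see isMergingEmptyBlockProfitable).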
728   SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
729   while (!LoopList.empty()) {
730     Loop *L = LoopList.pop_back_val();
731     LoopList.insert(LoopList.end(), L->begin(), L->end());
732     if (BasicBlock *Preheader = L->getLoopPreheader())
733       Preheaders.insert(Preheader);
734   }
735 
736   bool MadeChange = false;
737   // Copy blocks into a temporary array to avoid iterator invalidation issues
738   // as we remove them.
739   // Note that this intentionally skips the entry block.
740   SmallVector<WeakTrackingVH, 16> Blocks;
741   for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
742     Blocks.push_back(&Block);
743 
744   for (auto &Block : Blocks) {
745     BasicBlock *BB = cast_or_null<BasicBlock>(Block);
746     if (!BB)
747       continue;
748     BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
749     if (!DestBB ||
750         !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
751       continue;
752 
753     eliminateMostlyEmptyBlock(BB);
754     MadeChange = true;
755   }
756   return MadeChange;
757 }
758 
759 bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
760                                                    BasicBlock *DestBB,
761                                                    bool isPreheader) {
762   // Do not delete loop preheaders if doing so would create a critical edge.
763   // Loop preheaders can be good locations to spill registers. If the
764   // preheader is deleted and we create a critical edge, registers may be
765   // spilled in the loop body instead.
766   if (!DisablePreheaderProtect && isPreheader &&
767       !(BB->getSinglePredecessor() &&
768         BB->getSinglePredecessor()->getSingleSuccessor()))
769     return false;
770 
771   // Skip merging if the block's successor is also a successor to any callbr
772   // that leads to this block.
773   // FIXME: Is this really needed? Is this a correctness issue?
  for (BasicBlock *Pred : predecessors(BB)) {
    if (auto *CBI = dyn_cast<CallBrInst>(Pred->getTerminator()))
776       for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
777         if (DestBB == CBI->getSuccessor(i))
778           return false;
779   }
780 
781   // Try to skip merging if the unique predecessor of BB is terminated by a
782   // switch or indirect branch instruction, and BB is used as an incoming block
783   // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to
784   // add COPY instructions in the predecessor of BB instead of BB (if it is not
  // merged). Note that the critical edge created by merging such blocks won't
  // be split in MachineSink because the jump table is not analyzable. By
  // keeping such an empty block (BB), ISel will place COPY instructions in BB,
  // not in the predecessor of BB.
789   BasicBlock *Pred = BB->getUniquePredecessor();
790   if (!Pred ||
791       !(isa<SwitchInst>(Pred->getTerminator()) ||
792         isa<IndirectBrInst>(Pred->getTerminator())))
793     return true;
794 
795   if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())
796     return true;
797 
  // We use a simple cost heuristic: skipping the merge is profitable if
  //   Cost(skipping merging) < Cost(merging BB),
  // where Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)) and
  // Cost(merging BB) is Freq(Pred) * Cost(Copy).
  // Assuming Cost(Copy) == Cost(Branch), this simplifies to
  //   Freq(Pred) / Freq(BB) > 2.
  // Note that if multiple empty blocks share the same incoming value for the
  // PHIs in DestBB, we consider them together; in that case, Cost(merging BB)
  // is the sum of their frequencies.
808 
809   if (!isa<PHINode>(DestBB->begin()))
810     return true;
811 
812   SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
813 
814   // Find all other incoming blocks from which incoming values of all PHIs in
815   // DestBB are the same as the ones from BB.
  for (BasicBlock *DestBBPred : predecessors(DestBB)) {
819     if (DestBBPred == BB)
820       continue;
821 
822     if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
823           return DestPN.getIncomingValueForBlock(BB) ==
824                  DestPN.getIncomingValueForBlock(DestBBPred);
825         }))
826       SameIncomingValueBBs.insert(DestBBPred);
827   }
828 
  // See if all of BB's incoming values are the same as the value from Pred.
  // In that case there is no reason to skip merging, because COPYs are
  // expected to be placed in Pred already.
832   if (SameIncomingValueBBs.count(Pred))
833     return true;
834 
835   BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
836   BlockFrequency BBFreq = BFI->getBlockFreq(BB);
837 
838   for (auto *SameValueBB : SameIncomingValueBBs)
839     if (SameValueBB->getUniquePredecessor() == Pred &&
840         DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
841       BBFreq += BFI->getBlockFreq(SameValueBB);
842 
843   return PredFreq.getFrequency() <=
844          BBFreq.getFrequency() * FreqRatioToSkipMerge;
845 }
846 
847 /// Return true if we can merge BB into DestBB if there is a single
848 /// unconditional branch between them, and BB contains no other non-phi
849 /// instructions.
850 bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
851                                     const BasicBlock *DestBB) const {
852   // We only want to eliminate blocks whose phi nodes are used by phi nodes in
853   // the successor.  If there are more complex condition (e.g. preheaders),
854   // don't mess around with them.
855   for (const PHINode &PN : BB->phis()) {
856     for (const User *U : PN.users()) {
857       const Instruction *UI = cast<Instruction>(U);
858       if (UI->getParent() != DestBB || !isa<PHINode>(UI))
859         return false;
      // If the user is a PHINode inside DestBB, check its incoming values: if
      // an incoming value is defined in BB but arrives via an edge from a
      // different block, this is a complex situation (e.g. preheaders) that we
      // want to avoid here.
863       if (UI->getParent() == DestBB) {
864         if (const PHINode *UPN = dyn_cast<PHINode>(UI))
865           for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
866             Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
867             if (Insn && Insn->getParent() == BB &&
868                 Insn->getParent() != UPN->getIncomingBlock(I))
869               return false;
870           }
871       }
872     }
873   }
874 
875   // If BB and DestBB contain any common predecessors, then the phi nodes in BB
876   // and DestBB may have conflicting incoming values for the block.  If so, we
877   // can't merge the block.
878   const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
879   if (!DestBBPN) return true;  // no conflict.
880 
881   // Collect the preds of BB.
882   SmallPtrSet<const BasicBlock*, 16> BBPreds;
883   if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
884     // It is faster to get preds from a PHI than with pred_iterator.
885     for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
886       BBPreds.insert(BBPN->getIncomingBlock(i));
887   } else {
888     BBPreds.insert(pred_begin(BB), pred_end(BB));
889   }
890 
891   // Walk the preds of DestBB.
892   for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
893     BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
894     if (BBPreds.count(Pred)) {   // Common predecessor?
895       for (const PHINode &PN : DestBB->phis()) {
896         const Value *V1 = PN.getIncomingValueForBlock(Pred);
897         const Value *V2 = PN.getIncomingValueForBlock(BB);
898 
899         // If V2 is a phi node in BB, look up what the mapped value will be.
900         if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
901           if (V2PN->getParent() == BB)
902             V2 = V2PN->getIncomingValueForBlock(Pred);
903 
904         // If there is a conflict, bail out.
905         if (V1 != V2) return false;
906       }
907     }
908   }
909 
910   return true;
911 }
912 
/// Eliminate a basic block that has only phis and an unconditional branch in
/// it.
915 void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
916   BranchInst *BI = cast<BranchInst>(BB->getTerminator());
917   BasicBlock *DestBB = BI->getSuccessor(0);
918 
919   LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
920                     << *BB << *DestBB);
921 
922   // If the destination block has a single pred, then this is a trivial edge,
923   // just collapse it.
924   if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
925     if (SinglePred != DestBB) {
926       assert(SinglePred == BB &&
927              "Single predecessor not the same as predecessor");
928       // Merge DestBB into SinglePred/BB and delete it.
929       MergeBlockIntoPredecessor(DestBB);
930       // Note: BB(=SinglePred) will not be deleted on this path.
931       // DestBB(=its single successor) is the one that was deleted.
932       LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
933       return;
934     }
935   }
936 
937   // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
938   // to handle the new incoming edges it is about to have.
939   for (PHINode &PN : DestBB->phis()) {
940     // Remove the incoming value for BB, and remember it.
941     Value *InVal = PN.removeIncomingValue(BB, false);
942 
943     // Two options: either the InVal is a phi node defined in BB or it is some
944     // value that dominates BB.
945     PHINode *InValPhi = dyn_cast<PHINode>(InVal);
946     if (InValPhi && InValPhi->getParent() == BB) {
947       // Add all of the input values of the input PHI as inputs of this phi.
948       for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
949         PN.addIncoming(InValPhi->getIncomingValue(i),
950                        InValPhi->getIncomingBlock(i));
951     } else {
952       // Otherwise, add one instance of the dominating value for each edge that
953       // we will be adding.
954       if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
955         for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
956           PN.addIncoming(InVal, BBPN->getIncomingBlock(i));
957       } else {
        for (BasicBlock *Pred : predecessors(BB))
          PN.addIncoming(InVal, Pred);
960       }
961     }
962   }
963 
964   // The PHIs are now updated, change everything that refers to BB to use
965   // DestBB and remove BB.
966   BB->replaceAllUsesWith(DestBB);
967   BB->eraseFromParent();
968   ++NumBlocksElim;
969 
970   LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
971 }
972 
// Computes a map from base pointer relocation instructions to the
// corresponding derived pointer relocation instructions, given a vector of all
// relocate calls.
975 static void computeBaseDerivedRelocateMap(
976     const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
977     DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
978         &RelocateInstMap) {
979   // Collect information in two maps: one primarily for locating the base object
980   // while filling the second map; the second map is the final structure holding
981   // a mapping between Base and corresponding Derived relocate calls
982   DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
983   for (auto *ThisRelocate : AllRelocateCalls) {
984     auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
985                             ThisRelocate->getDerivedPtrIndex());
986     RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
987   }
988   for (auto &Item : RelocateIdxMap) {
989     std::pair<unsigned, unsigned> Key = Item.first;
990     if (Key.first == Key.second)
991       // Base relocation: nothing to insert
992       continue;
993 
994     GCRelocateInst *I = Item.second;
995     auto BaseKey = std::make_pair(Key.first, Key.first);
996 
997     // We're iterating over RelocateIdxMap so we cannot modify it.
998     auto MaybeBase = RelocateIdxMap.find(BaseKey);
999     if (MaybeBase == RelocateIdxMap.end())
1000       // TODO: We might want to insert a new base object relocate and gep off
1001       // that, if there are enough derived object relocates.
1002       continue;
1003 
1004     RelocateInstMap[MaybeBase->second].push_back(I);
1005   }
1006 }
1007 
1008 // Accepts a GEP and extracts the operands into a vector provided they're all
1009 // small integer constants
1010 static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
1011                                           SmallVectorImpl<Value *> &OffsetV) {
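  // Check every index first so that OffsetV is only populated on success.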
1012   for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
1013     // Only accept small constant integer operands
1014     auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
1015     if (!Op || Op->getZExtValue() > 20)
1016       return false;
1017   }
1018 
1019   for (unsigned i = 1; i < GEP->getNumOperands(); i++)
1020     OffsetV.push_back(GEP->getOperand(i));
1021   return true;
1022 }
1023 
1024 // Takes a RelocatedBase (base pointer relocation instruction) and Targets to
// replace, computes the replacement, and applies it.
1026 static bool
1027 simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
1028                           const SmallVectorImpl<GCRelocateInst *> &Targets) {
1029   bool MadeChange = false;
  // We must ensure that the relocation of a derived pointer is defined after
  // the relocation of its base pointer. If we find a relocation of the same
  // base that is defined earlier than RelocatedBase, we move RelocatedBase
  // right before that relocation. We only consider relocations in the same
  // basic block as RelocatedBase; relocations from other basic blocks are
  // skipped by this optimization and we do not care about them.
1036   for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
1037        &*R != RelocatedBase; ++R)
1038     if (auto *RI = dyn_cast<GCRelocateInst>(R))
1039       if (RI->getStatepoint() == RelocatedBase->getStatepoint())
1040         if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
1041           RelocatedBase->moveBefore(RI);
1042           break;
1043         }
1044 
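  // Rewrite each derived relocate as a GEP off the (possibly moved) base
  // relocation.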
1045   for (GCRelocateInst *ToReplace : Targets) {
1046     assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
1047            "Not relocating a derived object of the original base object");
1048     if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
1049       // A duplicate relocate call. TODO: coalesce duplicates.
1050       continue;
1051     }
1052 
1053     if (RelocatedBase->getParent() != ToReplace->getParent()) {
1054       // Base and derived relocates are in different basic blocks.
1055       // In this case transform is only valid when base dominates derived
1056       // relocate. However it would be too expensive to check dominance
1057       // for each such relocate, so we skip the whole transformation.
1058       continue;
1059     }
1060 
1061     Value *Base = ToReplace->getBasePtr();
1062     auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
1063     if (!Derived || Derived->getPointerOperand() != Base)
1064       continue;
1065 
1066     SmallVector<Value *, 2> OffsetV;
1067     if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
1068       continue;
1069 
1070     // Create a Builder and replace the target callsite with a gep
1071     assert(RelocatedBase->getNextNode() &&
1072            "Should always have one since it's not a terminator");
1073 
1074     // Insert after RelocatedBase
1075     IRBuilder<> Builder(RelocatedBase->getNextNode());
1076     Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
1077 
1078     // If gc_relocate does not match the actual type, cast it to the right type.
1079     // In theory, there must be a bitcast after gc_relocate if the type does not
    // match, and we should reuse it to get the derived pointer. But there
    // could be cases like this:
1082     // bb1:
1083     //  ...
1084     //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
1085     //  br label %merge
1086     //
1087     // bb2:
1088     //  ...
1089     //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
1090     //  br label %merge
1091     //
1092     // merge:
1093     //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
1095     //
    // In this case, we can no longer find the bitcast. So we insert a new
    // bitcast whether one already exists or not. This way we can handle all
    // cases, and the extra bitcast should be optimized away in later passes.
1099     Value *ActualRelocatedBase = RelocatedBase;
1100     if (RelocatedBase->getType() != Base->getType()) {
1101       ActualRelocatedBase =
1102           Builder.CreateBitCast(RelocatedBase, Base->getType());
1103     }
1104     Value *Replacement = Builder.CreateGEP(
1105         Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
1106     Replacement->takeName(ToReplace);
    // If the newly generated derived pointer's type does not match the
    // original derived pointer's type, cast the new derived pointer to match
    // it. Same reasoning as above.
1109     Value *ActualReplacement = Replacement;
1110     if (Replacement->getType() != ToReplace->getType()) {
1111       ActualReplacement =
1112           Builder.CreateBitCast(Replacement, ToReplace->getType());
1113     }
1114     ToReplace->replaceAllUsesWith(ActualReplacement);
1115     ToReplace->eraseFromParent();
1116 
1117     MadeChange = true;
1118   }
1119   return MadeChange;
1120 }
1121 
1122 // Turns this:
1123 //
1124 // %base = ...
1125 // %ptr = gep %base + 15
1126 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1127 // %base' = relocate(%tok, i32 4, i32 4)
1128 // %ptr' = relocate(%tok, i32 4, i32 5)
1129 // %val = load %ptr'
1130 //
1131 // into this:
1132 //
1133 // %base = ...
1134 // %ptr = gep %base + 15
1135 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
// %base' = relocate(%tok, i32 4, i32 4)
1137 // %ptr' = gep %base' + 15
1138 // %val = load %ptr'
1139 bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
1140   bool MadeChange = false;
1141   SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
1142   for (auto *U : I.users())
1143     if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
1144       // Collect all the relocate calls associated with a statepoint
1145       AllRelocateCalls.push_back(Relocate);
1146 
1147   // We need at least one base pointer relocation + one derived pointer
1148   // relocation to mangle
1149   if (AllRelocateCalls.size() < 2)
1150     return false;
1151 
1152   // RelocateInstMap is a mapping from the base relocate instruction to the
1153   // corresponding derived relocate instructions
1154   DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
1155   computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
1156   if (RelocateInstMap.empty())
1157     return false;
1158 
1159   for (auto &Item : RelocateInstMap)
1160     // Item.first is the RelocatedBase to offset against
1161     // Item.second is the vector of Targets to replace
    MadeChange |= simplifyRelocatesOffABase(Item.first, Item.second);
1163   return MadeChange;
1164 }
1165 
1166 /// Sink the specified cast instruction into its user blocks.
1167 static bool SinkCast(CastInst *CI) {
1168   BasicBlock *DefBB = CI->getParent();
1169 
1170   /// InsertedCasts - Only insert a cast in each block once.
1171   DenseMap<BasicBlock*, CastInst*> InsertedCasts;
1172 
1173   bool MadeChange = false;
1174   for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
1175        UI != E; ) {
1176     Use &TheUse = UI.getUse();
1177     Instruction *User = cast<Instruction>(*UI);
1178 
1179     // Figure out which BB this cast is used in.  For PHI's this is the
1180     // appropriate predecessor block.
1181     BasicBlock *UserBB = User->getParent();
1182     if (PHINode *PN = dyn_cast<PHINode>(User)) {
1183       UserBB = PN->getIncomingBlock(TheUse);
1184     }
1185 
1186     // Preincrement use iterator so we don't invalidate it.
1187     ++UI;
1188 
1189     // The first insertion point of a block containing an EH pad is after the
1190     // pad.  If the pad is the user, we cannot sink the cast past the pad.
1191     if (User->isEHPad())
1192       continue;
1193 
1194     // If the block selected to receive the cast is an EH pad that does not
1195     // allow non-PHI instructions before the terminator, we can't sink the
1196     // cast.
1197     if (UserBB->getTerminator()->isEHPad())
1198       continue;
1199 
1200     // If this user is in the same block as the cast, don't change the cast.
1201     if (UserBB == DefBB) continue;
1202 
1203     // If we have already inserted a cast into this block, use it.
1204     CastInst *&InsertedCast = InsertedCasts[UserBB];
1205 
1206     if (!InsertedCast) {
1207       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1208       assert(InsertPt != UserBB->end());
1209       InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
1210                                       CI->getType(), "", &*InsertPt);
1211       InsertedCast->setDebugLoc(CI->getDebugLoc());
1212     }
1213 
1214     // Replace a use of the cast with a use of the new cast.
1215     TheUse = InsertedCast;
1216     MadeChange = true;
1217     ++NumCastUses;
1218   }
1219 
1220   // If we removed all uses, nuke the cast.
1221   if (CI->use_empty()) {
1222     salvageDebugInfo(*CI);
1223     CI->eraseFromParent();
1224     MadeChange = true;
1225   }
1226 
1227   return MadeChange;
1228 }
1229 
1230 /// If the specified cast instruction is a noop copy (e.g. it's casting from
1231 /// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
1232 /// reduce the number of virtual registers that must be created and coalesced.
1233 ///
1234 /// Return true if any changes are made.
1235 static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
1236                                        const DataLayout &DL) {
1237   // Sink only "cheap" (or nop) address-space casts.  This is a weaker condition
1238   // than sinking only nop casts, but is helpful on some platforms.
1239   if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {
1240     if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(),
1241                                  ASC->getDestAddressSpace()))
1242       return false;
1243   }
1244 
1245   // If this is a noop copy,
1246   EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
1247   EVT DstVT = TLI.getValueType(DL, CI->getType());
1248 
  // If this is an fp<->int conversion, it is not a noop copy.
1250   if (SrcVT.isInteger() != DstVT.isInteger())
1251     return false;
1252 
1253   // If this is an extension, it will be a zero or sign extension, which
1254   // isn't a noop.
1255   if (SrcVT.bitsLT(DstVT)) return false;
1256 
1257   // If these values will be promoted, find out what they will be promoted
1258   // to.  This helps us consider truncates on PPC as noop copies when they
1259   // are.
1260   if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
1261       TargetLowering::TypePromoteInteger)
1262     SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
1263   if (TLI.getTypeAction(CI->getContext(), DstVT) ==
1264       TargetLowering::TypePromoteInteger)
1265     DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
1266 
1267   // If, after promotion, these are the same types, this is a noop copy.
1268   if (SrcVT != DstVT)
1269     return false;
1270 
1271   return SinkCast(CI);
1272 }
1273 
1274 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
1275                                                  Value *Arg0, Value *Arg1,
1276                                                  CmpInst *Cmp,
1277                                                  Intrinsic::ID IID) {
1278   if (BO->getParent() != Cmp->getParent()) {
1279     // We used to use a dominator tree here to allow multi-block optimization.
1280     // But that was problematic because:
1281     // 1. It could cause a perf regression by hoisting the math op into the
1282     //    critical path.
1283     // 2. It could cause a perf regression by creating a value that was live
1284     //    across multiple blocks and increasing register pressure.
1285     // 3. Use of a dominator tree could cause large compile-time regression.
1286     //    This is because we recompute the DT on every change in the main CGP
1287     //    run-loop. The recomputing is probably unnecessary in many cases, so if
1288     //    that was fixed, using a DT here would be ok.
1289     return false;
1290   }
1291 
1292   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
1293   if (BO->getOpcode() == Instruction::Add &&
1294       IID == Intrinsic::usub_with_overflow) {
1295     assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
1296     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
1297   }
1298 
1299   // Insert at the first instruction of the pair.
1300   Instruction *InsertPt = nullptr;
1301   for (Instruction &Iter : *Cmp->getParent()) {
1302     // If BO is an XOR, it is not guaranteed that it comes after both inputs to
1303     // the overflow intrinsic are defined.
1304     if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
1305       InsertPt = &Iter;
1306       break;
1307     }
1308   }
1309   assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
1310 
1311   IRBuilder<> Builder(InsertPt);
1312   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
1313   if (BO->getOpcode() != Instruction::Xor) {
1314     Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
1315     BO->replaceAllUsesWith(Math);
1316   } else
1317     assert(BO->hasOneUse() &&
1318            "Patterns with XOr should use the BO only in the compare");
1319   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
1320   Cmp->replaceAllUsesWith(OV);
1321   Cmp->eraseFromParent();
1322   BO->eraseFromParent();
1323   return true;
1324 }
1325 
1326 /// Match special-case patterns that check for unsigned add overflow.
1327 static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
1328                                                    BinaryOperator *&Add) {
1329   // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
1330   // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
1331   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
1332 
1333   // We are not expecting non-canonical/degenerate code. Just bail out.
1334   if (isa<Constant>(A))
1335     return false;
1336 
1337   ICmpInst::Predicate Pred = Cmp->getPredicate();
1338   if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes()))
1339     B = ConstantInt::get(B->getType(), 1);
1340   else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt()))
1341     B = ConstantInt::get(B->getType(), -1);
1342   else
1343     return false;
1344 
1345   // Check the users of the variable operand of the compare looking for an add
1346   // with the adjusted constant.
1347   for (User *U : A->users()) {
1348     if (match(U, m_Add(m_Specific(A), m_Specific(B)))) {
1349       Add = cast<BinaryOperator>(U);
1350       return true;
1351     }
1352   }
1353   return false;
1354 }
1355 
1356 /// Try to combine the compare into a call to the llvm.uadd.with.overflow
1357 /// intrinsic. Return true if any changes were made.
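/// For example (illustrative IR):
///
///   %add = add i32 %a, %b
///   %cmp = icmp ult i32 %add, %a      ; overflow check
/// ==>
///   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
///   %add  = extractvalue { i32, i1 } %uadd, 0
///   %cmp  = extractvalue { i32, i1 } %uadd, 1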
1358 bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
1359                                                bool &ModifiedDT) {
1360   Value *A, *B;
1361   BinaryOperator *Add;
1362   if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
1363     if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
1364       return false;
    // Set A and B if we matched the constant edge-case pattern above.
1366     A = Add->getOperand(0);
1367     B = Add->getOperand(1);
1368   }
1369 
1370   if (!TLI->shouldFormOverflowOp(ISD::UADDO,
1371                                  TLI->getValueType(*DL, Add->getType()),
1372                                  Add->hasNUsesOrMore(2)))
1373     return false;
1374 
1375   // We don't want to move around uses of condition values this late, so we
1376   // check if it is legal to create the call to the intrinsic in the basic
1377   // block containing the icmp.
1378   if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
1379     return false;
1380 
1381   if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
1382                                    Intrinsic::uadd_with_overflow))
1383     return false;
1384 
1385   // Reset callers - do not crash by iterating over a dead instruction.
1386   ModifiedDT = true;
1387   return true;
1388 }
1389 
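/// Try to combine the compare into a call to the llvm.usub.with.overflow
/// intrinsic. For example (illustrative IR):
///
///   %sub = sub i32 %a, %b
///   %cmp = icmp ult i32 %a, %b        ; borrow check
/// ==>
///   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
///   %sub  = extractvalue { i32, i1 } %usub, 0
///   %cmp  = extractvalue { i32, i1 } %usub, 1
///
/// Return true if any changes were made.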
1390 bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
1391                                                bool &ModifiedDT) {
1392   // We are not expecting non-canonical/degenerate code. Just bail out.
1393   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
1394   if (isa<Constant>(A) && isa<Constant>(B))
1395     return false;
1396 
1397   // Convert (A u> B) to (A u< B) to simplify pattern matching.
1398   ICmpInst::Predicate Pred = Cmp->getPredicate();
1399   if (Pred == ICmpInst::ICMP_UGT) {
1400     std::swap(A, B);
1401     Pred = ICmpInst::ICMP_ULT;
1402   }
1403   // Convert special-case: (A == 0) is the same as (A u< 1).
1404   if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
1405     B = ConstantInt::get(B->getType(), 1);
1406     Pred = ICmpInst::ICMP_ULT;
1407   }
1408   // Convert special-case: (A != 0) is the same as (0 u< A).
1409   if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
1410     std::swap(A, B);
1411     Pred = ICmpInst::ICMP_ULT;
1412   }
1413   if (Pred != ICmpInst::ICMP_ULT)
1414     return false;
1415 
  // Walk the users of the variable operand of the compare, looking for a
  // subtract or add that uses the same operand. Also match the second operand
  // of the compare against the add/sub; for an add, the constant may appear
  // negated.
1419   Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
1420   BinaryOperator *Sub = nullptr;
1421   for (User *U : CmpVariableOperand->users()) {
1422     // A - B, A u< B --> usubo(A, B)
1423     if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
1424       Sub = cast<BinaryOperator>(U);
1425       break;
1426     }
1427 
1428     // A + (-C), A u< C (canonicalized form of (sub A, C))
1429     const APInt *CmpC, *AddC;
1430     if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
1431         match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
1432       Sub = cast<BinaryOperator>(U);
1433       break;
1434     }
1435   }
1436   if (!Sub)
1437     return false;
1438 
1439   if (!TLI->shouldFormOverflowOp(ISD::USUBO,
1440                                  TLI->getValueType(*DL, Sub->getType()),
1441                                  Sub->hasNUsesOrMore(2)))
1442     return false;
1443 
1444   if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
1445                                    Cmp, Intrinsic::usub_with_overflow))
1446     return false;
1447 
1448   // Reset callers - do not crash by iterating over a dead instruction.
1449   ModifiedDT = true;
1450   return true;
1451 }
1452 
1453 /// Sink the given CmpInst into user blocks to reduce the number of virtual
1454 /// registers that must be created and coalesced. This is a clear win except on
1455 /// targets with multiple condition code registers (PowerPC), where it might
1456 /// lose; some adjustment may be wanted there.
1457 ///
1458 /// Return true if any changes are made.
1459 static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
1460   if (TLI.hasMultipleConditionRegisters())
1461     return false;
1462 
1463   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
1464   if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
1465     return false;
1466 
1467   // Only insert a cmp in each block once.
1468   DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
1469 
1470   bool MadeChange = false;
1471   for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
1472        UI != E; ) {
1473     Use &TheUse = UI.getUse();
1474     Instruction *User = cast<Instruction>(*UI);
1475 
1476     // Preincrement use iterator so we don't invalidate it.
1477     ++UI;
1478 
1479     // Don't bother for PHI nodes.
1480     if (isa<PHINode>(User))
1481       continue;
1482 
1483     // Figure out which BB this cmp is used in.
1484     BasicBlock *UserBB = User->getParent();
1485     BasicBlock *DefBB = Cmp->getParent();
1486 
1487     // If this user is in the same block as the cmp, don't change the cmp.
1488     if (UserBB == DefBB) continue;
1489 
1490     // If we have already inserted a cmp into this block, use it.
1491     CmpInst *&InsertedCmp = InsertedCmps[UserBB];
1492 
1493     if (!InsertedCmp) {
1494       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1495       assert(InsertPt != UserBB->end());
1496       InsertedCmp =
1497           CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),
1498                           Cmp->getOperand(0), Cmp->getOperand(1), "",
1499                           &*InsertPt);
1500       // Propagate the debug info.
1501       InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
1502     }
1503 
1504     // Replace a use of the cmp with a use of the new cmp.
1505     TheUse = InsertedCmp;
1506     MadeChange = true;
1507     ++NumCmpUses;
1508   }
1509 
1510   // If we removed all uses, nuke the cmp.
1511   if (Cmp->use_empty()) {
1512     Cmp->eraseFromParent();
1513     MadeChange = true;
1514   }
1515 
1516   return MadeChange;
1517 }
1518 
1519 /// For pattern like:
1520 ///
1521 ///   DomCond = icmp sgt/slt CmpOp0, CmpOp1 (might not be in DomBB)
1522 ///   ...
1523 /// DomBB:
1524 ///   ...
1525 ///   br DomCond, TrueBB, CmpBB
1526 /// CmpBB: (with DomBB being the single predecessor)
1527 ///   ...
1528 ///   Cmp = icmp eq CmpOp0, CmpOp1
1529 ///   ...
1530 ///
/// On targets where the lowering of icmp sgt/slt differs from the lowering of
/// icmp eq (PowerPC), this would require two comparisons. This function tries
/// to convert 'Cmp = icmp eq CmpOp0, CmpOp1' into 'Cmp = icmp slt/sgt CmpOp0,
/// CmpOp1'. After that, DomCond and Cmp can share the same comparison,
/// eliminating one of them.
1536 ///
1537 /// Return true if any changes are made.
1538 static bool foldICmpWithDominatingICmp(CmpInst *Cmp,
1539                                        const TargetLowering &TLI) {
1540   if (!EnableICMP_EQToICMP_ST && TLI.isEqualityCmpFoldedWithSignedCmp())
1541     return false;
1542 
1543   ICmpInst::Predicate Pred = Cmp->getPredicate();
1544   if (Pred != ICmpInst::ICMP_EQ)
1545     return false;
1546 
1547   // If icmp eq has users other than BranchInst and SelectInst, converting it to
1548   // icmp slt/sgt would introduce more redundant LLVM IR.
1549   for (User *U : Cmp->users()) {
1550     if (isa<BranchInst>(U))
1551       continue;
1552     if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == Cmp)
1553       continue;
1554     return false;
1555   }
1556 
1557   // This is a cheap/incomplete check for dominance - just match a single
1558   // predecessor with a conditional branch.
1559   BasicBlock *CmpBB = Cmp->getParent();
1560   BasicBlock *DomBB = CmpBB->getSinglePredecessor();
1561   if (!DomBB)
1562     return false;
1563 
1564   // We want to ensure that the only way control gets to the comparison of
1565   // interest is that a less/greater than comparison on the same operands is
1566   // false.
1567   Value *DomCond;
1568   BasicBlock *TrueBB, *FalseBB;
1569   if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
1570     return false;
1571   if (CmpBB != FalseBB)
1572     return false;
1573 
1574   Value *CmpOp0 = Cmp->getOperand(0), *CmpOp1 = Cmp->getOperand(1);
1575   ICmpInst::Predicate DomPred;
1576   if (!match(DomCond, m_ICmp(DomPred, m_Specific(CmpOp0), m_Specific(CmpOp1))))
1577     return false;
1578   if (DomPred != ICmpInst::ICMP_SGT && DomPred != ICmpInst::ICMP_SLT)
1579     return false;
1580 
1581   // Convert the equality comparison to the opposite of the dominating
1582   // comparison and swap the direction for all branch/select users.
1583   // We have conceptually converted:
1584   // Res = (a < b) ? <LT_RES> : (a == b) ? <EQ_RES> : <GT_RES>;
1585   // to
1586   // Res = (a < b) ? <LT_RES> : (a > b)  ? <GT_RES> : <EQ_RES>;
1587   // And similarly for branches.
1588   for (User *U : Cmp->users()) {
1589     if (auto *BI = dyn_cast<BranchInst>(U)) {
1590       assert(BI->isConditional() && "Must be conditional");
1591       BI->swapSuccessors();
1592       continue;
1593     }
1594     if (auto *SI = dyn_cast<SelectInst>(U)) {
1595       // Swap operands
1596       SI->swapValues();
1597       SI->swapProfMetadata();
1598       continue;
1599     }
1600     llvm_unreachable("Must be a branch or a select");
1601   }
1602   Cmp->setPredicate(CmpInst::getSwappedPredicate(DomPred));
1603   return true;
1604 }
1605 
1606 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
1607   if (sinkCmpExpression(Cmp, *TLI))
1608     return true;
1609 
1610   if (combineToUAddWithOverflow(Cmp, ModifiedDT))
1611     return true;
1612 
1613   if (combineToUSubWithOverflow(Cmp, ModifiedDT))
1614     return true;
1615 
1616   if (foldICmpWithDominatingICmp(Cmp, *TLI))
1617     return true;
1618 
1619   return false;
1620 }
1621 
1622 /// Duplicate and sink the given 'and' instruction into user blocks where it is
1623 /// used in a compare to allow isel to generate better code for targets where
1624 /// this operation can be combined.
1625 ///
1626 /// Return true if any changes are made.
1627 static bool sinkAndCmp0Expression(Instruction *AndI,
1628                                   const TargetLowering &TLI,
1629                                   SetOfInstrs &InsertedInsts) {
1630   // Double-check that we're not trying to optimize an instruction that was
1631   // already optimized by some other part of this pass.
1632   assert(!InsertedInsts.count(AndI) &&
1633          "Attempting to optimize already optimized and instruction");
1634   (void) InsertedInsts;
1635 
1636   // Nothing to do for single use in same basic block.
1637   if (AndI->hasOneUse() &&
1638       AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
1639     return false;
1640 
1641   // Try to avoid cases where sinking/duplicating is likely to increase register
1642   // pressure.
1643   if (!isa<ConstantInt>(AndI->getOperand(0)) &&
1644       !isa<ConstantInt>(AndI->getOperand(1)) &&
1645       AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
1646     return false;
1647 
1648   for (auto *U : AndI->users()) {
1649     Instruction *User = cast<Instruction>(U);
1650 
1651     // Only sink 'and' feeding icmp with 0.
1652     if (!isa<ICmpInst>(User))
1653       return false;
1654 
1655     auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
1656     if (!CmpC || !CmpC->isZero())
1657       return false;
1658   }
1659 
1660   if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
1661     return false;
1662 
1663   LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
1664   LLVM_DEBUG(AndI->getParent()->dump());
1665 
1666   // Push the 'and' into the same block as the icmp 0.  There should only be
1667   // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
1668   // others, so we don't need to keep track of which BBs we insert into.
1669   for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
1670        UI != E; ) {
1671     Use &TheUse = UI.getUse();
1672     Instruction *User = cast<Instruction>(*UI);
1673 
1674     // Preincrement use iterator so we don't invalidate it.
1675     ++UI;
1676 
1677     LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
1678 
1679     // Keep the 'and' in the same place if the use is already in the same block.
1680     Instruction *InsertPt =
1681         User->getParent() == AndI->getParent() ? AndI : User;
1682     Instruction *InsertedAnd =
1683         BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
1684                                AndI->getOperand(1), "", InsertPt);
1685     // Propagate the debug info.
1686     InsertedAnd->setDebugLoc(AndI->getDebugLoc());
1687 
1688     // Replace a use of the 'and' with a use of the new 'and'.
1689     TheUse = InsertedAnd;
1690     ++NumAndUses;
1691     LLVM_DEBUG(User->getParent()->dump());
1692   }
1693 
1694   // We removed all uses, nuke the and.
1695   AndI->eraseFromParent();
1696   return true;
1697 }
1698 
/// Check whether the candidate use could be combined with a shift instruction.
/// Valid candidates are:
/// 1. A truncate instruction
/// 2. An 'and' instruction whose immediate is a mask of the low bits:
///    imm & (imm+1) == 0
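/// For example, (and %x, 255) qualifies since 0xFF & 0x100 == 0, whereas
/// (and %x, 252) does not since 0xFC & 0xFD != 0.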
1704 static bool isExtractBitsCandidateUse(Instruction *User) {
1705   if (!isa<TruncInst>(User)) {
1706     if (User->getOpcode() != Instruction::And ||
1707         !isa<ConstantInt>(User->getOperand(1)))
1708       return false;
1709 
1710     const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();
1711 
1712     if ((Cimm & (Cimm + 1)).getBoolValue())
1713       return false;
1714   }
1715   return true;
1716 }
1717 
/// Sink both the shift and the truncate instruction into the blocks of the
/// truncate's users.
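/// For illustration (assumed example IR, with i16 not legal on the target):
///
///   BB1:
///     %shift = lshr i64 %arg, 32
///     %trunc = trunc i64 %shift to i16
///   BB2:
///     %cmp = icmp eq i16 %trunc, 0
/// ==>
///   BB2:
///     %shift1 = lshr i64 %arg, 32
///     %trunc1 = trunc i64 %shift1 to i16
///     %cmp = icmp eq i16 %trunc1, 0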
1719 static bool
1720 SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
1721                      DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
1722                      const TargetLowering &TLI, const DataLayout &DL) {
1723   BasicBlock *UserBB = User->getParent();
1724   DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
1725   auto *TruncI = cast<TruncInst>(User);
1726   bool MadeChange = false;
1727 
1728   for (Value::user_iterator TruncUI = TruncI->user_begin(),
1729                             TruncE = TruncI->user_end();
1730        TruncUI != TruncE;) {
1731 
1732     Use &TruncTheUse = TruncUI.getUse();
1733     Instruction *TruncUser = cast<Instruction>(*TruncUI);
    // Preincrement use iterator so we don't invalidate it.
    ++TruncUI;
1737 
1738     int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
1739     if (!ISDOpcode)
1740       continue;
1741 
1742     // If the use is actually a legal node, there will not be an
1743     // implicit truncate.
1744     // FIXME: always querying the result type is just an
1745     // approximation; some nodes' legality is determined by the
1746     // operand or other means. There's no good way to find out though.
1747     if (TLI.isOperationLegalOrCustom(
1748             ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
1749       continue;
1750 
1751     // Don't bother for PHI nodes.
1752     if (isa<PHINode>(TruncUser))
1753       continue;
1754 
1755     BasicBlock *TruncUserBB = TruncUser->getParent();
1756 
1757     if (UserBB == TruncUserBB)
1758       continue;
1759 
1760     BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
1761     CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
1762 
1763     if (!InsertedShift && !InsertedTrunc) {
1764       BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
1765       assert(InsertPt != TruncUserBB->end());
1766       // Sink the shift
1767       if (ShiftI->getOpcode() == Instruction::AShr)
1768         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1769                                                    "", &*InsertPt);
1770       else
1771         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1772                                                    "", &*InsertPt);
1773       InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
1774 
1775       // Sink the trunc
1776       BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
1777       TruncInsertPt++;
1778       assert(TruncInsertPt != TruncUserBB->end());
1779 
1780       InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
1781                                        TruncI->getType(), "", &*TruncInsertPt);
1782       InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());
1783 
1784       MadeChange = true;
1785 
1786       TruncTheUse = InsertedTrunc;
1787     }
1788   }
1789   return MadeChange;
1790 }
1791 
/// Sink the shift *right* instruction into user blocks if the uses could
/// potentially be combined with this shift instruction to generate a
/// BitExtract instruction. This is only applied if the architecture supports
/// BitExtract instructions. Here is an example:
1796 /// BB1:
1797 ///   %x.extract.shift = lshr i64 %arg1, 32
1798 /// BB2:
1799 ///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
1800 /// ==>
1801 ///
1802 /// BB2:
1803 ///   %x.extract.shift.1 = lshr i64 %arg1, 32
1804 ///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
1805 ///
1806 /// CodeGen will recognize the pattern in BB2 and generate BitExtract
1807 /// instruction.
1808 /// Return true if any changes are made.
1809 static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
1810                                 const TargetLowering &TLI,
1811                                 const DataLayout &DL) {
1812   BasicBlock *DefBB = ShiftI->getParent();
1813 
1814   /// Only insert instructions in each block once.
1815   DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
1816 
1817   bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
1818 
1819   bool MadeChange = false;
1820   for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
1821        UI != E;) {
1822     Use &TheUse = UI.getUse();
1823     Instruction *User = cast<Instruction>(*UI);
1824     // Preincrement use iterator so we don't invalidate it.
1825     ++UI;
1826 
1827     // Don't bother for PHI nodes.
1828     if (isa<PHINode>(User))
1829       continue;
1830 
1831     if (!isExtractBitsCandidateUse(User))
1832       continue;
1833 
1834     BasicBlock *UserBB = User->getParent();
1835 
1836     if (UserBB == DefBB) {
      // If the shift and the truncate instruction are in the same BB, the use
      // of the truncate (TruncUse) may still introduce another, implicit
      // truncate if the truncated type is not legal. In this case, we would
      // like to sink both the shift and the truncate to the BB of TruncUse.
      // For example:
      // BB1:
      //   i64 shift.result = lshr i64 opnd, imm
      //   trunc.result = trunc shift.result to i16
      //
      // BB2:
      //   ----> We will have an implicit truncate here if the architecture
      //   does not have an i16 compare.
      //   cmp i16 trunc.result, opnd2
      //
      // If the type of the truncate is legal, no truncate will be
      // introduced in other basic blocks.
      if (isa<TruncInst>(User) && shiftIsLegal &&
          !TLI.isTypeLegal(TLI.getValueType(DL, User->getType())))
1856         MadeChange =
1857             SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
1858 
1859       continue;
1860     }
1861     // If we have already inserted a shift into this block, use it.
1862     BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
1863 
1864     if (!InsertedShift) {
1865       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1866       assert(InsertPt != UserBB->end());
1867 
1868       if (ShiftI->getOpcode() == Instruction::AShr)
1869         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1870                                                    "", &*InsertPt);
1871       else
1872         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1873                                                    "", &*InsertPt);
1874       InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
1875 
1876       MadeChange = true;
1877     }
1878 
1879     // Replace a use of the shift with a use of the new shift.
1880     TheUse = InsertedShift;
1881   }
1882 
1883   // If we removed all uses, or there are none, nuke the shift.
1884   if (ShiftI->use_empty()) {
1885     salvageDebugInfo(*ShiftI);
1886     ShiftI->eraseFromParent();
1887     MadeChange = true;
1888   }
1889 
1890   return MadeChange;
1891 }
1892 
1893 /// If counting leading or trailing zeros is an expensive operation and a zero
1894 /// input is defined, add a check for zero to avoid calling the intrinsic.
1895 ///
1896 /// We want to transform:
1897 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
1898 ///
1899 /// into:
1900 ///   entry:
1901 ///     %cmpz = icmp eq i64 %A, 0
1902 ///     br i1 %cmpz, label %cond.end, label %cond.false
1903 ///   cond.false:
1904 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
1905 ///     br label %cond.end
1906 ///   cond.end:
1907 ///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
1908 ///
1909 /// If the transform is performed, return true and set ModifiedDT to true.
1910 static bool despeculateCountZeros(IntrinsicInst *CountZeros,
1911                                   const TargetLowering *TLI,
1912                                   const DataLayout *DL,
1913                                   bool &ModifiedDT) {
1914   // If a zero input is undefined, it doesn't make sense to despeculate that.
1915   if (match(CountZeros->getOperand(1), m_One()))
1916     return false;
1917 
1918   // If it's cheap to speculate, there's nothing to do.
1919   auto IntrinsicID = CountZeros->getIntrinsicID();
1920   if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
1921       (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
1922     return false;
1923 
1924   // Only handle legal scalar cases. Anything else requires too much work.
1925   Type *Ty = CountZeros->getType();
1926   unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
1927   if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
1928     return false;
1929 
1930   // The intrinsic will be sunk behind a compare against zero and branch.
1931   BasicBlock *StartBlock = CountZeros->getParent();
1932   BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
1933 
1934   // Create another block after the count zero intrinsic. A PHI will be added
1935   // in this block to select the result of the intrinsic or the bit-width
1936   // constant if the input to the intrinsic is zero.
1937   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
1938   BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
1939 
1940   // Set up a builder to create a compare, conditional branch, and PHI.
1941   IRBuilder<> Builder(CountZeros->getContext());
1942   Builder.SetInsertPoint(StartBlock->getTerminator());
1943   Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
1944 
1945   // Replace the unconditional branch that was created by the first split with
1946   // a compare against zero and a conditional branch.
1947   Value *Zero = Constant::getNullValue(Ty);
1948   Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
1949   Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
1950   StartBlock->getTerminator()->eraseFromParent();
1951 
1952   // Create a PHI in the end block to select either the output of the intrinsic
1953   // or the bit width of the operand.
1954   Builder.SetInsertPoint(&EndBlock->front());
1955   PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
1956   CountZeros->replaceAllUsesWith(PN);
1957   Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
1958   PN->addIncoming(BitWidth, StartBlock);
1959   PN->addIncoming(CountZeros, CallBlock);
1960 
1961   // We are explicitly handling the zero case, so we can set the intrinsic's
1962   // undefined zero argument to 'true'. This will also prevent reprocessing the
1963   // intrinsic; we only despeculate when a zero input is defined.
1964   CountZeros->setArgOperand(1, Builder.getTrue());
1965   ModifiedDT = true;
1966   return true;
1967 }
1968 
1969 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
1970   BasicBlock *BB = CI->getParent();
1971 
  // Lower inline assembly if we can.
  // If we found an inline asm expression, and if the target knows how to
  // lower it to normal LLVM code, do so now.
1975   if (CI->isInlineAsm()) {
1976     if (TLI->ExpandInlineAsm(CI)) {
1977       // Avoid invalidating the iterator.
1978       CurInstIterator = BB->begin();
1979       // Avoid processing instructions out of order, which could cause
1980       // reuse before a value is defined.
1981       SunkAddrs.clear();
1982       return true;
1983     }
1984     // Sink address computing for memory operands into the block.
1985     if (optimizeInlineAsmInst(CI))
1986       return true;
1987   }
1988 
  // Align the pointer arguments to this call if the target thinks it's a good
  // idea.
1991   unsigned MinSize, PrefAlign;
1992   if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
1993     for (auto &Arg : CI->arg_operands()) {
1994       // We want to align both objects whose address is used directly and
1995       // objects whose address is used in casts and GEPs, though it only makes
1996       // sense for GEPs if the offset is a multiple of the desired alignment and
1997       // if size - offset meets the size threshold.
1998       if (!Arg->getType()->isPointerTy())
1999         continue;
2000       APInt Offset(DL->getIndexSizeInBits(
2001                        cast<PointerType>(Arg->getType())->getAddressSpace()),
2002                    0);
2003       Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
2004       uint64_t Offset2 = Offset.getLimitedValue();
2005       if ((Offset2 & (PrefAlign-1)) != 0)
2006         continue;
2007       AllocaInst *AI;
2008       if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
2009           DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
2010         AI->setAlignment(Align(PrefAlign));
2011       // Global variables can only be aligned if they are defined in this
2012       // object (i.e. they are uniquely initialized in this object), and
2013       // over-aligning global variables that have an explicit section is
2014       // forbidden.
2015       GlobalVariable *GV;
2016       if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
2017           GV->getPointerAlignment(*DL) < PrefAlign &&
2018           DL->getTypeAllocSize(GV->getValueType()) >=
2019               MinSize + Offset2)
2020         GV->setAlignment(MaybeAlign(PrefAlign));
2021     }
    // If this is a memcpy (or similar), then we may be able to improve the
    // alignment.
2024     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
2025       Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
2026       MaybeAlign MIDestAlign = MI->getDestAlign();
2027       if (!MIDestAlign || DestAlign > *MIDestAlign)
2028         MI->setDestAlignment(DestAlign);
2029       if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
2030         MaybeAlign MTISrcAlign = MTI->getSourceAlign();
2031         Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
2032         if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
2033           MTI->setSourceAlignment(SrcAlign);
2034       }
2035     }
2036   }
2037 
2038   // If we have a cold call site, try to sink addressing computation into the
2039   // cold block.  This interacts with our handling for loads and stores to
2040   // ensure that we can fold all uses of a potential addressing computation
2041   // into their uses.  TODO: generalize this to work over profiling data
2042   if (CI->hasFnAttr(Attribute::Cold) &&
2043       !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
2044     for (auto &Arg : CI->arg_operands()) {
2045       if (!Arg->getType()->isPointerTy())
2046         continue;
2047       unsigned AS = Arg->getType()->getPointerAddressSpace();
2048       return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
2049     }
2050 
2051   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
2052   if (II) {
2053     switch (II->getIntrinsicID()) {
2054     default: break;
2055     case Intrinsic::assume: {
2056       Value *Operand = II->getOperand(0);
2057       II->eraseFromParent();
2058       // Prune the operand, it's most likely dead.
2059       resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
2060         RecursivelyDeleteTriviallyDeadInstructions(
2061             Operand, TLInfo, nullptr,
2062             [&](Value *V) { removeAllAssertingVHReferences(V); });
2063       });
2064       return true;
2065     }
2066 
2067     case Intrinsic::experimental_widenable_condition: {
      // Give up on future widening opportunities so that we can fold away dead
      // paths and merge blocks before going into block-local instruction
      // selection.
2071       if (II->use_empty()) {
2072         II->eraseFromParent();
2073         return true;
2074       }
2075       Constant *RetVal = ConstantInt::getTrue(II->getContext());
2076       resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
2077         replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
2078       });
2079       return true;
2080     }
2081     case Intrinsic::objectsize:
2082       llvm_unreachable("llvm.objectsize.* should have been lowered already");
2083     case Intrinsic::is_constant:
2084       llvm_unreachable("llvm.is.constant.* should have been lowered already");
2085     case Intrinsic::aarch64_stlxr:
2086     case Intrinsic::aarch64_stxr: {
2087       ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
2088       if (!ExtVal || !ExtVal->hasOneUse() ||
2089           ExtVal->getParent() == CI->getParent())
2090         return false;
2091       // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
2092       ExtVal->moveBefore(CI);
2093       // Mark this instruction as "inserted by CGP", so that other
2094       // optimizations don't touch it.
2095       InsertedInsts.insert(ExtVal);
2096       return true;
2097     }
2098 
2099     case Intrinsic::launder_invariant_group:
2100     case Intrinsic::strip_invariant_group: {
2101       Value *ArgVal = II->getArgOperand(0);
2102       auto it = LargeOffsetGEPMap.find(II);
2103       if (it != LargeOffsetGEPMap.end()) {
        // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
        // Make sure not to have to deal with iterator invalidation
        // after possibly adding ArgVal to LargeOffsetGEPMap.
        auto GEPs = std::move(it->second);
        LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
        LargeOffsetGEPMap.erase(II);
2110       }
2111 
2112       II->replaceAllUsesWith(ArgVal);
2113       II->eraseFromParent();
2114       return true;
2115     }
2116     case Intrinsic::cttz:
2117     case Intrinsic::ctlz:
2118       // If counting zeros is expensive, try to avoid it.
2119       return despeculateCountZeros(II, TLI, DL, ModifiedDT);
2120     case Intrinsic::fshl:
2121     case Intrinsic::fshr:
2122       return optimizeFunnelShift(II);
2123     case Intrinsic::dbg_value:
2124       return fixupDbgValue(II);
2125     case Intrinsic::vscale: {
2126       // If datalayout has no special restrictions on vector data layout,
2127       // replace `llvm.vscale` by an equivalent constant expression
2128       // to benefit from cheap constant propagation.
2129       Type *ScalableVectorTy =
2130           VectorType::get(Type::getInt8Ty(II->getContext()), 1, true);
2131       if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) {
2132         auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo());
2133         auto *One = ConstantInt::getSigned(II->getType(), 1);
2134         auto *CGep =
2135             ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One);
2136         II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType()));
2137         II->eraseFromParent();
2138         return true;
2139       }
2140       break;
2141     }
2142     case Intrinsic::masked_gather:
2143       return optimizeGatherScatterInst(II, II->getArgOperand(0));
2144     case Intrinsic::masked_scatter:
2145       return optimizeGatherScatterInst(II, II->getArgOperand(1));
2146     }
2147 
2148     SmallVector<Value *, 2> PtrOps;
2149     Type *AccessTy;
2150     if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
2151       while (!PtrOps.empty()) {
2152         Value *PtrVal = PtrOps.pop_back_val();
2153         unsigned AS = PtrVal->getType()->getPointerAddressSpace();
2154         if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
2155           return true;
2156       }
2157   }
2158 
2159   // From here on out we're working with named functions.
2160   if (!CI->getCalledFunction()) return false;
2161 
2162   // Lower all default uses of _chk calls.  This is very similar
2163   // to what InstCombineCalls does, but here we are only lowering calls
2164   // to fortified library functions (e.g. __memcpy_chk) that have the default
2165   // "don't know" as the objectsize.  Anything else should be left alone.
2166   FortifiedLibCallSimplifier Simplifier(TLInfo, true);
2167   IRBuilder<> Builder(CI);
2168   if (Value *V = Simplifier.optimizeCall(CI, Builder)) {
2169     CI->replaceAllUsesWith(V);
2170     CI->eraseFromParent();
2171     return true;
2172   }
2173 
2174   return false;
2175 }
2176 
2177 /// Look for opportunities to duplicate return instructions to the predecessor
2178 /// to enable tail call optimizations. The case it is currently looking for is:
2179 /// @code
2180 /// bb0:
2181 ///   %tmp0 = tail call i32 @f0()
2182 ///   br label %return
2183 /// bb1:
2184 ///   %tmp1 = tail call i32 @f1()
2185 ///   br label %return
2186 /// bb2:
2187 ///   %tmp2 = tail call i32 @f2()
2188 ///   br label %return
2189 /// return:
2190 ///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
2191 ///   ret i32 %retval
2192 /// @endcode
2193 ///
2194 /// =>
2195 ///
2196 /// @code
2197 /// bb0:
2198 ///   %tmp0 = tail call i32 @f0()
2199 ///   ret i32 %tmp0
2200 /// bb1:
2201 ///   %tmp1 = tail call i32 @f1()
2202 ///   ret i32 %tmp1
2203 /// bb2:
2204 ///   %tmp2 = tail call i32 @f2()
2205 ///   ret i32 %tmp2
2206 /// @endcode
2207 bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) {
2208   ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
2209   if (!RetI)
2210     return false;
2211 
2212   PHINode *PN = nullptr;
2213   ExtractValueInst *EVI = nullptr;
2214   BitCastInst *BCI = nullptr;
2215   Value *V = RetI->getReturnValue();
2216   if (V) {
2217     BCI = dyn_cast<BitCastInst>(V);
2218     if (BCI)
2219       V = BCI->getOperand(0);
2220 
2221     EVI = dyn_cast<ExtractValueInst>(V);
2222     if (EVI) {
2223       V = EVI->getOperand(0);
2224       if (!std::all_of(EVI->idx_begin(), EVI->idx_end(),
2225                        [](unsigned idx) { return idx == 0; }))
2226         return false;
2227     }
2228 
2229     PN = dyn_cast<PHINode>(V);
2230     if (!PN)
2231       return false;
2232   }
2233 
2234   if (PN && PN->getParent() != BB)
2235     return false;
2236 
2237   // Make sure there are no instructions between the PHI and return, or that the
2238   // return is the first instruction in the block.
2239   if (PN) {
2240     BasicBlock::iterator BI = BB->begin();
    // Skip over debug intrinsics, pseudo probes, the bitcast, and the
    // extractvalue.
2242     do {
2243       ++BI;
2244     } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI ||
2245              isa<PseudoProbeInst>(BI));
2246     if (&*BI != RetI)
2247       return false;
2248   } else {
2249     if (BB->getFirstNonPHIOrDbg(true) != RetI)
2250       return false;
2251   }
2252 
2253   /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
2254   /// call.
2255   const Function *F = BB->getParent();
2256   SmallVector<BasicBlock*, 4> TailCallBBs;
2257   if (PN) {
2258     for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
2259       // Look through bitcasts.
2260       Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();
2261       CallInst *CI = dyn_cast<CallInst>(IncomingVal);
2262       BasicBlock *PredBB = PN->getIncomingBlock(I);
2263       // Make sure the phi value is indeed produced by the tail call.
2264       if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&
2265           TLI->mayBeEmittedAsTailCall(CI) &&
2266           attributesPermitTailCall(F, CI, RetI, *TLI))
2267         TailCallBBs.push_back(PredBB);
2268     }
2269   } else {
2270     SmallPtrSet<BasicBlock*, 4> VisitedBBs;
2271     for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
2272       if (!VisitedBBs.insert(*PI).second)
2273         continue;
2274       if (Instruction *I = (*PI)->rbegin()->getPrevNonDebugInstruction(true)) {
2275         CallInst *CI = dyn_cast<CallInst>(I);
2276         if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
2277             attributesPermitTailCall(F, CI, RetI, *TLI))
2278           TailCallBBs.push_back(*PI);
2279       }
2280     }
2281   }
2282 
2283   bool Changed = false;
2284   for (auto const &TailCallBB : TailCallBBs) {
2285     // Make sure the call instruction is followed by an unconditional branch to
2286     // the return block.
2287     BranchInst *BI = dyn_cast<BranchInst>(TailCallBB->getTerminator());
2288     if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
2289       continue;
2290 
2291     // Duplicate the return into TailCallBB.
2292     (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
2293     assert(!VerifyBFIUpdates ||
2294            BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
2295     BFI->setBlockFreq(
2296         BB,
2297         (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
2298     ModifiedDT = Changed = true;
2299     ++NumRetsDup;
2300   }
2301 
2302   // If we eliminated all predecessors of the block, delete the block now.
2303   if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
2304     BB->eraseFromParent();
2305 
2306   return Changed;
2307 }
2308 
2309 //===----------------------------------------------------------------------===//
2310 // Memory Optimization
2311 //===----------------------------------------------------------------------===//
2312 
2313 namespace {
2314 
2315 /// This is an extended version of TargetLowering::AddrMode
2316 /// which holds actual Value*'s for register values.
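/// For example (illustrative), an address such as &p[i] with 4-byte elements
/// might be represented as { BaseReg = %p, ScaledReg = %i, Scale = 4,
/// BaseOffs = 0 }.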
2317 struct ExtAddrMode : public TargetLowering::AddrMode {
2318   Value *BaseReg = nullptr;
2319   Value *ScaledReg = nullptr;
2320   Value *OriginalValue = nullptr;
2321   bool InBounds = true;
2322 
2323   enum FieldName {
2324     NoField        = 0x00,
2325     BaseRegField   = 0x01,
2326     BaseGVField    = 0x02,
2327     BaseOffsField  = 0x04,
2328     ScaledRegField = 0x08,
2329     ScaleField     = 0x10,
2330     MultipleFields = 0xff
2331   };
2332 
2333 
2334   ExtAddrMode() = default;
2335 
2336   void print(raw_ostream &OS) const;
2337   void dump() const;
2338 
2339   FieldName compare(const ExtAddrMode &other) {
    // First check that the types are the same on each field, as differing
    // types are something we can't cope with later on.
2342     if (BaseReg && other.BaseReg &&
2343         BaseReg->getType() != other.BaseReg->getType())
2344       return MultipleFields;
2345     if (BaseGV && other.BaseGV &&
2346         BaseGV->getType() != other.BaseGV->getType())
2347       return MultipleFields;
2348     if (ScaledReg && other.ScaledReg &&
2349         ScaledReg->getType() != other.ScaledReg->getType())
2350       return MultipleFields;
2351 
2352     // Conservatively reject 'inbounds' mismatches.
2353     if (InBounds != other.InBounds)
2354       return MultipleFields;
2355 
2356     // Check each field to see if it differs.
2357     unsigned Result = NoField;
2358     if (BaseReg != other.BaseReg)
2359       Result |= BaseRegField;
2360     if (BaseGV != other.BaseGV)
2361       Result |= BaseGVField;
2362     if (BaseOffs != other.BaseOffs)
2363       Result |= BaseOffsField;
2364     if (ScaledReg != other.ScaledReg)
2365       Result |= ScaledRegField;
2366     // Don't count 0 as being a different scale, because that actually means
2367     // unscaled (which will already be counted by having no ScaledReg).
2368     if (Scale && other.Scale && Scale != other.Scale)
2369       Result |= ScaleField;
2370 
2371     if (countPopulation(Result) > 1)
2372       return MultipleFields;
2373     else
2374       return static_cast<FieldName>(Result);
2375   }
2376 
  // An AddrMode is trivial if it involves no calculation, i.e. it is just a
  // base with no offset.
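  // For example (illustrative), { BaseReg = %p } alone is trivial, while
  // { BaseReg = %p, BaseOffs = 4 } is not.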
2379   bool isTrivial() {
2380     // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
2381     // trivial if at most one of these terms is nonzero, except that BaseGV and
2382     // BaseReg both being zero actually means a null pointer value, which we
2383     // consider to be 'non-zero' here.
2384     return !BaseOffs && !Scale && !(BaseGV && BaseReg);
2385   }
2386 
2387   Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {
2388     switch (Field) {
2389     default:
2390       return nullptr;
2391     case BaseRegField:
2392       return BaseReg;
2393     case BaseGVField:
2394       return BaseGV;
2395     case ScaledRegField:
2396       return ScaledReg;
2397     case BaseOffsField:
2398       return ConstantInt::get(IntPtrTy, BaseOffs);
2399     }
2400   }
2401 
2402   void SetCombinedField(FieldName Field, Value *V,
2403                         const SmallVectorImpl<ExtAddrMode> &AddrModes) {
2404     switch (Field) {
2405     default:
2406       llvm_unreachable("Unhandled fields are expected to be rejected earlier");
2407       break;
2408     case ExtAddrMode::BaseRegField:
2409       BaseReg = V;
2410       break;
2411     case ExtAddrMode::BaseGVField:
2412       // A combined BaseGV is an Instruction, not a GlobalValue, so it goes
2413       // in the BaseReg field.
2414       assert(BaseReg == nullptr);
2415       BaseReg = V;
2416       BaseGV = nullptr;
2417       break;
2418     case ExtAddrMode::ScaledRegField:
2419       ScaledReg = V;
2420       // If we have a mix of scaled and unscaled addrmodes then we want scale
2421       // to be the scale and not zero.
2422       if (!Scale)
2423         for (const ExtAddrMode &AM : AddrModes)
2424           if (AM.Scale) {
2425             Scale = AM.Scale;
2426             break;
2427           }
2428       break;
2429     case ExtAddrMode::BaseOffsField:
2430       // The offset is no longer a constant, so it goes in ScaledReg with a
2431       // scale of 1.
2432       assert(ScaledReg == nullptr);
2433       ScaledReg = V;
2434       Scale = 1;
2435       BaseOffs = 0;
2436       break;
2437     }
2438   }
2439 };
2440 
2441 } // end anonymous namespace
2442 
2443 #ifndef NDEBUG
2444 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
2445   AM.print(OS);
2446   return OS;
2447 }
2448 #endif
2449 
2450 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2451 void ExtAddrMode::print(raw_ostream &OS) const {
2452   bool NeedPlus = false;
2453   OS << "[";
2454   if (InBounds)
2455     OS << "inbounds ";
2456   if (BaseGV) {
2457     OS << (NeedPlus ? " + " : "")
2458        << "GV:";
2459     BaseGV->printAsOperand(OS, /*PrintType=*/false);
2460     NeedPlus = true;
2461   }
2462 
2463   if (BaseOffs) {
2464     OS << (NeedPlus ? " + " : "")
2465        << BaseOffs;
2466     NeedPlus = true;
2467   }
2468 
2469   if (BaseReg) {
2470     OS << (NeedPlus ? " + " : "")
2471        << "Base:";
2472     BaseReg->printAsOperand(OS, /*PrintType=*/false);
2473     NeedPlus = true;
2474   }
2475   if (Scale) {
2476     OS << (NeedPlus ? " + " : "")
2477        << Scale << "*";
2478     ScaledReg->printAsOperand(OS, /*PrintType=*/false);
2479   }
2480 
2481   OS << ']';
2482 }
2483 
2484 LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
2485   print(dbgs());
2486   dbgs() << '\n';
2487 }
2488 #endif
2489 
2490 namespace {
2491 
2492 /// This class provides transaction based operation on the IR.
2493 /// Every change made through this class is recorded in the internal state and
2494 /// can be undone (rollback) until commit is called.
2495 /// CGP does not check if instructions could be speculatively executed when
2496 /// moved. Preserving the original location would pessimize the debugging
2497 /// experience, as well as negatively impact the quality of sample PGO.
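///
/// A rough usage sketch (assuming the restoration-point interface declared
/// later in this class):
///
///   TypePromotionTransaction TPT(RemovedInsts);
///   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
///       TPT.getRestorationPoint();
///   // ... speculatively mutate the IR through TPT ...
///   if (NotProfitable)
///     TPT.rollback(LastKnownGood);
///   else
///     TPT.commit();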
2498 class TypePromotionTransaction {
2499   /// This represents the common interface of the individual transaction.
2500   /// Each class implements the logic for doing one specific modification on
2501   /// the IR via the TypePromotionTransaction.
2502   class TypePromotionAction {
2503   protected:
2504     /// The Instruction modified.
2505     Instruction *Inst;
2506 
2507   public:
2508     /// Constructor of the action.
2509     /// The constructor performs the related action on the IR.
2510     TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
2511 
2512     virtual ~TypePromotionAction() = default;
2513 
2514     /// Undo the modification done by this action.
2515     /// When this method is called, the IR must be in the same state as it was
2516     /// before this action was applied.
2517     /// \pre Undoing the action works if and only if the IR is in the exact same
2518     /// state as it was directly after this action was applied.
2519     virtual void undo() = 0;
2520 
    /// Commit every change made by this action.
2522     /// When the results on the IR of the action are to be kept, it is important
2523     /// to call this function, otherwise hidden information may be kept forever.
2524     virtual void commit() {
2525       // Nothing to be done, this action is not doing anything.
2526     }
2527   };
2528 
2529   /// Utility to remember the position of an instruction.
2530   class InsertionHandler {
2531     /// Position of an instruction.
2532     /// Either an instruction:
2533     /// - Is the first in a basic block: BB is used.
2534     /// - Has a previous instruction: PrevInst is used.
2535     union {
2536       Instruction *PrevInst;
2537       BasicBlock *BB;
2538     } Point;
2539 
2540     /// Remember whether or not the instruction had a previous instruction.
2541     bool HasPrevInstruction;
2542 
2543   public:
2544     /// Record the position of \p Inst.
2545     InsertionHandler(Instruction *Inst) {
2546       BasicBlock::iterator It = Inst->getIterator();
2547       HasPrevInstruction = (It != (Inst->getParent()->begin()));
2548       if (HasPrevInstruction)
2549         Point.PrevInst = &*--It;
2550       else
2551         Point.BB = Inst->getParent();
2552     }
2553 
2554     /// Insert \p Inst at the recorded position.
2555     void insert(Instruction *Inst) {
2556       if (HasPrevInstruction) {
2557         if (Inst->getParent())
2558           Inst->removeFromParent();
2559         Inst->insertAfter(Point.PrevInst);
2560       } else {
2561         Instruction *Position = &*Point.BB->getFirstInsertionPt();
2562         if (Inst->getParent())
2563           Inst->moveBefore(Position);
2564         else
2565           Inst->insertBefore(Position);
2566       }
2567     }
2568   };
2569 
2570   /// Move an instruction before another.
2571   class InstructionMoveBefore : public TypePromotionAction {
2572     /// Original position of the instruction.
2573     InsertionHandler Position;
2574 
2575   public:
2576     /// Move \p Inst before \p Before.
2577     InstructionMoveBefore(Instruction *Inst, Instruction *Before)
2578         : TypePromotionAction(Inst), Position(Inst) {
2579       LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before
2580                         << "\n");
2581       Inst->moveBefore(Before);
2582     }
2583 
2584     /// Move the instruction back to its original position.
2585     void undo() override {
2586       LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
2587       Position.insert(Inst);
2588     }
2589   };
2590 
2591   /// Set the operand of an instruction with a new value.
2592   class OperandSetter : public TypePromotionAction {
2593     /// Original operand of the instruction.
2594     Value *Origin;
2595 
    /// Index of the modified operand.
2597     unsigned Idx;
2598 
2599   public:
2600     /// Set \p Idx operand of \p Inst with \p NewVal.
2601     OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
2602         : TypePromotionAction(Inst), Idx(Idx) {
2603       LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
2604                         << "for:" << *Inst << "\n"
2605                         << "with:" << *NewVal << "\n");
2606       Origin = Inst->getOperand(Idx);
2607       Inst->setOperand(Idx, NewVal);
2608     }
2609 
2610     /// Restore the original value of the instruction.
2611     void undo() override {
2612       LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
2613                         << "for: " << *Inst << "\n"
2614                         << "with: " << *Origin << "\n");
2615       Inst->setOperand(Idx, Origin);
2616     }
2617   };
2618 
2619   /// Hide the operands of an instruction.
2620   /// Do as if this instruction was not using any of its operands.
2621   class OperandsHider : public TypePromotionAction {
2622     /// The list of original operands.
2623     SmallVector<Value *, 4> OriginalValues;
2624 
2625   public:
2626     /// Remove \p Inst from the uses of the operands of \p Inst.
2627     OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
2628       LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
2629       unsigned NumOpnds = Inst->getNumOperands();
2630       OriginalValues.reserve(NumOpnds);
2631       for (unsigned It = 0; It < NumOpnds; ++It) {
2632         // Save the current operand.
2633         Value *Val = Inst->getOperand(It);
2634         OriginalValues.push_back(Val);
2635         // Set a dummy one.
2636         // We could use OperandSetter here, but that would imply an overhead
2637         // that we are not willing to pay.
2638         Inst->setOperand(It, UndefValue::get(Val->getType()));
2639       }
2640     }
2641 
2642     /// Restore the original list of uses.
2643     void undo() override {
2644       LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
2645       for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
2646         Inst->setOperand(It, OriginalValues[It]);
2647     }
2648   };
2649 
2650   /// Build a truncate instruction.
2651   class TruncBuilder : public TypePromotionAction {
2652     Value *Val;
2653 
2654   public:
2655     /// Build a truncate instruction of \p Opnd producing a \p Ty
2656     /// result.
2657     /// trunc Opnd to Ty.
2658     TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
2659       IRBuilder<> Builder(Opnd);
2660       Builder.SetCurrentDebugLocation(DebugLoc());
2661       Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
2662       LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
2663     }
2664 
2665     /// Get the built value.
2666     Value *getBuiltValue() { return Val; }
2667 
2668     /// Remove the built instruction.
2669     void undo() override {
2670       LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
2671       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2672         IVal->eraseFromParent();
2673     }
2674   };
2675 
2676   /// Build a sign extension instruction.
2677   class SExtBuilder : public TypePromotionAction {
2678     Value *Val;
2679 
2680   public:
2681     /// Build a sign extension instruction of \p Opnd producing a \p Ty
2682     /// result.
2683     /// sext Opnd to Ty.
2684     SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2685         : TypePromotionAction(InsertPt) {
2686       IRBuilder<> Builder(InsertPt);
2687       Val = Builder.CreateSExt(Opnd, Ty, "promoted");
2688       LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
2689     }
2690 
2691     /// Get the built value.
2692     Value *getBuiltValue() { return Val; }
2693 
2694     /// Remove the built instruction.
2695     void undo() override {
2696       LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
2697       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2698         IVal->eraseFromParent();
2699     }
2700   };
2701 
2702   /// Build a zero extension instruction.
2703   class ZExtBuilder : public TypePromotionAction {
2704     Value *Val;
2705 
2706   public:
2707     /// Build a zero extension instruction of \p Opnd producing a \p Ty
2708     /// result.
2709     /// zext Opnd to Ty.
2710     ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2711         : TypePromotionAction(InsertPt) {
2712       IRBuilder<> Builder(InsertPt);
2713       Builder.SetCurrentDebugLocation(DebugLoc());
2714       Val = Builder.CreateZExt(Opnd, Ty, "promoted");
2715       LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
2716     }
2717 
2718     /// Get the built value.
2719     Value *getBuiltValue() { return Val; }
2720 
2721     /// Remove the built instruction.
2722     void undo() override {
2723       LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
2724       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2725         IVal->eraseFromParent();
2726     }
2727   };
2728 
2729   /// Mutate an instruction to another type.
2730   class TypeMutator : public TypePromotionAction {
2731     /// Record the original type.
2732     Type *OrigTy;
2733 
2734   public:
2735     /// Mutate the type of \p Inst into \p NewTy.
2736     TypeMutator(Instruction *Inst, Type *NewTy)
2737         : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
2738       LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
2739                         << "\n");
2740       Inst->mutateType(NewTy);
2741     }
2742 
2743     /// Mutate the instruction back to its original type.
2744     void undo() override {
2745       LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
2746                         << "\n");
2747       Inst->mutateType(OrigTy);
2748     }
2749   };
2750 
2751   /// Replace the uses of an instruction by another instruction.
2752   class UsesReplacer : public TypePromotionAction {
2753     /// Helper structure to keep track of the replaced uses.
2754     struct InstructionAndIdx {
      /// The instruction that uses the original instruction.
2756       Instruction *Inst;
2757 
      /// The operand index at which the original instruction is used by Inst.
2759       unsigned Idx;
2760 
2761       InstructionAndIdx(Instruction *Inst, unsigned Idx)
2762           : Inst(Inst), Idx(Idx) {}
2763     };
2764 
2765     /// Keep track of the original uses (pair Instruction, Index).
2766     SmallVector<InstructionAndIdx, 4> OriginalUses;
2767     /// Keep track of the debug users.
2768     SmallVector<DbgValueInst *, 1> DbgValues;
2769 
2770     using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;
2771 
2772   public:
2773     /// Replace all the use of \p Inst by \p New.
2774     UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
2775       LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
2776                         << "\n");
2777       // Record the original uses.
2778       for (Use &U : Inst->uses()) {
2779         Instruction *UserI = cast<Instruction>(U.getUser());
2780         OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
2781       }
2782       // Record the debug uses separately. They are not in the instruction's
2783       // use list, but they are replaced by RAUW.
2784       findDbgValues(DbgValues, Inst);
2785 
2786       // Now, we can replace the uses.
2787       Inst->replaceAllUsesWith(New);
2788     }
2789 
    /// Reassign the original uses of Inst back to Inst.
2791     void undo() override {
2792       LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
2793       for (use_iterator UseIt = OriginalUses.begin(),
2794                         EndIt = OriginalUses.end();
2795            UseIt != EndIt; ++UseIt) {
2796         UseIt->Inst->setOperand(UseIt->Idx, Inst);
2797       }
2798       // RAUW has replaced all original uses with references to the new value,
2799       // including the debug uses. Since we are undoing the replacements,
2800       // the original debug uses must also be reinstated to maintain the
2801       // correctness and utility of debug value instructions.
2802       for (auto *DVI: DbgValues) {
2803         LLVMContext &Ctx = Inst->getType()->getContext();
2804         auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst));
2805         DVI->setOperand(0, MV);
2806       }
2807     }
2808   };
2809 
2810   /// Remove an instruction from the IR.
2811   class InstructionRemover : public TypePromotionAction {
2812     /// Original position of the instruction.
2813     InsertionHandler Inserter;
2814 
    /// Helper structure to hide all the links to the instruction. In other
    /// words, this helps behave as if the instruction was removed.
2817     OperandsHider Hider;
2818 
2819     /// Keep track of the uses replaced, if any.
2820     UsesReplacer *Replacer = nullptr;
2821 
2822     /// Keep track of instructions removed.
2823     SetOfInstrs &RemovedInsts;
2824 
2825   public:
    /// Remove all references to \p Inst and optionally replace all its
2827     /// uses with New.
2828     /// \p RemovedInsts Keep track of the instructions removed by this Action.
2829     /// \pre If !Inst->use_empty(), then New != nullptr
2830     InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
2831                        Value *New = nullptr)
2832         : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
2833           RemovedInsts(RemovedInsts) {
2834       if (New)
2835         Replacer = new UsesReplacer(Inst, New);
2836       LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
2837       RemovedInsts.insert(Inst);
2838       /// The instructions removed here will be freed after completing
2839       /// optimizeBlock() for all blocks as we need to keep track of the
2840       /// removed instructions during promotion.
2841       Inst->removeFromParent();
2842     }
2843 
2844     ~InstructionRemover() override { delete Replacer; }
2845 
    /// Resurrect the instruction and reassign it to the proper uses if a
    /// new value was provided when building this action.
2848     void undo() override {
2849       LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
2850       Inserter.insert(Inst);
2851       if (Replacer)
2852         Replacer->undo();
2853       Hider.undo();
2854       RemovedInsts.erase(Inst);
2855     }
2856   };
2857 
2858 public:
2859   /// Restoration point.
2860   /// The restoration point is a pointer to an action instead of an iterator
2861   /// because the iterator may be invalidated but not the pointer.
2862   using ConstRestorationPt = const TypePromotionAction *;
2863 
2864   TypePromotionTransaction(SetOfInstrs &RemovedInsts)
2865       : RemovedInsts(RemovedInsts) {}
2866 
  /// Commit every change made in this transaction. Return true if any change
  /// happened.
2869   bool commit();
2870 
2871   /// Undo all the changes made after the given point.
2872   void rollback(ConstRestorationPt Point);
2873 
2874   /// Get the current restoration point.
2875   ConstRestorationPt getRestorationPoint() const;
2876 
2877   /// \name API for IR modification with state keeping to support rollback.
2878   /// @{
2879   /// Same as Instruction::setOperand.
2880   void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
2881 
2882   /// Same as Instruction::eraseFromParent.
2883   void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
2884 
2885   /// Same as Value::replaceAllUsesWith.
2886   void replaceAllUsesWith(Instruction *Inst, Value *New);
2887 
2888   /// Same as Value::mutateType.
2889   void mutateType(Instruction *Inst, Type *NewTy);
2890 
  /// Same as IRBuilder::CreateTrunc.
2892   Value *createTrunc(Instruction *Opnd, Type *Ty);
2893 
  /// Same as IRBuilder::CreateSExt.
2895   Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
2896 
  /// Same as IRBuilder::CreateZExt.
2898   Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
2899 
2900   /// Same as Instruction::moveBefore.
2901   void moveBefore(Instruction *Inst, Instruction *Before);
2902   /// @}
2903 
2904 private:
2905   /// The ordered list of actions made so far.
2906   SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
2907 
  using CommitPt =
      SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;
2909 
2910   SetOfInstrs &RemovedInsts;
2911 };
2912 
2913 } // end anonymous namespace
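
// Illustrative sketch (not part of the pass logic): a client of
// TypePromotionTransaction typically takes a restoration point, performs
// speculative IR updates through the TPT wrappers, and then either rolls the
// changes back or commits them. The variable names below are hypothetical.
//
//   TypePromotionTransaction TPT(RemovedInsts);
//   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
//       TPT.getRestorationPoint();
//   TPT.mutateType(Inst, NewTy);                  // recorded as a TypeMutator
//   Value *Trunc = TPT.createTrunc(Inst, OrigTy); // recorded as a TruncBuilder
//   if (!TransformIsProfitable)
//     TPT.rollback(LastKnownGood); // undo recorded actions in reverse order
//   else
//     TPT.commit();                // make the changes permanent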
2914 
2915 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
2916                                           Value *NewVal) {
2917   Actions.push_back(std::make_unique<TypePromotionTransaction::OperandSetter>(
2918       Inst, Idx, NewVal));
2919 }
2920 
2921 void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
2922                                                 Value *NewVal) {
2923   Actions.push_back(
2924       std::make_unique<TypePromotionTransaction::InstructionRemover>(
2925           Inst, RemovedInsts, NewVal));
2926 }
2927 
2928 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
2929                                                   Value *New) {
2930   Actions.push_back(
2931       std::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
2932 }
2933 
2934 void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
2935   Actions.push_back(
2936       std::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
2937 }
2938 
2939 Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
2940                                              Type *Ty) {
2941   std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
2942   Value *Val = Ptr->getBuiltValue();
2943   Actions.push_back(std::move(Ptr));
2944   return Val;
2945 }
2946 
2947 Value *TypePromotionTransaction::createSExt(Instruction *Inst,
2948                                             Value *Opnd, Type *Ty) {
2949   std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
2950   Value *Val = Ptr->getBuiltValue();
2951   Actions.push_back(std::move(Ptr));
2952   return Val;
2953 }
2954 
2955 Value *TypePromotionTransaction::createZExt(Instruction *Inst,
2956                                             Value *Opnd, Type *Ty) {
2957   std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
2958   Value *Val = Ptr->getBuiltValue();
2959   Actions.push_back(std::move(Ptr));
2960   return Val;
2961 }
2962 
2963 void TypePromotionTransaction::moveBefore(Instruction *Inst,
2964                                           Instruction *Before) {
2965   Actions.push_back(
2966       std::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
2967           Inst, Before));
2968 }
2969 
2970 TypePromotionTransaction::ConstRestorationPt
2971 TypePromotionTransaction::getRestorationPoint() const {
2972   return !Actions.empty() ? Actions.back().get() : nullptr;
2973 }
2974 
2975 bool TypePromotionTransaction::commit() {
2976   for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
2977        ++It)
2978     (*It)->commit();
2979   bool Modified = !Actions.empty();
2980   Actions.clear();
2981   return Modified;
2982 }
2983 
2984 void TypePromotionTransaction::rollback(
2985     TypePromotionTransaction::ConstRestorationPt Point) {
2986   while (!Actions.empty() && Point != Actions.back().get()) {
2987     std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
2988     Curr->undo();
2989   }
2990 }
2991 
2992 namespace {
2993 
2994 /// A helper class for matching addressing modes.
2995 ///
2996 /// This encapsulates the logic for matching the target-legal addressing modes.
2997 class AddressingModeMatcher {
2998   SmallVectorImpl<Instruction*> &AddrModeInsts;
2999   const TargetLowering &TLI;
3000   const TargetRegisterInfo &TRI;
3001   const DataLayout &DL;
3002 
3003   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
3004   /// the memory instruction that we're computing this address for.
3005   Type *AccessTy;
3006   unsigned AddrSpace;
3007   Instruction *MemoryInst;
3008 
3009   /// This is the addressing mode that we're building up. This is
3010   /// part of the return value of this addressing mode matching stuff.
3011   ExtAddrMode &AddrMode;
3012 
3013   /// The instructions inserted by other CodeGenPrepare optimizations.
3014   const SetOfInstrs &InsertedInsts;
3015 
3016   /// A map from the instructions to their type before promotion.
3017   InstrToOrigTy &PromotedInsts;
3018 
3019   /// The ongoing transaction where every action should be registered.
3020   TypePromotionTransaction &TPT;
3021 
  /// A GEP whose offset is too large to be folded into the addressing mode.
3023   std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;
3024 
3025   /// This is set to true when we should not do profitability checks.
  /// When true, isProfitableToFoldIntoAddressingMode always returns true.
3027   bool IgnoreProfitability;
3028 
3029   /// True if we are optimizing for size.
3030   bool OptSize;
3031 
3032   ProfileSummaryInfo *PSI;
3033   BlockFrequencyInfo *BFI;
3034 
3035   AddressingModeMatcher(
3036       SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
3037       const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
3038       ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
3039       InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
3040       std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
3041       bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
3042       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
3043         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
3044         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
3045         PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
3046         OptSize(OptSize), PSI(PSI), BFI(BFI) {
3047     IgnoreProfitability = false;
3048   }
3049 
3050 public:
  /// Find the maximal addressing mode that a load/store of V can fold,
  /// given an access type of AccessTy.  This returns a list of involved
  /// instructions in AddrModeInsts.
  /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
  /// optimizations.
  /// \p PromotedInsts maps the instructions to their type before promotion.
  /// \p TPT The ongoing transaction where every action should be registered.
3058   static ExtAddrMode
3059   Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
3060         SmallVectorImpl<Instruction *> &AddrModeInsts,
3061         const TargetLowering &TLI, const TargetRegisterInfo &TRI,
3062         const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
3063         TypePromotionTransaction &TPT,
3064         std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
3065         bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
3066     ExtAddrMode Result;
3067 
3068     bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
3069                                          MemoryInst, Result, InsertedInsts,
3070                                          PromotedInsts, TPT, LargeOffsetGEP,
3071                                          OptSize, PSI, BFI)
3072                        .matchAddr(V, 0);
3073     (void)Success; assert(Success && "Couldn't select *anything*?");
3074     return Result;
3075   }
3076 
3077 private:
3078   bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
3079   bool matchAddr(Value *Addr, unsigned Depth);
3080   bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
3081                           bool *MovedAway = nullptr);
3082   bool isProfitableToFoldIntoAddressingMode(Instruction *I,
3083                                             ExtAddrMode &AMBefore,
3084                                             ExtAddrMode &AMAfter);
3085   bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
3086   bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
3087                              Value *PromotedOperand) const;
3088 };
3089 
3090 class PhiNodeSet;
3091 
3092 /// An iterator for PhiNodeSet.
3093 class PhiNodeSetIterator {
3094   PhiNodeSet * const Set;
3095   size_t CurrentIndex = 0;
3096 
3097 public:
  /// The constructor. \p Start should be the index of a valid element, or be
  /// equal to the size of the underlying SmallVector of the PhiNodeSet.
3100   PhiNodeSetIterator(PhiNodeSet * const Set, size_t Start);
3101   PHINode * operator*() const;
3102   PhiNodeSetIterator& operator++();
3103   bool operator==(const PhiNodeSetIterator &RHS) const;
3104   bool operator!=(const PhiNodeSetIterator &RHS) const;
3105 };
3106 
3107 /// Keeps a set of PHINodes.
3108 ///
3109 /// This is a minimal set implementation for a specific use case:
3110 /// It is very fast when there are very few elements, but also provides good
3111 /// performance when there are many. It is similar to SmallPtrSet, but also
3112 /// provides iteration by insertion order, which is deterministic and stable
/// across runs. It is also similar to SmallSetVector, but supports removing
/// elements in O(1) time. This is achieved by not actually removing the
/// element from the underlying vector, which comes at the cost of using more
/// memory, but that is fine, since PhiNodeSets are used as short-lived
/// objects.
3117 class PhiNodeSet {
3118   friend class PhiNodeSetIterator;
3119 
3120   using MapType = SmallDenseMap<PHINode *, size_t, 32>;
  using iterator = PhiNodeSetIterator;
3122 
3123   /// Keeps the elements in the order of their insertion in the underlying
3124   /// vector. To achieve constant time removal, it never deletes any element.
3125   SmallVector<PHINode *, 32> NodeList;
3126 
3127   /// Keeps the elements in the underlying set implementation. This (and not the
3128   /// NodeList defined above) is the source of truth on whether an element
3129   /// is actually in the collection.
3130   MapType NodeMap;
3131 
3132   /// Points to the first valid (not deleted) element when the set is not empty
  /// and the value is not zero. Equals the size of the underlying vector
3134   /// when the set is empty. When the value is 0, as in the beginning, the
3135   /// first element may or may not be valid.
3136   size_t FirstValidElement = 0;
3137 
3138 public:
  /// Inserts a new element into the collection.
3140   /// \returns true if the element is actually added, i.e. was not in the
3141   /// collection before the operation.
3142   bool insert(PHINode *Ptr) {
3143     if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) {
3144       NodeList.push_back(Ptr);
3145       return true;
3146     }
3147     return false;
3148   }
3149 
3150   /// Removes the element from the collection.
3151   /// \returns whether the element is actually removed, i.e. was in the
3152   /// collection before the operation.
3153   bool erase(PHINode *Ptr) {
3154     auto it = NodeMap.find(Ptr);
3155     if (it != NodeMap.end()) {
3156       NodeMap.erase(Ptr);
3157       SkipRemovedElements(FirstValidElement);
3158       return true;
3159     }
3160     return false;
3161   }
3162 
3163   /// Removes all elements and clears the collection.
3164   void clear() {
3165     NodeMap.clear();
3166     NodeList.clear();
3167     FirstValidElement = 0;
3168   }
3169 
3170   /// \returns an iterator that will iterate the elements in the order of
3171   /// insertion.
3172   iterator begin() {
3173     if (FirstValidElement == 0)
3174       SkipRemovedElements(FirstValidElement);
3175     return PhiNodeSetIterator(this, FirstValidElement);
3176   }
3177 
3178   /// \returns an iterator that points to the end of the collection.
3179   iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }
3180 
3181   /// Returns the number of elements in the collection.
3182   size_t size() const {
3183     return NodeMap.size();
3184   }
3185 
  /// \returns 1 if the given element is in the collection, and 0 otherwise.
3187   size_t count(PHINode *Ptr) const {
3188     return NodeMap.count(Ptr);
3189   }
3190 
3191 private:
3192   /// Updates the CurrentIndex so that it will point to a valid element.
3193   ///
3194   /// If the element of NodeList at CurrentIndex is valid, it does not
3195   /// change it. If there are no more valid elements, it updates CurrentIndex
3196   /// to point to the end of the NodeList.
3197   void SkipRemovedElements(size_t &CurrentIndex) {
3198     while (CurrentIndex < NodeList.size()) {
3199       auto it = NodeMap.find(NodeList[CurrentIndex]);
3200       // If the element has been deleted and added again later, NodeMap will
3201       // point to a different index, so CurrentIndex will still be invalid.
3202       if (it != NodeMap.end() && it->second == CurrentIndex)
3203         break;
3204       ++CurrentIndex;
3205     }
3206   }
3207 };
3208 
3209 PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
3210     : Set(Set), CurrentIndex(Start) {}
3211 
3212 PHINode * PhiNodeSetIterator::operator*() const {
3213   assert(CurrentIndex < Set->NodeList.size() &&
3214          "PhiNodeSet access out of range");
3215   return Set->NodeList[CurrentIndex];
3216 }
3217 
3218 PhiNodeSetIterator& PhiNodeSetIterator::operator++() {
3219   assert(CurrentIndex < Set->NodeList.size() &&
3220          "PhiNodeSet access out of range");
3221   ++CurrentIndex;
3222   Set->SkipRemovedElements(CurrentIndex);
3223   return *this;
3224 }
3225 
3226 bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
3227   return CurrentIndex == RHS.CurrentIndex;
3228 }
3229 
3230 bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
3231   return !((*this) == RHS);
3232 }
3233 
3234 /// Keep track of simplification of Phi nodes.
/// Accept the set of all phi nodes and erase a phi node from this set
/// if it is simplified.
3237 class SimplificationTracker {
3238   DenseMap<Value *, Value *> Storage;
3239   const SimplifyQuery &SQ;
3240   // Tracks newly created Phi nodes. The elements are iterated by insertion
3241   // order.
3242   PhiNodeSet AllPhiNodes;
3243   // Tracks newly created Select nodes.
3244   SmallPtrSet<SelectInst *, 32> AllSelectNodes;
3245 
3246 public:
3247   SimplificationTracker(const SimplifyQuery &sq)
3248       : SQ(sq) {}
3249 
3250   Value *Get(Value *V) {
3251     do {
3252       auto SV = Storage.find(V);
3253       if (SV == Storage.end())
3254         return V;
3255       V = SV->second;
3256     } while (true);
3257   }
3258 
3259   Value *Simplify(Value *Val) {
3260     SmallVector<Value *, 32> WorkList;
3261     SmallPtrSet<Value *, 32> Visited;
3262     WorkList.push_back(Val);
3263     while (!WorkList.empty()) {
3264       auto *P = WorkList.pop_back_val();
3265       if (!Visited.insert(P).second)
3266         continue;
3267       if (auto *PI = dyn_cast<Instruction>(P))
        if (Value *V = SimplifyInstruction(PI, SQ)) {
3269           for (auto *U : PI->users())
3270             WorkList.push_back(cast<Value>(U));
3271           Put(PI, V);
3272           PI->replaceAllUsesWith(V);
3273           if (auto *PHI = dyn_cast<PHINode>(PI))
3274             AllPhiNodes.erase(PHI);
3275           if (auto *Select = dyn_cast<SelectInst>(PI))
3276             AllSelectNodes.erase(Select);
3277           PI->eraseFromParent();
3278         }
3279     }
3280     return Get(Val);
3281   }
3282 
3283   void Put(Value *From, Value *To) {
3284     Storage.insert({ From, To });
3285   }
3286 
3287   void ReplacePhi(PHINode *From, PHINode *To) {
3288     Value* OldReplacement = Get(From);
3289     while (OldReplacement != From) {
3290       From = To;
3291       To = dyn_cast<PHINode>(OldReplacement);
3292       OldReplacement = Get(From);
3293     }
3294     assert(To && Get(To) == To && "Replacement PHI node is already replaced.");
3295     Put(From, To);
3296     From->replaceAllUsesWith(To);
3297     AllPhiNodes.erase(From);
3298     From->eraseFromParent();
3299   }
3300 
3301   PhiNodeSet& newPhiNodes() { return AllPhiNodes; }
3302 
3303   void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }
3304 
3305   void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }
3306 
3307   unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }
3308 
3309   unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }
3310 
3311   void destroyNewNodes(Type *CommonType) {
3312     // For safe erasing, replace the uses with dummy value first.
3313     auto *Dummy = UndefValue::get(CommonType);
3314     for (auto *I : AllPhiNodes) {
3315       I->replaceAllUsesWith(Dummy);
3316       I->eraseFromParent();
3317     }
3318     AllPhiNodes.clear();
3319     for (auto *I : AllSelectNodes) {
3320       I->replaceAllUsesWith(Dummy);
3321       I->eraseFromParent();
3322     }
3323     AllSelectNodes.clear();
3324   }
3325 };
3326 
3327 /// A helper class for combining addressing modes.
3328 class AddressingModeCombiner {
3329   typedef DenseMap<Value *, Value *> FoldAddrToValueMapping;
3330   typedef std::pair<PHINode *, PHINode *> PHIPair;
3331 
3332 private:
3333   /// The addressing modes we've collected.
3334   SmallVector<ExtAddrMode, 16> AddrModes;
3335 
3336   /// The field in which the AddrModes differ, when we have more than one.
3337   ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;
3338 
3339   /// Are the AddrModes that we have all just equal to their original values?
3340   bool AllAddrModesTrivial = true;
3341 
3342   /// Common Type for all different fields in addressing modes.
3343   Type *CommonType;
3344 
  /// SimplifyQuery for the SimplifyInstruction utility.
3346   const SimplifyQuery &SQ;
3347 
3348   /// Original Address.
3349   Value *Original;
3350 
3351 public:
3352   AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue)
3353       : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}
3354 
3355   /// Get the combined AddrMode
3356   const ExtAddrMode &getAddrMode() const {
3357     return AddrModes[0];
3358   }
3359 
3360   /// Add a new AddrMode if it's compatible with the AddrModes we already
3361   /// have.
3362   /// \return True iff we succeeded in doing so.
3363   bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
    // Take note of whether we have any non-trivial AddrModes: we need to
    // detect when all AddrModes are trivial, because then introducing a phi
    // or select would just duplicate what is already there.
3367     AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();
3368 
3369     // If this is the first addrmode then everything is fine.
3370     if (AddrModes.empty()) {
3371       AddrModes.emplace_back(NewAddrMode);
3372       return true;
3373     }
3374 
3375     // Figure out how different this is from the other address modes, which we
3376     // can do just by comparing against the first one given that we only care
3377     // about the cumulative difference.
3378     ExtAddrMode::FieldName ThisDifferentField =
3379       AddrModes[0].compare(NewAddrMode);
3380     if (DifferentField == ExtAddrMode::NoField)
3381       DifferentField = ThisDifferentField;
3382     else if (DifferentField != ThisDifferentField)
3383       DifferentField = ExtAddrMode::MultipleFields;
3384 
3385     // If NewAddrMode differs in more than one dimension we cannot handle it.
3386     bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;
3387 
3388     // If Scale Field is different then we reject.
3389     CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;
3390 
    // We must also reject the case where the base offset is different and the
    // scaled register is not null: we cannot handle it because the merge of
    // the different offsets would have to be used as the ScaleReg.
3394     CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||
3395                               !NewAddrMode.ScaledReg);
3396 
    // We must also reject the case where the GV is different and a BaseReg is
    // installed, because we want to use the base register as the merge of the
    // GV values.
3399     CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||
3400                               !NewAddrMode.HasBaseReg);
3401 
    // Even if NewAddrMode is the same we still need to collect it because the
    // original value is different, and later we will need all original values
    // as anchors when finding the common Phi node.
3405     if (CanHandle)
3406       AddrModes.emplace_back(NewAddrMode);
3407     else
3408       AddrModes.clear();
3409 
3410     return CanHandle;
3411   }
3412 
3413   /// Combine the addressing modes we've collected into a single
3414   /// addressing mode.
3415   /// \return True iff we successfully combined them or we only had one so
3416   /// didn't need to combine them anyway.
3417   bool combineAddrModes() {
3418     // If we have no AddrModes then they can't be combined.
3419     if (AddrModes.size() == 0)
3420       return false;
3421 
3422     // A single AddrMode can trivially be combined.
3423     if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
3424       return true;
3425 
3426     // If the AddrModes we collected are all just equal to the value they are
3427     // derived from then combining them wouldn't do anything useful.
3428     if (AllAddrModesTrivial)
3429       return false;
3430 
3431     if (!addrModeCombiningAllowed())
3432       return false;
3433 
    // Build a map from each original address value to the value of the field
    // in which the addressing modes differ (e.g. the base register).
    // Bail out if there is no common type.
3437     FoldAddrToValueMapping Map;
3438     if (!initializeMap(Map))
3439       return false;
3440 
3441     Value *CommonValue = findCommon(Map);
3442     if (CommonValue)
3443       AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);
3444     return CommonValue != nullptr;
3445   }
3446 
3447 private:
  /// Initialize Map with anchor values. For each address seen, we record the
  /// value of the field in which the addressing modes differ.
  /// At the same time we find a common type for the differing fields, which
  /// we will use to create new Phi/Select nodes. It is kept in the CommonType
  /// field. Return false if no common type is found.
3453   bool initializeMap(FoldAddrToValueMapping &Map) {
3454     // Keep track of keys where the value is null. We will need to replace it
3455     // with constant null when we know the common type.
3456     SmallVector<Value *, 2> NullValue;
3457     Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
3458     for (auto &AM : AddrModes) {
3459       Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
3460       if (DV) {
3461         auto *Type = DV->getType();
3462         if (CommonType && CommonType != Type)
3463           return false;
3464         CommonType = Type;
3465         Map[AM.OriginalValue] = DV;
3466       } else {
3467         NullValue.push_back(AM.OriginalValue);
3468       }
3469     }
3470     assert(CommonType && "At least one non-null value must be!");
3471     for (auto *V : NullValue)
3472       Map[V] = Constant::getNullValue(CommonType);
3473     return true;
3474   }
3475 
  /// We have a mapping from a value A to another value B, where B was a field
  /// in the addressing mode represented by A. We also have an original value
  /// C representing the address we start from. Traversing from C through phis
  /// and selects, we ended up with the A's in the map. This utility function
  /// tries to find a value V which is a field in addressing mode C such that,
  /// traversing through phi nodes and selects, we end up at the corresponding
  /// values B in the map. The utility creates new Phi/Select nodes if needed.
3483   // The simple example looks as follows:
3484   // BB1:
3485   //   p1 = b1 + 40
3486   //   br cond BB2, BB3
3487   // BB2:
3488   //   p2 = b2 + 40
3489   //   br BB3
3490   // BB3:
3491   //   p = phi [p1, BB1], [p2, BB2]
3492   //   v = load p
3493   // Map is
3494   //   p1 -> b1
3495   //   p2 -> b2
3496   // Request is
3497   //   p -> ?
3498   // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
3499   Value *findCommon(FoldAddrToValueMapping &Map) {
    // Tracks the simplification of newly created phi nodes. The reason we use
    // this mapping is that we will add newly created Phi nodes to AddrToBase.
    // Simplification of Phi nodes is recursive, so some Phi node may
    // be simplified after we add it to AddrToBase. In practice this
    // simplification is possible only if the original phis/selects have not
    // been simplified yet.
    // Using this mapping we can find the current value in AddrToBase.
3507     SimplificationTracker ST(SQ);
3508 
3509     // First step, DFS to create PHI nodes for all intermediate blocks.
3510     // Also fill traverse order for the second step.
3511     SmallVector<Value *, 32> TraverseOrder;
3512     InsertPlaceholders(Map, TraverseOrder, ST);
3513 
3514     // Second Step, fill new nodes by merged values and simplify if possible.
3515     FillPlaceholders(Map, TraverseOrder, ST);
3516 
3517     if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) {
3518       ST.destroyNewNodes(CommonType);
3519       return nullptr;
3520     }
3521 
    // Now we'd like to match the new Phi nodes to existing ones.
3523     unsigned PhiNotMatchedCount = 0;
3524     if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
3525       ST.destroyNewNodes(CommonType);
3526       return nullptr;
3527     }
3528 
3529     auto *Result = ST.Get(Map.find(Original)->second);
3530     if (Result) {
3531       NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
3532       NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
3533     }
3534     return Result;
3535   }
3536 
3537   /// Try to match PHI node to Candidate.
3538   /// Matcher tracks the matched Phi nodes.
3539   bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
3540                     SmallSetVector<PHIPair, 8> &Matcher,
3541                     PhiNodeSet &PhiNodesToMatch) {
3542     SmallVector<PHIPair, 8> WorkList;
3543     Matcher.insert({ PHI, Candidate });
3544     SmallSet<PHINode *, 8> MatchedPHIs;
3545     MatchedPHIs.insert(PHI);
3546     WorkList.push_back({ PHI, Candidate });
3547     SmallSet<PHIPair, 8> Visited;
3548     while (!WorkList.empty()) {
3549       auto Item = WorkList.pop_back_val();
3550       if (!Visited.insert(Item).second)
3551         continue;
      // We iterate over all incoming values of the Phi to compare them.
      // If the values are different, both of them are Phis, the first one is
      // a Phi we added (subject to matching), and both are in the same basic
      // block, then we can match our pair if the values match. So we state
      // that these values match and add them to the work list to verify that.
3557       for (auto B : Item.first->blocks()) {
3558         Value *FirstValue = Item.first->getIncomingValueForBlock(B);
3559         Value *SecondValue = Item.second->getIncomingValueForBlock(B);
3560         if (FirstValue == SecondValue)
3561           continue;
3562 
3563         PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
3564         PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);
3565 
        // If one of them is not a Phi, or the first one is not a Phi node
        // from the set we'd like to match, or the Phi nodes are from
        // different basic blocks, then we will not be able to match.
3570         if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
3571             FirstPhi->getParent() != SecondPhi->getParent())
3572           return false;
3573 
3574         // If we already matched them then continue.
3575         if (Matcher.count({ FirstPhi, SecondPhi }))
3576           continue;
        // So the values are different and do not match. So we need them to
3578         // match. (But we register no more than one match per PHI node, so that
3579         // we won't later try to replace them twice.)
3580         if (MatchedPHIs.insert(FirstPhi).second)
3581           Matcher.insert({ FirstPhi, SecondPhi });
        // But we must check it.
3583         WorkList.push_back({ FirstPhi, SecondPhi });
3584       }
3585     }
3586     return true;
3587   }
3588 
3589   /// For the given set of PHI nodes (in the SimplificationTracker) try
3590   /// to find their equivalents.
3591   /// Returns false if this matching fails and creation of new Phi is disabled.
3592   bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
3593                    unsigned &PhiNotMatchedCount) {
3594     // Matched and PhiNodesToMatch iterate their elements in a deterministic
3595     // order, so the replacements (ReplacePhi) are also done in a deterministic
3596     // order.
3597     SmallSetVector<PHIPair, 8> Matched;
3598     SmallPtrSet<PHINode *, 8> WillNotMatch;
3599     PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
3600     while (PhiNodesToMatch.size()) {
3601       PHINode *PHI = *PhiNodesToMatch.begin();
3602 
      // Add ourselves: if no Phi node in the basic block matches, we do not
      // match.
3604       WillNotMatch.clear();
3605       WillNotMatch.insert(PHI);
3606 
      // Traverse all Phis until we find an equivalent one or fail to do so.
3608       bool IsMatched = false;
3609       for (auto &P : PHI->getParent()->phis()) {
3610         if (&P == PHI)
3611           continue;
3612         if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
3613           break;
        // If it does not match, collect all Phi nodes from the matcher.
        // If we end up with no match, then all these Phi nodes will not match
        // later either.
3617         for (auto M : Matched)
3618           WillNotMatch.insert(M.first);
3619         Matched.clear();
3620       }
3621       if (IsMatched) {
3622         // Replace all matched values and erase them.
3623         for (auto MV : Matched)
3624           ST.ReplacePhi(MV.first, MV.second);
3625         Matched.clear();
3626         continue;
3627       }
3628       // If we are not allowed to create new nodes then bail out.
3629       if (!AllowNewPhiNodes)
3630         return false;
3631       // Just remove all seen values in matcher. They will not match anything.
3632       PhiNotMatchedCount += WillNotMatch.size();
3633       for (auto *P : WillNotMatch)
3634         PhiNodesToMatch.erase(P);
3635     }
3636     return true;
3637   }

  /// Fill the placeholders with values from predecessors and simplify them.
3639   void FillPlaceholders(FoldAddrToValueMapping &Map,
3640                         SmallVectorImpl<Value *> &TraverseOrder,
3641                         SimplificationTracker &ST) {
3642     while (!TraverseOrder.empty()) {
3643       Value *Current = TraverseOrder.pop_back_val();
3644       assert(Map.find(Current) != Map.end() && "No node to fill!!!");
3645       Value *V = Map[Current];
3646 
3647       if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
3648         // CurrentValue also must be Select.
3649         auto *CurrentSelect = cast<SelectInst>(Current);
3650         auto *TrueValue = CurrentSelect->getTrueValue();
3651         assert(Map.find(TrueValue) != Map.end() && "No True Value!");
3652         Select->setTrueValue(ST.Get(Map[TrueValue]));
3653         auto *FalseValue = CurrentSelect->getFalseValue();
3654         assert(Map.find(FalseValue) != Map.end() && "No False Value!");
3655         Select->setFalseValue(ST.Get(Map[FalseValue]));
3656       } else {
3657         // Must be a Phi node then.
3658         auto *PHI = cast<PHINode>(V);
3659         // Fill the Phi node with values from predecessors.
3660         for (auto *B : predecessors(PHI->getParent())) {
3661           Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);
3662           assert(Map.find(PV) != Map.end() && "No predecessor Value!");
3663           PHI->addIncoming(ST.Get(Map[PV]), B);
3664         }
3665       }
3666       Map[Current] = ST.Simplify(V);
3667     }
3668   }
3669 
  /// Starting from the original value, recursively iterates over the def-use
  /// chain up to the known ending values represented in the map. For each
  /// traversed phi/select, inserts a placeholder Phi or Select.
  /// Reports all newly created Phi/Select nodes by adding them to the set.
  /// Also reports the order in which the values have been traversed.
3675   void InsertPlaceholders(FoldAddrToValueMapping &Map,
3676                           SmallVectorImpl<Value *> &TraverseOrder,
3677                           SimplificationTracker &ST) {
3678     SmallVector<Value *, 32> Worklist;
3679     assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
3680            "Address must be a Phi or Select node");
3681     auto *Dummy = UndefValue::get(CommonType);
3682     Worklist.push_back(Original);
3683     while (!Worklist.empty()) {
3684       Value *Current = Worklist.pop_back_val();
      // If it is already visited or it is an ending value, then skip it.
3686       if (Map.find(Current) != Map.end())
3687         continue;
3688       TraverseOrder.push_back(Current);
3689 
3690       // CurrentValue must be a Phi node or select. All others must be covered
3691       // by anchors.
3692       if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
3693         // Is it OK to get metadata from OrigSelect?!
3694         // Create a Select placeholder with dummy value.
3695         SelectInst *Select = SelectInst::Create(
3696             CurrentSelect->getCondition(), Dummy, Dummy,
3697             CurrentSelect->getName(), CurrentSelect, CurrentSelect);
3698         Map[Current] = Select;
3699         ST.insertNewSelect(Select);
3700         // We are interested in True and False values.
3701         Worklist.push_back(CurrentSelect->getTrueValue());
3702         Worklist.push_back(CurrentSelect->getFalseValue());
3703       } else {
3704         // It must be a Phi node then.
3705         PHINode *CurrentPhi = cast<PHINode>(Current);
3706         unsigned PredCount = CurrentPhi->getNumIncomingValues();
3707         PHINode *PHI =
3708             PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi);
3709         Map[Current] = PHI;
3710         ST.insertNewPhi(PHI);
3711         for (Value *P : CurrentPhi->incoming_values())
3712           Worklist.push_back(P);
3713       }
3714     }
3715   }
3716 
3717   bool addrModeCombiningAllowed() {
3718     if (DisableComplexAddrModes)
3719       return false;
3720     switch (DifferentField) {
3721     default:
3722       return false;
3723     case ExtAddrMode::BaseRegField:
3724       return AddrSinkCombineBaseReg;
3725     case ExtAddrMode::BaseGVField:
3726       return AddrSinkCombineBaseGV;
3727     case ExtAddrMode::BaseOffsField:
3728       return AddrSinkCombineBaseOffs;
3729     case ExtAddrMode::ScaledRegField:
3730       return AddrSinkCombineScaledReg;
3731     }
3732   }
3733 };
3734 } // end anonymous namespace
3735 
3736 /// Try adding ScaleReg*Scale to the current addressing mode.
3737 /// Return true and update AddrMode if this addr mode is legal for the target,
3738 /// false if not.
3739 bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
3740                                              unsigned Depth) {
3741   // If Scale is 1, then this is the same as adding ScaleReg to the addressing
3742   // mode.  Just process that directly.
3743   if (Scale == 1)
3744     return matchAddr(ScaleReg, Depth);
3745 
3746   // If the scale is 0, it takes nothing to add this.
3747   if (Scale == 0)
3748     return true;
3749 
3750   // If we already have a scale of this value, we can add to it, otherwise, we
3751   // need an available scale field.
3752   if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
3753     return false;
3754 
3755   ExtAddrMode TestAddrMode = AddrMode;
3756 
3757   // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
3758   // [A+B + A*7] -> [B+A*8].
3759   TestAddrMode.Scale += Scale;
3760   TestAddrMode.ScaledReg = ScaleReg;
3761 
3762   // If the new address isn't legal, bail out.
3763   if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
3764     return false;
3765 
3766   // It was legal, so commit it.
3767   AddrMode = TestAddrMode;
3768 
3769   // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
3770   // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
3771   // X*Scale + C*Scale to addr mode.
3772   ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
3773   if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
3774       match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
3775       CI->getValue().isSignedIntN(64)) {
3776     TestAddrMode.InBounds = false;
3777     TestAddrMode.ScaledReg = AddLHS;
3778     TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
3779 
3780     // If this addressing mode is legal, commit it and remember that we folded
3781     // this instruction.
3782     if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
3783       AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
3784       AddrMode = TestAddrMode;
3785       return true;
3786     }
3787   }
3788 
3789   // Otherwise, not (x+c)*scale, just return what we have.
3790   return true;
3791 }
3792 
3793 /// This is a little filter, which returns true if an addressing computation
3794 /// involving I might be folded into a load/store accessing it.
3795 /// This doesn't need to be perfect, but needs to accept at least
/// the set of instructions that matchOperationAddr can.
3797 static bool MightBeFoldableInst(Instruction *I) {
3798   switch (I->getOpcode()) {
3799   case Instruction::BitCast:
3800   case Instruction::AddrSpaceCast:
3801     // Don't touch identity bitcasts.
3802     if (I->getType() == I->getOperand(0)->getType())
3803       return false;
3804     return I->getType()->isIntOrPtrTy();
3805   case Instruction::PtrToInt:
3806     // PtrToInt is always a noop, as we know that the int type is pointer sized.
3807     return true;
3808   case Instruction::IntToPtr:
3809     // We know the input is intptr_t, so this is foldable.
3810     return true;
3811   case Instruction::Add:
3812     return true;
3813   case Instruction::Mul:
3814   case Instruction::Shl:
3815     // Can only handle X*C and X << C.
3816     return isa<ConstantInt>(I->getOperand(1));
3817   case Instruction::GetElementPtr:
3818     return true;
3819   default:
3820     return false;
3821   }
3822 }
3823 
3824 /// Check whether or not \p Val is a legal instruction for \p TLI.
3825 /// \note \p Val is assumed to be the product of some type promotion.
3826 /// Therefore if \p Val has an undefined state in \p TLI, this is assumed
3827 /// to be legal, as the non-promoted value would have had the same state.
3828 static bool isPromotedInstructionLegal(const TargetLowering &TLI,
3829                                        const DataLayout &DL, Value *Val) {
3830   Instruction *PromotedInst = dyn_cast<Instruction>(Val);
3831   if (!PromotedInst)
3832     return false;
3833   int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
3834   // If the ISDOpcode is undefined, it was undefined before the promotion.
3835   if (!ISDOpcode)
3836     return true;
3837   // Otherwise, check if the promoted instruction is legal or not.
3838   return TLI.isOperationLegalOrCustom(
3839       ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
3840 }
3841 
3842 namespace {
3843 
/// Helper class to perform type promotion.
3845 class TypePromotionHelper {
3846   /// Utility function to add a promoted instruction \p ExtOpnd to
3847   /// \p PromotedInsts and record the type of extension we have seen.
3848   static void addPromotedInst(InstrToOrigTy &PromotedInsts,
3849                               Instruction *ExtOpnd,
3850                               bool IsSExt) {
3851     ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
3852     InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd);
3853     if (It != PromotedInsts.end()) {
3854       // If the new extension is same as original, the information in
3855       // PromotedInsts[ExtOpnd] is still correct.
3856       if (It->second.getInt() == ExtTy)
3857         return;
3858 
3859       // Now the new extension is different from old extension, we make
3860       // the type information invalid by setting extension type to
3861       // BothExtension.
3862       ExtTy = BothExtension;
3863     }
3864     PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy);
3865   }
3866 
3867   /// Utility function to query the original type of instruction \p Opnd
3868   /// with a matched extension type. If the extension doesn't match, we
3869   /// cannot use the information we had on the original type.
3870   /// BothExtension doesn't match any extension type.
3871   static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,
3872                                  Instruction *Opnd,
3873                                  bool IsSExt) {
3874     ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
3875     InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
3876     if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)
3877       return It->second.getPointer();
3878     return nullptr;
3879   }
3880 
3881   /// Utility function to check whether or not a sign or zero extension
3882   /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
3883   /// either using the operands of \p Inst or promoting \p Inst.
3884   /// The type of the extension is defined by \p IsSExt.
3885   /// In other words, check if:
3886   /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
3887   /// #1 Promotion applies:
3888   /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
3889   /// #2 Operand reuses:
3890   /// ext opnd1 to ConsideredExtType.
3891   /// \p PromotedInsts maps the instructions to their type before promotion.
3892   static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
3893                             const InstrToOrigTy &PromotedInsts, bool IsSExt);
3894 
3895   /// Utility function to determine if \p OpIdx should be promoted when
3896   /// promoting \p Inst.
3897   static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
3898     return !(isa<SelectInst>(Inst) && OpIdx == 0);
3899   }
3900 
3901   /// Utility function to promote the operand of \p Ext when this
3902   /// operand is a promotable trunc or sext or zext.
3903   /// \p PromotedInsts maps the instructions to their type before promotion.
3904   /// \p CreatedInstsCost[out] contains the cost of all instructions
3905   /// created to promote the operand of Ext.
3906   /// Newly added extensions are inserted in \p Exts.
3907   /// Newly added truncates are inserted in \p Truncs.
3908   /// Should never be called directly.
3909   /// \return The promoted value which is used instead of Ext.
3910   static Value *promoteOperandForTruncAndAnyExt(
3911       Instruction *Ext, TypePromotionTransaction &TPT,
3912       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3913       SmallVectorImpl<Instruction *> *Exts,
3914       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
3915 
3916   /// Utility function to promote the operand of \p Ext when this
3917   /// operand is promotable and is not a supported trunc or sext.
3918   /// \p PromotedInsts maps the instructions to their type before promotion.
3919   /// \p CreatedInstsCost[out] contains the cost of all the instructions
3920   /// created to promote the operand of Ext.
3921   /// Newly added extensions are inserted in \p Exts.
3922   /// Newly added truncates are inserted in \p Truncs.
3923   /// Should never be called directly.
3924   /// \return The promoted value which is used instead of Ext.
3925   static Value *promoteOperandForOther(Instruction *Ext,
3926                                        TypePromotionTransaction &TPT,
3927                                        InstrToOrigTy &PromotedInsts,
3928                                        unsigned &CreatedInstsCost,
3929                                        SmallVectorImpl<Instruction *> *Exts,
3930                                        SmallVectorImpl<Instruction *> *Truncs,
3931                                        const TargetLowering &TLI, bool IsSExt);
3932 
3933   /// \see promoteOperandForOther.
3934   static Value *signExtendOperandForOther(
3935       Instruction *Ext, TypePromotionTransaction &TPT,
3936       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3937       SmallVectorImpl<Instruction *> *Exts,
3938       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3939     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3940                                   Exts, Truncs, TLI, true);
3941   }
3942 
3943   /// \see promoteOperandForOther.
3944   static Value *zeroExtendOperandForOther(
3945       Instruction *Ext, TypePromotionTransaction &TPT,
3946       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3947       SmallVectorImpl<Instruction *> *Exts,
3948       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3949     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3950                                   Exts, Truncs, TLI, false);
3951   }
3952 
3953 public:
3954   /// Type for the utility function that promotes the operand of Ext.
3955   using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
3956                             InstrToOrigTy &PromotedInsts,
3957                             unsigned &CreatedInstsCost,
3958                             SmallVectorImpl<Instruction *> *Exts,
3959                             SmallVectorImpl<Instruction *> *Truncs,
3960                             const TargetLowering &TLI);
3961 
3962   /// Given a sign/zero extend instruction \p Ext, return the appropriate
3963   /// action to promote the operand of \p Ext instead of using Ext.
3964   /// \return NULL if no promotable action is possible with the current
3965   /// sign extension.
3966   /// \p InsertedInsts keeps track of all the instructions inserted by the
3967   /// other CodeGenPrepare optimizations. This information is important
3968   /// because we do not want to promote these instructions as CodeGenPrepare
3969   /// will reinsert them later. Thus creating an infinite loop: create/remove.
3970   /// \p PromotedInsts maps the instructions to their type before promotion.
3971   static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
3972                           const TargetLowering &TLI,
3973                           const InstrToOrigTy &PromotedInsts);
3974 };
3975 
3976 } // end anonymous namespace
3977 
3978 bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
3979                                         Type *ConsideredExtType,
3980                                         const InstrToOrigTy &PromotedInsts,
3981                                         bool IsSExt) {
3982   // The promotion helper does not know how to deal with vector types yet.
3983   // To be able to fix that, we would need to fix the places where we
3984   // statically extend, e.g., constants and such.
3985   if (Inst->getType()->isVectorTy())
3986     return false;
3987 
3988   // We can always get through zext.
3989   if (isa<ZExtInst>(Inst))
3990     return true;
3991 
3992   // sext(sext) is ok too.
3993   if (IsSExt && isa<SExtInst>(Inst))
3994     return true;
3995 
  // We can get through a binary operator if it is legal. In other words, the
  // binary operator must have a nuw or nsw flag.
3998   const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
3999   if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) &&
4000       ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
4001        (IsSExt && BinOp->hasNoSignedWrap())))
4002     return true;
4003 
  // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
  // ext(or(opnd, cst))  --> or(ext(opnd), ext(cst))
4005   if ((Inst->getOpcode() == Instruction::And ||
4006        Inst->getOpcode() == Instruction::Or))
4007     return true;
4008 
4009   // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
4010   if (Inst->getOpcode() == Instruction::Xor) {
4011     const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
4012     // Make sure it is not a NOT.
4013     if (Cst && !Cst->getValue().isAllOnesValue())
4014       return true;
4015   }
4016 
  // zext(lshr(opnd, cst)) --> lshr(zext(opnd), zext(cst))
  // It may change a poisoned value into a regular value, like
  //     zext i32 (lshr i8 %val, 12)  -->  lshr i32 (zext i8 %val), 12
  //          poisoned value                    regular value
  // It should be OK since undef covers any valid value.
4022   if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
4023     return true;
4024 
4025   // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
4026   // It may change a poisoned value into a regular value, like
4027   //     zext i32 (shl i8 %val, 12)  -->  shl i32 (zext i8 %val), 12
4028   //          poisoned value                    regular value
4029   // This is OK because refining a poison value to a regular value is allowed.
4030   if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
4031     const auto *ExtInst = cast<const Instruction>(*Inst->user_begin());
4032     if (ExtInst->hasOneUse()) {
4033       const auto *AndInst = dyn_cast<const Instruction>(*ExtInst->user_begin());
4034       if (AndInst && AndInst->getOpcode() == Instruction::And) {
4035         const auto *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
4036         if (Cst &&
4037             Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
4038           return true;
4039       }
4040     }
4041   }
4042 
4043   // Check if we can do the following simplification.
4044   // ext(trunc(opnd)) --> ext(opnd)
4045   if (!isa<TruncInst>(Inst))
4046     return false;
4047 
4048   Value *OpndVal = Inst->getOperand(0);
4049   // Check if we can use this operand in the extension.
4050   // If the type is larger than the result type of the extension, we cannot.
4051   if (!OpndVal->getType()->isIntegerTy() ||
4052       OpndVal->getType()->getIntegerBitWidth() >
4053           ConsideredExtType->getIntegerBitWidth())
4054     return false;
4055 
4056   // If the operand of the truncate is not an instruction, we will not have
4057   // any information on the dropped bits.
4058   // (Actually we could for constants, but it is not worth the extra logic.)
4059   Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
4060   if (!Opnd)
4061     return false;
4062 
4063   // Check if the source of the truncate is narrow enough.
4064   // I.e., check that the trunc just drops extended bits of the same kind as
4065   // the extension.
4066   // #1 get the type of the operand and check the kind of the extended bits.
4067   const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);
4068   if (OpndType)
4069     ;
4070   else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
4071     OpndType = Opnd->getOperand(0)->getType();
4072   else
4073     return false;
4074 
4075   // #2 check that the truncate just drops extended bits.
4076   return Inst->getType()->getIntegerBitWidth() >=
4077          OpndType->getIntegerBitWidth();
4078 }
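     // For illustration, a sketch of the trunc case above (hypothetical IR, not
     // something this pass necessarily produces): with IsSExt == true,
     //   %w = sext i16 %v to i64
     //   %t = trunc i64 %w to i32
     //   %e = sext i32 %t to i64
     // canGetThrough accepts %t because the type %w was originally extended from
     // (i16) is no wider than the trunc result (i32), so the trunc only drops
     // bits that the outer sext would recreate.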
4079 
4080 TypePromotionHelper::Action TypePromotionHelper::getAction(
4081     Instruction *Ext, const SetOfInstrs &InsertedInsts,
4082     const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
4083   assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4084          "Unexpected instruction type");
4085   Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
4086   Type *ExtTy = Ext->getType();
4087   bool IsSExt = isa<SExtInst>(Ext);
4088   // If the operand of the extension is not an instruction, we cannot
4089   // get through.
4090   // If it is one, check whether we can get through it.
4091   if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
4092     return nullptr;
4093 
4094   // Do not promote if the operand has been added by codegenprepare.
4095   // Otherwise, it means we are undoing an optimization that is likely to be
4096   // redone, thus causing potential infinite loop.
4097   if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
4098     return nullptr;
4099 
4100   // SExt, ZExt or Trunc instructions.
4101   // Return the related handler.
4102   if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
4103       isa<ZExtInst>(ExtOpnd))
4104     return promoteOperandForTruncAndAnyExt;
4105 
4106   // Regular instruction.
4107   // Abort early if we will have to insert non-free instructions.
4108   if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
4109     return nullptr;
4110   return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
4111 }
4112 
4113 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
4114     Instruction *SExt, TypePromotionTransaction &TPT,
4115     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4116     SmallVectorImpl<Instruction *> *Exts,
4117     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
4118   // By construction, the operand of SExt is an instruction. Otherwise we cannot
4119   // get through it and this method should not be called.
4120   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
4121   Value *ExtVal = SExt;
4122   bool HasMergedNonFreeExt = false;
4123   if (isa<ZExtInst>(SExtOpnd)) {
4124     // Replace s|zext(zext(opnd))
4125     // => zext(opnd).
4126     HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
4127     Value *ZExt =
4128         TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
4129     TPT.replaceAllUsesWith(SExt, ZExt);
4130     TPT.eraseInstruction(SExt);
4131     ExtVal = ZExt;
4132   } else {
4133     // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
4134     // => z|sext(opnd).
4135     TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
4136   }
4137   CreatedInstsCost = 0;
4138 
4139   // Remove dead code.
4140   if (SExtOpnd->use_empty())
4141     TPT.eraseInstruction(SExtOpnd);
4142 
4143   // Check if the extension is still needed.
4144   Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
4145   if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
4146     if (ExtInst) {
4147       if (Exts)
4148         Exts->push_back(ExtInst);
4149       CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
4150     }
4151     return ExtVal;
4152   }
4153 
4154   // At this point we have: ext ty opnd to ty.
4155   // Reassign the uses of ExtInst to the opnd and remove ExtInst.
4156   Value *NextVal = ExtInst->getOperand(0);
4157   TPT.eraseInstruction(ExtInst, NextVal);
4158   return NextVal;
4159 }
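     // As a hedged example of the merge performed above (illustrative IR only):
     //   %z = zext i8 %v to i16
     //   %s = sext i16 %z to i32
     // is rewritten into a single
     //   %z2 = zext i8 %v to i32
     // with the uses of %s redirected to %z2 and the now-dead pair erased.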
4160 
4161 Value *TypePromotionHelper::promoteOperandForOther(
4162     Instruction *Ext, TypePromotionTransaction &TPT,
4163     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4164     SmallVectorImpl<Instruction *> *Exts,
4165     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
4166     bool IsSExt) {
4167   // By construction, the operand of Ext is an instruction. Otherwise we cannot
4168   // get through it and this method should not be called.
4169   Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
4170   CreatedInstsCost = 0;
4171   if (!ExtOpnd->hasOneUse()) {
4172     // ExtOpnd will be promoted.
4173     // All its uses, but Ext, will need to use a truncated value of the
4174     // promoted version.
4175     // Create the truncate now.
4176     Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
4177     if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
4178       // Insert it just after the definition.
4179       ITrunc->moveAfter(ExtOpnd);
4180       if (Truncs)
4181         Truncs->push_back(ITrunc);
4182     }
4183 
4184     TPT.replaceAllUsesWith(ExtOpnd, Trunc);
4185     // Restore the operand of Ext (which has been replaced by the previous call
4186     // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
4187     TPT.setOperand(Ext, 0, ExtOpnd);
4188   }
4189 
4190   // Get through the Instruction:
4191   // 1. Update its type.
4192   // 2. Replace the uses of Ext by Inst.
4193   // 3. Extend each operand that needs to be extended.
4194 
4195   // Remember the original type of the instruction before promotion.
4196   // This is useful to know that the high bits are sign extended bits.
4197   addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);
4198   // Step #1.
4199   TPT.mutateType(ExtOpnd, Ext->getType());
4200   // Step #2.
4201   TPT.replaceAllUsesWith(Ext, ExtOpnd);
4202   // Step #3.
4203   Instruction *ExtForOpnd = Ext;
4204 
4205   LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
4206   for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
4207        ++OpIdx) {
4208     LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
4209     if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
4210         !shouldExtOperand(ExtOpnd, OpIdx)) {
4211       LLVM_DEBUG(dbgs() << "No need to propagate\n");
4212       continue;
4213     }
4214     // Check if we can statically extend the operand.
4215     Value *Opnd = ExtOpnd->getOperand(OpIdx);
4216     if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
4217       LLVM_DEBUG(dbgs() << "Statically extend\n");
4218       unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
4219       APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
4220                             : Cst->getValue().zext(BitWidth);
4221       TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
4222       continue;
4223     }
4224     // UndefValues are typed, so we have to statically extend them.
4225     if (isa<UndefValue>(Opnd)) {
4226       LLVM_DEBUG(dbgs() << "Statically extend\n");
4227       TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
4228       continue;
4229     }
4230 
4231     // Otherwise we have to explicitly extend the operand.
4232     // Check if Ext has already been reused to extend another operand.
4233     if (!ExtForOpnd) {
4234       // If so, create a new one.
4235       LLVM_DEBUG(dbgs() << "More operands to ext\n");
4236       Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
4237         : TPT.createZExt(Ext, Opnd, Ext->getType());
4238       if (!isa<Instruction>(ValForExtOpnd)) {
4239         TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
4240         continue;
4241       }
4242       ExtForOpnd = cast<Instruction>(ValForExtOpnd);
4243     }
4244     if (Exts)
4245       Exts->push_back(ExtForOpnd);
4246     TPT.setOperand(ExtForOpnd, 0, Opnd);
4247 
4248     // Move the sign extension before the insertion point.
4249     TPT.moveBefore(ExtForOpnd, ExtOpnd);
4250     TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
4251     CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
4252     // If more extensions are required, new instructions will have to be created.
4253     ExtForOpnd = nullptr;
4254   }
4255   if (ExtForOpnd == Ext) {
4256     LLVM_DEBUG(dbgs() << "Extension is useless now\n");
4257     TPT.eraseInstruction(Ext);
4258   }
4259   return ExtOpnd;
4260 }
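     // A hedged sketch of the overall effect (illustrative IR; assumes the add
     // carries the nsw flag required by canGetThrough and has a single use):
     //   %a = add nsw i32 %x, 1
     //   %s = sext i32 %a to i64
     // becomes
     //   %x.ext = sext i32 %x to i64
     //   %a     = add nsw i64 %x.ext, 1   ; %a mutated to i64, constant widened
     // with every use of %s now using the promoted %a.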
4261 
4262 /// Check whether or not promoting an instruction to a wider type is profitable.
4263 /// \p NewCost gives the cost of extension instructions created by the
4264 /// promotion.
4265 /// \p OldCost gives the cost of extension instructions before the promotion
4266 /// plus the number of instructions that have been
4267 /// matched in the addressing mode with the promotion.
4268 /// \p PromotedOperand is the value that has been promoted.
4269 /// \return True if the promotion is profitable, false otherwise.
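     /// As a worked example of the comparison (numbers are hypothetical): if the
     /// promotion created one non-free extension (NewCost = 1) while the original
     /// extension was non-free and one extra instruction became foldable into the
     /// addressing mode (OldCost = 1 + 1 = 2), the promotion is deemed profitable.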
4270 bool AddressingModeMatcher::isPromotionProfitable(
4271     unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
4272   LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
4273                     << '\n');
4274   // The cost of the new extensions is greater than the cost of the
4275   // old extension plus what we folded.
4276   // This is not profitable.
4277   if (NewCost > OldCost)
4278     return false;
4279   if (NewCost < OldCost)
4280     return true;
4281   // The promotion is neutral but it may help folding the sign extension in
4282   // loads for instance.
4283   // Check that we did not create an illegal instruction.
4284   return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
4285 }
4286 
4287 /// Given an instruction or constant expr, see if we can fold the operation
4288 /// into the addressing mode. If so, update the addressing mode and return
4289 /// true, otherwise return false without modifying AddrMode.
4290 /// If \p MovedAway is not NULL, it contains the information of whether or
4291 /// not AddrInst has to be folded into the addressing mode on success.
4292 /// If \p MovedAway == true, \p AddrInst will not be part of the addressing
4293 /// mode because it has been moved away.
4294 /// Thus AddrInst must not be added to the matched instructions.
4295 /// This state can happen when AddrInst is a sext, since it may be moved away.
4296 /// Therefore, AddrInst may not be valid when MovedAway is true and it must
4297 /// not be referenced anymore.
4298 bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
4299                                                unsigned Depth,
4300                                                bool *MovedAway) {
4301   // Avoid exponential behavior on extremely deep expression trees.
4302   if (Depth >= 5) return false;
4303 
4304   // By default, all matched instructions stay in place.
4305   if (MovedAway)
4306     *MovedAway = false;
4307 
4308   switch (Opcode) {
4309   case Instruction::PtrToInt:
4310     // PtrToInt is always a noop, as we know that the int type is pointer sized.
4311     return matchAddr(AddrInst->getOperand(0), Depth);
4312   case Instruction::IntToPtr: {
4313     auto AS = AddrInst->getType()->getPointerAddressSpace();
4314     auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
4315     // This inttoptr is a no-op if the integer type is pointer sized.
4316     if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
4317       return matchAddr(AddrInst->getOperand(0), Depth);
4318     return false;
4319   }
4320   case Instruction::BitCast:
4321     // BitCast is always a noop, and we can handle it as long as it is
4322     // int->int or pointer->pointer (we don't want int<->fp or something).
4323     if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&
4324         // Don't touch identity bitcasts.  These were probably put here by LSR,
4325         // and we don't want to mess around with them.  Assume it knows what it
4326         // is doing.
4327         AddrInst->getOperand(0)->getType() != AddrInst->getType())
4328       return matchAddr(AddrInst->getOperand(0), Depth);
4329     return false;
4330   case Instruction::AddrSpaceCast: {
4331     unsigned SrcAS
4332       = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
4333     unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
4334     if (TLI.getTargetMachine().isNoopAddrSpaceCast(SrcAS, DestAS))
4335       return matchAddr(AddrInst->getOperand(0), Depth);
4336     return false;
4337   }
4338   case Instruction::Add: {
4339     // Check to see if we can merge in the RHS then the LHS.  If so, we win.
4340     ExtAddrMode BackupAddrMode = AddrMode;
4341     unsigned OldSize = AddrModeInsts.size();
4342     // Start a transaction at this point.
4343     // The LHS may match but not the RHS.
4344     // Therefore, we need a higher level restoration point to undo partially
4345     // matched operation.
4346     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4347         TPT.getRestorationPoint();
4348 
4349     AddrMode.InBounds = false;
4350     if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
4351         matchAddr(AddrInst->getOperand(0), Depth+1))
4352       return true;
4353 
4354     // Restore the old addr mode info.
4355     AddrMode = BackupAddrMode;
4356     AddrModeInsts.resize(OldSize);
4357     TPT.rollback(LastKnownGood);
4358 
4359     // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
4360     if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
4361         matchAddr(AddrInst->getOperand(1), Depth+1))
4362       return true;
4363 
4364     // Otherwise we definitely can't merge the ADD in.
4365     AddrMode = BackupAddrMode;
4366     AddrModeInsts.resize(OldSize);
4367     TPT.rollback(LastKnownGood);
4368     break;
4369   }
4370   //case Instruction::Or:
4371   // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
4372   //break;
4373   case Instruction::Mul:
4374   case Instruction::Shl: {
4375     // Can only handle X*C and X << C.
4376     AddrMode.InBounds = false;
4377     ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
4378     if (!RHS || RHS->getBitWidth() > 64)
4379       return false;
4380     int64_t Scale = RHS->getSExtValue();
4381     if (Opcode == Instruction::Shl)
4382       Scale = 1LL << Scale;
4383 
4384     return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
4385   }
4386   case Instruction::GetElementPtr: {
4387     // Scan the GEP.  We check whether it contains constant offsets and at most
4388     // one variable offset.
4389     int VariableOperand = -1;
4390     unsigned VariableScale = 0;
4391 
4392     int64_t ConstantOffset = 0;
4393     gep_type_iterator GTI = gep_type_begin(AddrInst);
4394     for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
4395       if (StructType *STy = GTI.getStructTypeOrNull()) {
4396         const StructLayout *SL = DL.getStructLayout(STy);
4397         unsigned Idx =
4398           cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
4399         ConstantOffset += SL->getElementOffset(Idx);
4400       } else {
4401         TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType());
4402         if (TS.isNonZero()) {
4403           // The optimisations below currently only work for fixed offsets.
4404           if (TS.isScalable())
4405             return false;
4406           int64_t TypeSize = TS.getFixedSize();
4407           if (ConstantInt *CI =
4408                   dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
4409             const APInt &CVal = CI->getValue();
4410             if (CVal.getMinSignedBits() <= 64) {
4411               ConstantOffset += CVal.getSExtValue() * TypeSize;
4412               continue;
4413             }
4414           }
4415           // We only allow one variable index at the moment.
4416           if (VariableOperand != -1)
4417             return false;
4418 
4419           // Remember the variable index.
4420           VariableOperand = i;
4421           VariableScale = TypeSize;
4422         }
4423       }
4424     }
4425 
4426     // A common case is for the GEP to only do a constant offset.  In this case,
4427     // just add it to the disp field and check validity.
4428     if (VariableOperand == -1) {
4429       AddrMode.BaseOffs += ConstantOffset;
4430       if (ConstantOffset == 0 ||
4431           TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
4432         // Check to see if we can fold the base pointer in too.
4433         if (matchAddr(AddrInst->getOperand(0), Depth+1)) {
4434           if (!cast<GEPOperator>(AddrInst)->isInBounds())
4435             AddrMode.InBounds = false;
4436           return true;
4437         }
4438       } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
4439                  TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
4440                  ConstantOffset > 0) {
4441         // Record GEPs with non-zero offsets as candidates for splitting in the
4442         // event that the offset cannot fit into the r+i addressing mode.
4443         // Simple and common case that only one GEP is used in calculating the
4444         // address for the memory access.
4445         Value *Base = AddrInst->getOperand(0);
4446         auto *BaseI = dyn_cast<Instruction>(Base);
4447         auto *GEP = cast<GetElementPtrInst>(AddrInst);
4448         if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
4449             (BaseI && !isa<CastInst>(BaseI) &&
4450              !isa<GetElementPtrInst>(BaseI))) {
4451           // Make sure the parent block allows inserting non-PHI instructions
4452           // before the terminator.
4453           BasicBlock *Parent =
4454               BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
4455           if (!Parent->getTerminator()->isEHPad())
4456             LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
4457         }
4458       }
4459       AddrMode.BaseOffs -= ConstantOffset;
4460       return false;
4461     }
4462 
4463     // Save the valid addressing mode in case we can't match.
4464     ExtAddrMode BackupAddrMode = AddrMode;
4465     unsigned OldSize = AddrModeInsts.size();
4466 
4467     // See if the scale and offset amount is valid for this target.
4468     AddrMode.BaseOffs += ConstantOffset;
4469     if (!cast<GEPOperator>(AddrInst)->isInBounds())
4470       AddrMode.InBounds = false;
4471 
4472     // Match the base operand of the GEP.
4473     if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
4474       // If it couldn't be matched, just stuff the value in a register.
4475       if (AddrMode.HasBaseReg) {
4476         AddrMode = BackupAddrMode;
4477         AddrModeInsts.resize(OldSize);
4478         return false;
4479       }
4480       AddrMode.HasBaseReg = true;
4481       AddrMode.BaseReg = AddrInst->getOperand(0);
4482     }
4483 
4484     // Match the remaining variable portion of the GEP.
4485     if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
4486                           Depth)) {
4487       // If it couldn't be matched, try stuffing the base into a register
4488       // instead of matching it, and retrying the match of the scale.
4489       AddrMode = BackupAddrMode;
4490       AddrModeInsts.resize(OldSize);
4491       if (AddrMode.HasBaseReg)
4492         return false;
4493       AddrMode.HasBaseReg = true;
4494       AddrMode.BaseReg = AddrInst->getOperand(0);
4495       AddrMode.BaseOffs += ConstantOffset;
4496       if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
4497                             VariableScale, Depth)) {
4498         // If even that didn't work, bail.
4499         AddrMode = BackupAddrMode;
4500         AddrModeInsts.resize(OldSize);
4501         return false;
4502       }
4503     }
4504 
4505     return true;
4506   }
4507   case Instruction::SExt:
4508   case Instruction::ZExt: {
4509     Instruction *Ext = dyn_cast<Instruction>(AddrInst);
4510     if (!Ext)
4511       return false;
4512 
4513     // Try to move this ext out of the way of the addressing mode.
4514     // Ask for a method for doing so.
4515     TypePromotionHelper::Action TPH =
4516         TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
4517     if (!TPH)
4518       return false;
4519 
4520     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4521         TPT.getRestorationPoint();
4522     unsigned CreatedInstsCost = 0;
4523     unsigned ExtCost = !TLI.isExtFree(Ext);
4524     Value *PromotedOperand =
4525         TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
4526     // SExt has been moved away.
4527     // Thus either it will be rematched later in the recursive calls or it is
4528     // gone. Anyway, we must not fold it into the addressing mode at this point.
4529     // E.g.,
4530     // op = add opnd, 1
4531     // idx = ext op
4532     // addr = gep base, idx
4533     // is now:
4534     // promotedOpnd = ext opnd            <- no match here
4535     // op = promoted_add promotedOpnd, 1  <- match (later in recursive calls)
4536     // addr = gep base, op                <- match
4537     if (MovedAway)
4538       *MovedAway = true;
4539 
4540     assert(PromotedOperand &&
4541            "TypePromotionHelper should have filtered out those cases");
4542 
4543     ExtAddrMode BackupAddrMode = AddrMode;
4544     unsigned OldSize = AddrModeInsts.size();
4545 
4546     if (!matchAddr(PromotedOperand, Depth) ||
4547         // The total of the new cost is equal to the cost of the created
4548         // instructions.
4549         // The total of the old cost is equal to the cost of the extension plus
4550         // what we have saved in the addressing mode.
4551         !isPromotionProfitable(CreatedInstsCost,
4552                                ExtCost + (AddrModeInsts.size() - OldSize),
4553                                PromotedOperand)) {
4554       AddrMode = BackupAddrMode;
4555       AddrModeInsts.resize(OldSize);
4556       LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
4557       TPT.rollback(LastKnownGood);
4558       return false;
4559     }
4560     return true;
4561   }
4562   }
4563   return false;
4564 }
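     // As an illustration of a successful match (hypothetical IR; the actual
     // outcome depends on TLI.isLegalAddressingMode for the target):
     //   %p = getelementptr inbounds i32, i32* %base, i64 %i
     // can be folded as { BaseReg = %base, ScaledReg = %i, Scale = 4,
     // BaseOffs = 0 } when the target accepts a reg + 4*reg addressing mode.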
4565 
4566 /// If we can, try to add the value of 'Addr' into the current addressing mode.
4567 /// If Addr can't be added to AddrMode this returns false and leaves AddrMode
4568 /// unmodified. This assumes that Addr is either a pointer type or intptr_t
4569 /// for the target.
4570 ///
4571 bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
4572   // Start a transaction at this point that we will rollback if the matching
4573   // fails.
4574   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4575       TPT.getRestorationPoint();
4576   if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
4577     if (CI->getValue().isSignedIntN(64)) {
4578       // Fold in immediates if legal for the target.
4579       AddrMode.BaseOffs += CI->getSExtValue();
4580       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4581         return true;
4582       AddrMode.BaseOffs -= CI->getSExtValue();
4583     }
4584   } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
4585     // If this is a global variable, try to fold it into the addressing mode.
4586     if (!AddrMode.BaseGV) {
4587       AddrMode.BaseGV = GV;
4588       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4589         return true;
4590       AddrMode.BaseGV = nullptr;
4591     }
4592   } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
4593     ExtAddrMode BackupAddrMode = AddrMode;
4594     unsigned OldSize = AddrModeInsts.size();
4595 
4596     // Check to see if it is possible to fold this operation.
4597     bool MovedAway = false;
4598     if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
4599       // This instruction may have been moved away. If so, there is nothing
4600       // to check here.
4601       if (MovedAway)
4602         return true;
4603       // Okay, it's possible to fold this.  Check to see if it is actually
4604       // *profitable* to do so.  We use a simple cost model to avoid increasing
4605       // register pressure too much.
4606       if (I->hasOneUse() ||
4607           isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
4608         AddrModeInsts.push_back(I);
4609         return true;
4610       }
4611 
4612       // It isn't profitable to do this, roll back.
4613       //cerr << "NOT FOLDING: " << *I;
4614       AddrMode = BackupAddrMode;
4615       AddrModeInsts.resize(OldSize);
4616       TPT.rollback(LastKnownGood);
4617     }
4618   } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
4619     if (matchOperationAddr(CE, CE->getOpcode(), Depth))
4620       return true;
4621     TPT.rollback(LastKnownGood);
4622   } else if (isa<ConstantPointerNull>(Addr)) {
4623     // Null pointer gets folded without affecting the addressing mode.
4624     return true;
4625   }
4626 
4627   // Worst case, the target should support [reg] addressing modes. :)
4628   if (!AddrMode.HasBaseReg) {
4629     AddrMode.HasBaseReg = true;
4630     AddrMode.BaseReg = Addr;
4631     // Still check for legality in case the target supports [imm] but not [i+r].
4632     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4633       return true;
4634     AddrMode.HasBaseReg = false;
4635     AddrMode.BaseReg = nullptr;
4636   }
4637 
4638   // If the base register is already taken, see if we can do [r+r].
4639   if (AddrMode.Scale == 0) {
4640     AddrMode.Scale = 1;
4641     AddrMode.ScaledReg = Addr;
4642     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4643       return true;
4644     AddrMode.Scale = 0;
4645     AddrMode.ScaledReg = nullptr;
4646   }
4647   // Couldn't match.
4648   TPT.rollback(LastKnownGood);
4649   return false;
4650 }
4651 
4652 /// Check to see if all uses of OpVal by the specified inline asm call are due
4653 /// to memory operands. If so, return true, otherwise return false.
4654 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
4655                                     const TargetLowering &TLI,
4656                                     const TargetRegisterInfo &TRI) {
4657   const Function *F = CI->getFunction();
4658   TargetLowering::AsmOperandInfoVector TargetConstraints =
4659       TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI);
4660 
4661   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
4662     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
4663 
4664     // Compute the constraint code and ConstraintType to use.
4665     TLI.ComputeConstraintToUse(OpInfo, SDValue());
4666 
4667     // If this asm operand is our Value*, and if it isn't an indirect memory
4668     // operand, we can't fold it!
4669     if (OpInfo.CallOperandVal == OpVal &&
4670         (OpInfo.ConstraintType != TargetLowering::C_Memory ||
4671          !OpInfo.isIndirect))
4672       return false;
4673   }
4674 
4675   return true;
4676 }
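     // For instance (a hedged sketch; the asm string and constraint are
     // hypothetical):
     //   call void asm "movl $$42, $0", "=*m"(i32* %addr)
     // uses %addr only as an indirect memory operand, so it can be folded;
     // if %addr also fed a register constraint such as "r", we would return false.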
4677 
4678 // Max number of memory uses to look at before aborting the search to conserve
4679 // compile time.
4680 static constexpr int MaxMemoryUsesToScan = 20;
4681 
4682 /// Recursively walk all the uses of I until we find a memory use.
4683 /// If we find an obviously non-foldable instruction, return true.
4684 /// Add the ultimately found memory instructions to MemoryUses.
4685 static bool FindAllMemoryUses(
4686     Instruction *I,
4687     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
4688     SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
4689     const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
4690     BlockFrequencyInfo *BFI, int SeenInsts = 0) {
4691   // If we already considered this instruction, we're done.
4692   if (!ConsideredInsts.insert(I).second)
4693     return false;
4694 
4695   // If this is an obviously unfoldable instruction, bail out.
4696   if (!MightBeFoldableInst(I))
4697     return true;
4698 
4699   // Loop over all the uses, recursively processing them.
4700   for (Use &U : I->uses()) {
4701     // Conservatively return true if we're seeing a large number or a deep chain
4702     // of users. This avoids excessive compilation times in pathological cases.
4703     if (SeenInsts++ >= MaxMemoryUsesToScan)
4704       return true;
4705 
4706     Instruction *UserI = cast<Instruction>(U.getUser());
4707     if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
4708       MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
4709       continue;
4710     }
4711 
4712     if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
4713       unsigned opNo = U.getOperandNo();
4714       if (opNo != StoreInst::getPointerOperandIndex())
4715         return true; // Storing addr, not into addr.
4716       MemoryUses.push_back(std::make_pair(SI, opNo));
4717       continue;
4718     }
4719 
4720     if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
4721       unsigned opNo = U.getOperandNo();
4722       if (opNo != AtomicRMWInst::getPointerOperandIndex())
4723         return true; // Storing addr, not into addr.
4724       MemoryUses.push_back(std::make_pair(RMW, opNo));
4725       continue;
4726     }
4727 
4728     if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
4729       unsigned opNo = U.getOperandNo();
4730       if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
4731         return true; // Storing addr, not into addr.
4732       MemoryUses.push_back(std::make_pair(CmpX, opNo));
4733       continue;
4734     }
4735 
4736     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
4737       if (CI->hasFnAttr(Attribute::Cold)) {
4738         // If this is a cold call, we can sink the addressing calculation into
4739         // the cold path.  See optimizeCallInst
4740         bool OptForSize = OptSize ||
4741           llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
4742         if (!OptForSize)
4743           continue;
4744       }
4745 
4746       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand());
4747       if (!IA) return true;
4748 
4749       // If this is a memory operand, we're cool, otherwise bail out.
4750       if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
4751         return true;
4752       continue;
4753     }
4754 
4755     if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
4756                           PSI, BFI, SeenInsts))
4757       return true;
4758   }
4759 
4760   return false;
4761 }
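     // E.g. (hedged illustration): a use like "store i64 %addrint, i64* %q",
     // where the address computation is the *stored value* rather than the
     // store's pointer operand, makes the chain non-foldable and we return true.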
4762 
4763 /// Return true if Val is already known to be live at the use site that we're
4764 /// folding it into. If so, there is no cost to include it in the addressing
4765 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
4766 /// instruction already.
4767 bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
4768                                                    Value *KnownLive2) {
4769   // If Val is either of the known-live values, we know it is live!
4770   if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
4771     return true;
4772 
4773   // All values other than instructions and arguments (e.g. constants) are live.
4774   if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
4775 
4776   // If Val is a constant-sized alloca in the entry block, it is live. This is
4777   // true because it is just a reference to the stack/frame pointer, which is
4778   // live for the whole function.
4779   if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
4780     if (AI->isStaticAlloca())
4781       return true;
4782 
4783   // Check to see if this value is already used in the memory instruction's
4784   // block.  If so, it's already live into the block at the very least, so we
4785   // can reasonably fold it.
4786   return Val->isUsedInBasicBlock(MemoryInst->getParent());
4787 }
4788 
4789 /// It is possible for the addressing mode of the machine to fold the specified
4790 /// instruction into a load or store that ultimately uses it.
4791 /// However, the specified instruction has multiple uses.
4792 /// Given this, it may actually increase register pressure to fold it
4793 /// into the load. For example, consider this code:
4794 ///
4795 ///     X = ...
4796 ///     Y = X+1
4797 ///     use(Y)   -> nonload/store
4798 ///     Z = Y+1
4799 ///     load Z
4800 ///
4801 /// In this case, Y has multiple uses, and can be folded into the load of Z
4802 /// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
4803 /// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
4804 /// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
4805 /// number of computations either.
4806 ///
4807 /// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
4808 /// X was live across 'load Z' for other reasons, we actually *would* want to
4809 /// fold the addressing mode in the Z case.  This would make Y die earlier.
4810 bool AddressingModeMatcher::
4811 isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
4812                                      ExtAddrMode &AMAfter) {
4813   if (IgnoreProfitability) return true;
4814 
4815   // AMBefore is the addressing mode before this instruction was folded into it,
4816   // and AMAfter is the addressing mode after the instruction was folded.  Get
4817   // the set of registers referenced by AMAfter and subtract out those
4818   // referenced by AMBefore: this is the set of values which folding in this
4819   // address extends the lifetime of.
4820   //
4821   // Note that there are only two potential values being referenced here,
4822   // BaseReg and ScaleReg (global addresses are always available, as are any
4823   // folded immediates).
4824   Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
4825 
4826   // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
4827   // lifetime wasn't extended by adding this instruction.
4828   if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4829     BaseReg = nullptr;
4830   if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4831     ScaledReg = nullptr;
4832 
4833   // If folding this instruction (and its subexprs) didn't extend any live
4834   // ranges, we're ok with it.
4835   if (!BaseReg && !ScaledReg)
4836     return true;
4837 
4838   // If all uses of this instruction can have the address mode sunk into them,
4839   // we can remove the addressing mode and effectively trade one live register
4840   // for another (at worst.)  In this context, folding an addressing mode into
4841   // the use is just a particularly nice way of sinking it.
4842   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
4843   SmallPtrSet<Instruction*, 16> ConsideredInsts;
4844   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
4845                         PSI, BFI))
4846     return false;  // Has a non-memory, non-foldable use!
4847 
4848   // Now that we know that all uses of this instruction are part of a chain of
4849   // computation involving only operations that could theoretically be folded
4850   // into a memory use, loop over each of these memory operation uses and see
4851   // if they could  *actually* fold the instruction.  The assumption is that
4852   // addressing modes are cheap and that duplicating the computation involved
4853   // many times is worthwhile, even on a fastpath. For sinking candidates
4854   // (i.e. cold call sites), this serves as a way to prevent excessive code
4855   // growth since most architectures have some reasonable small and fast way to
4856   // compute an effective address.  (e.g., LEA on x86)
4857   SmallVector<Instruction*, 32> MatchedAddrModeInsts;
4858   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
4859     Instruction *User = MemoryUses[i].first;
4860     unsigned OpNo = MemoryUses[i].second;
4861 
4862     // Get the access type of this use.  If the use isn't a pointer, we don't
4863     // know what it accesses.
4864     Value *Address = User->getOperand(OpNo);
4865     PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
4866     if (!AddrTy)
4867       return false;
4868     Type *AddressAccessTy = AddrTy->getElementType();
4869     unsigned AS = AddrTy->getAddressSpace();
4870 
4871     // Do a match against the root of this address, ignoring profitability. This
4872     // will tell us if the addressing mode for the memory operation will
4873     // *actually* cover the shared instruction.
4874     ExtAddrMode Result;
4875     std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
4876                                                                       0);
4877     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4878         TPT.getRestorationPoint();
4879     AddressingModeMatcher Matcher(
4880         MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
4881         InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
4882     Matcher.IgnoreProfitability = true;
4883     bool Success = Matcher.matchAddr(Address, 0);
4884     (void)Success; assert(Success && "Couldn't select *anything*?");
4885 
4886     // The match was only made to check profitability; the changes are not
4887     // part of the original matcher. Therefore, they should be dropped,
4888     // otherwise the original matcher will not present the right state.
4889     TPT.rollback(LastKnownGood);
4890 
4891     // If the match didn't cover I, then it won't be shared by it.
4892     if (!is_contained(MatchedAddrModeInsts, I))
4893       return false;
4894 
4895     MatchedAddrModeInsts.clear();
4896   }
4897 
4898   return true;
4899 }
4900 
4901 /// Return true if the specified values are defined in a
4902 /// different basic block than BB.
4903 static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
4904   if (Instruction *I = dyn_cast<Instruction>(V))
4905     return I->getParent() != BB;
4906   return false;
4907 }
4908 
4909 /// Sink addressing mode computation immediately before MemoryInst if doing so
4910 /// can be done without increasing register pressure.  The need for the
4911 /// register pressure constraint means this can end up being an all or nothing
4912 /// decision for all uses of the same addressing computation.
4913 ///
4914 /// Load and Store Instructions often have addressing modes that can do
4915 /// significant amounts of computation. As such, instruction selection will try
4916 /// to get the load or store to do as much computation as possible for the
4917 /// program. The problem is that isel can only see within a single block. As
4918 /// such, we sink as much legal addressing mode work into the block as possible.
4919 ///
4920 /// This method is used to optimize both load/store and inline asms with memory
4921 /// operands.  It's also used to sink addressing computations feeding into cold
4922 /// call sites into their (cold) basic block.
4923 ///
4924 /// The motivation for handling sinking into cold blocks is that doing so can
4925 /// both enable other address mode sinking (by satisfying the register pressure
4926 /// constraint above), and reduce register pressure globally (by removing the
4927 /// addressing mode computation from the fast path entirely).
4928 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
4929                                         Type *AccessTy, unsigned AddrSpace) {
4930   Value *Repl = Addr;
4931 
4932   // Try to collapse single-value PHI nodes.  This is necessary to undo
4933   // unprofitable PRE transformations.
4934   SmallVector<Value*, 8> worklist;
4935   SmallPtrSet<Value*, 16> Visited;
4936   worklist.push_back(Addr);
4937 
4938   // Use a worklist to iteratively look through PHI and select nodes, and
4939   // ensure that the addressing modes obtained from the non-PHI/select roots of
4940   // the graph are compatible.
4941   bool PhiOrSelectSeen = false;
4942   SmallVector<Instruction*, 16> AddrModeInsts;
4943   const SimplifyQuery SQ(*DL, TLInfo);
4944   AddressingModeCombiner AddrModes(SQ, Addr);
4945   TypePromotionTransaction TPT(RemovedInsts);
4946   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4947       TPT.getRestorationPoint();
4948   while (!worklist.empty()) {
4949     Value *V = worklist.back();
4950     worklist.pop_back();
4951 
4952     // We allow traversing cyclic Phi nodes.
4953     // In case of success after this loop, we ensure that traversing through
4954     // Phi nodes ends up with all cases computing an address of the form
4955     //    BaseGV + Base + Scale * Index + Offset
4956     // where Scale and Offset are constants and BaseGV, Base and Index
4957     // are exactly the same Values in all cases.
4958     // It means that BaseGV, Scale and Offset dominate our memory instruction
4959     // and have the same values as they had in the address computation
4960     // represented as the Phi, so we can safely sink it to the memory instruction.
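         // E.g. (a hedged illustration), for
         //    %p = phi i32* [ %g1, %bb1 ], [ %g2, %bb2 ]
         // both %g1 and %g2 must decompose to the same BaseGV/Base/Index with
         // identical constant Scale and Offset for the combined mode to be used.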
4961     if (!Visited.insert(V).second)
4962       continue;
4963 
4964     // For a PHI node, push all of its incoming values.
4965     if (PHINode *P = dyn_cast<PHINode>(V)) {
4966       for (Value *IncValue : P->incoming_values())
4967         worklist.push_back(IncValue);
4968       PhiOrSelectSeen = true;
4969       continue;
4970     }
4971     // Similar for select.
4972     if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
4973       worklist.push_back(SI->getFalseValue());
4974       worklist.push_back(SI->getTrueValue());
4975       PhiOrSelectSeen = true;
4976       continue;
4977     }
4978 
4979     // For non-PHIs, determine the addressing mode being computed.  Note that
4980     // the result may differ depending on what other uses our candidate
4981     // addressing instructions might have.
4982     AddrModeInsts.clear();
4983     std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
4984                                                                       0);
4985     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
4986         V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
4987         InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
4988         BFI.get());
4989 
4990     GetElementPtrInst *GEP = LargeOffsetGEP.first;
4991     if (GEP && !NewGEPBases.count(GEP)) {
4992       // If splitting the underlying data structure can reduce the offset of a
4993       // GEP, collect the GEP.  Skip the GEPs that are the new bases of
4994       // previously split data structures.
4995       LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
4996       if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end())
4997         LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size();
4998     }
4999 
5000     NewAddrMode.OriginalValue = V;
5001     if (!AddrModes.addNewAddrMode(NewAddrMode))
5002       break;
5003   }
5004 
5005   // Try to combine the AddrModes we've collected. If we couldn't collect any,
5006   // or we have multiple but either couldn't combine them or combining them
5007   // wouldn't do anything useful, bail out now.
5008   if (!AddrModes.combineAddrModes()) {
5009     TPT.rollback(LastKnownGood);
5010     return false;
5011   }
5012   bool Modified = TPT.commit();
5013 
5014   // Get the combined AddrMode (or the only AddrMode, if we only had one).
5015   ExtAddrMode AddrMode = AddrModes.getAddrMode();
5016 
5017   // If all the instructions matched are already in this BB, don't do anything.
5018   // If we saw a Phi node then it is definitely not local, and if we saw a select
5019   // then we want to push the address calculation past it even if it's already
5020   // in this BB.
5021   if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
5022         return IsNonLocalValue(V, MemoryInst->getParent());
5023       })) {
5024     LLVM_DEBUG(dbgs() << "CGP: Found      local addrmode: " << AddrMode
5025                       << "\n");
5026     return Modified;
5027   }
5028 
5029   // Insert this computation right after this user.  Since our caller is
5030   // scanning from the top of the BB to the bottom, reuses of the expr are
5031   // guaranteed to happen later.
5032   IRBuilder<> Builder(MemoryInst);
5033 
5034   // Now that we have determined the addressing expression we want to use and
5035   // know that we have to sink it into this block, check to see if we have already
5036   // done this for some other load/store instr in this block.  If so, reuse
5037   // the computation.  Before attempting reuse, check if the address is valid
5038   // as it may have been erased.
5039 
5040   WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];
5041 
5042   Value * SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
5043   if (SunkAddr) {
5044     LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
5045                       << " for " << *MemoryInst << "\n");
5046     if (SunkAddr->getType() != Addr->getType())
5047       SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
5048   } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
5049                                    SubtargetInfo->addrSinkUsingGEPs())) {
5050     // By default, we use the GEP-based method when AA is used later. This
5051     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
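         // Schematically, the address sunk below looks like (a hedged
         // illustration; the value names are invented):
         //   %sunkaddr = getelementptr i8, i8* %base.i8, i64 %index
         //   %cast     = bitcast i8* %sunkaddr to <original pointer type>
         // rather than a ptrtoint/add/inttoptr chain.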
5052     LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
5053                       << " for " << *MemoryInst << "\n");
5054     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
5055     Value *ResultPtr = nullptr, *ResultIndex = nullptr;
5056 
5057     // First, find the pointer.
5058     if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
5059       ResultPtr = AddrMode.BaseReg;
5060       AddrMode.BaseReg = nullptr;
5061     }
5062 
5063     if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
5064       // We can't add more than one pointer together, nor can we scale a
5065       // pointer (both of which seem meaningless).
5066       if (ResultPtr || AddrMode.Scale != 1)
5067         return Modified;
5068 
5069       ResultPtr = AddrMode.ScaledReg;
5070       AddrMode.Scale = 0;
5071     }
5072 
5073     // It is only safe to sign extend the BaseReg if we know that the math
5074     // required to create it did not overflow before we extend it. Since
5075     // the original IR value was tossed in favor of a constant back when
5076     // the AddrMode was created we need to bail out gracefully if widths
5077     // do not match instead of extending it.
5078     //
5079     // (See below for code to add the scale.)
5080     if (AddrMode.Scale) {
5081       Type *ScaledRegTy = AddrMode.ScaledReg->getType();
5082       if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
5083           cast<IntegerType>(ScaledRegTy)->getBitWidth())
5084         return Modified;
5085     }
5086 
5087     if (AddrMode.BaseGV) {
5088       if (ResultPtr)
5089         return Modified;
5090 
5091       ResultPtr = AddrMode.BaseGV;
5092     }
5093 
5094     // If the real base value actually came from an inttoptr, then the matcher
5095     // will look through it and provide only the integer value. In that case,
5096     // use it here.
5097     if (!DL->isNonIntegralPointerType(Addr->getType())) {
5098       if (!ResultPtr && AddrMode.BaseReg) {
5099         ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
5100                                            "sunkaddr");
5101         AddrMode.BaseReg = nullptr;
5102       } else if (!ResultPtr && AddrMode.Scale == 1) {
5103         ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
5104                                            "sunkaddr");
5105         AddrMode.Scale = 0;
5106       }
5107     }
5108 
5109     if (!ResultPtr &&
5110         !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
5111       SunkAddr = Constant::getNullValue(Addr->getType());
5112     } else if (!ResultPtr) {
5113       return Modified;
5114     } else {
5115       Type *I8PtrTy =
5116           Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
5117       Type *I8Ty = Builder.getInt8Ty();
5118 
5119       // Start with the base register. Do this first so that subsequent address
5120       // matching finds it last, which will prevent it from trying to match it
5121       // as the scaled value in case it happens to be a mul. That would be
5122       // problematic if we've sunk a different mul for the scale, because then
5123       // we'd end up sinking both muls.
5124       if (AddrMode.BaseReg) {
5125         Value *V = AddrMode.BaseReg;
5126         if (V->getType() != IntPtrTy)
5127           V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
5128 
5129         ResultIndex = V;
5130       }
5131 
5132       // Add the scale value.
5133       if (AddrMode.Scale) {
5134         Value *V = AddrMode.ScaledReg;
5135         if (V->getType() == IntPtrTy) {
5136           // done.
5137         } else {
5138           assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
5139                  cast<IntegerType>(V->getType())->getBitWidth() &&
5140                  "We can't transform if ScaledReg is too narrow");
5141           V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
5142         }
5143 
5144         if (AddrMode.Scale != 1)
5145           V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
5146                                 "sunkaddr");
5147         if (ResultIndex)
5148           ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
5149         else
5150           ResultIndex = V;
5151       }
5152 
5153       // Add in the Base Offset if present.
5154       if (AddrMode.BaseOffs) {
5155         Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
5156         if (ResultIndex) {
5157           // We need to add this separately from the scale above to help with
5158           // SDAG consecutive load/store merging.
5159           if (ResultPtr->getType() != I8PtrTy)
5160             ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
5161           ResultPtr =
5162               AddrMode.InBounds
5163                   ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
5164                                               "sunkaddr")
5165                   : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
5166         }
5167 
5168         ResultIndex = V;
5169       }
5170 
5171       if (!ResultIndex) {
5172         SunkAddr = ResultPtr;
5173       } else {
5174         if (ResultPtr->getType() != I8PtrTy)
5175           ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
5176         SunkAddr =
5177             AddrMode.InBounds
5178                 ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
5179                                             "sunkaddr")
5180                 : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
5181       }
5182 
5183       if (SunkAddr->getType() != Addr->getType())
5184         SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
5185     }
5186   } else {
5187     // We'd require a ptrtoint/inttoptr down the line, which we can't do for
5188     // non-integral pointers, so in that case bail out now.
5189     Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
5190     Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
5191     PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
5192     PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
5193     if (DL->isNonIntegralPointerType(Addr->getType()) ||
5194         (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
5195         (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
5196         (AddrMode.BaseGV &&
5197          DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
5198       return Modified;
5199 
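         // In this integer-based fallback, the sunk address is built
         // schematically as (a hedged illustration; names are invented):
         //   %b = ptrtoint i32* %base to i64
         //   %sum = add i64 %b, <scaled index and/or offset>
         //   %sunkaddr = inttoptr i64 %sum to i32*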
5200     LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
5201                       << " for " << *MemoryInst << "\n");
5202     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
5203     Value *Result = nullptr;
5204 
5205     // Start with the base register. Do this first so that subsequent address
5206     // matching finds it last, which will prevent it from trying to match it
5207     // as the scaled value in case it happens to be a mul. That would be
5208     // problematic if we've sunk a different mul for the scale, because then
5209     // we'd end up sinking both muls.
5210     if (AddrMode.BaseReg) {
5211       Value *V = AddrMode.BaseReg;
5212       if (V->getType()->isPointerTy())
5213         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
5214       if (V->getType() != IntPtrTy)
5215         V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
5216       Result = V;
5217     }
5218 
5219     // Add the scale value.
5220     if (AddrMode.Scale) {
5221       Value *V = AddrMode.ScaledReg;
5222       if (V->getType() == IntPtrTy) {
5223         // done.
5224       } else if (V->getType()->isPointerTy()) {
5225         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
5226       } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
5227                  cast<IntegerType>(V->getType())->getBitWidth()) {
5228         V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
5229       } else {
5230         // It is only safe to sign extend the BaseReg if we know that the math
5231         // required to create it did not overflow before we extend it. Since
5232         // the original IR value was tossed in favor of a constant back when
5233         // the AddrMode was created we need to bail out gracefully if widths
5234         // do not match instead of extending it.
5235         Instruction *I = dyn_cast_or_null<Instruction>(Result);
5236         if (I && (Result != AddrMode.BaseReg))
5237           I->eraseFromParent();
5238         return Modified;
5239       }
5240       if (AddrMode.Scale != 1)
5241         V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
5242                               "sunkaddr");
5243       if (Result)
5244         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5245       else
5246         Result = V;
5247     }
5248 
5249     // Add in the BaseGV if present.
5250     if (AddrMode.BaseGV) {
5251       Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
5252       if (Result)
5253         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5254       else
5255         Result = V;
5256     }
5257 
5258     // Add in the Base Offset if present.
5259     if (AddrMode.BaseOffs) {
5260       Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
5261       if (Result)
5262         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5263       else
5264         Result = V;
5265     }
5266 
5267     if (!Result)
5268       SunkAddr = Constant::getNullValue(Addr->getType());
5269     else
5270       SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
5271   }
5272 
5273   MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
  // Store the newly computed address into the cache. In case we reused a
  // value, this should be idempotent.
5276   SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);
5277 
5278   // If we have no uses, recursively delete the value and all dead instructions
5279   // using it.
5280   if (Repl->use_empty()) {
5281     resetIteratorIfInvalidatedWhileCalling(CurInstIterator->getParent(), [&]() {
5282       RecursivelyDeleteTriviallyDeadInstructions(
5283           Repl, TLInfo, nullptr,
5284           [&](Value *V) { removeAllAssertingVHReferences(V); });
5285     });
5286   }
5287   ++NumMemoryInsts;
5288   return true;
5289 }
5290 
5291 /// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
5292 /// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
5293 /// only handle a 2 operand GEP in the same basic block or a splat constant
5294 /// vector. The 2 operands to the GEP must have a scalar pointer and a vector
5295 /// index.
5296 ///
5297 /// If the existing GEP has a vector base pointer that is splat, we can look
5298 /// through the splat to find the scalar pointer. If we can't find a scalar
5299 /// pointer there's nothing we can do.
5300 ///
5301 /// If we have a GEP with more than 2 indices where the middle indices are all
5302 /// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
5303 ///
5304 /// If the final index isn't a vector or is a splat, we can emit a scalar GEP
5305 /// followed by a GEP with an all zeroes vector index. This will enable
/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
5307 /// zero index.
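///
/// For example (an illustrative sketch, not a verbatim test case), a gather
/// whose pointer operand is
/// \code
/// %gep = getelementptr [4 x i32], [4 x i32]* %base, i64 0, <4 x i64> %ind
/// \endcode
/// may be rewritten into a scalar GEP feeding a 2-operand vector GEP:
/// \code
/// %sgep = getelementptr [4 x i32], [4 x i32]* %base, i64 0, i64 0
/// %gep  = getelementptr i32, i32* %sgep, <4 x i64> %ind
/// \endcode
/// so that SelectionDAGBuilder can treat %sgep as the uniform base.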
5308 bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
5309                                                Value *Ptr) {
5310   // FIXME: Support scalable vectors.
5311   if (isa<ScalableVectorType>(Ptr->getType()))
5312     return false;
5313 
5314   Value *NewAddr;
5315 
5316   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
5317     // Don't optimize GEPs that don't have indices.
5318     if (!GEP->hasIndices())
5319       return false;
5320 
5321     // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
5322     // FIXME: We should support this by sinking the GEP.
5323     if (MemoryInst->getParent() != GEP->getParent())
5324       return false;
5325 
5326     SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end());
5327 
5328     bool RewriteGEP = false;
5329 
5330     if (Ops[0]->getType()->isVectorTy()) {
5331       Ops[0] = getSplatValue(Ops[0]);
5332       if (!Ops[0])
5333         return false;
5334       RewriteGEP = true;
5335     }
5336 
5337     unsigned FinalIndex = Ops.size() - 1;
5338 
    // Ensure all but the last index are 0.
5340     // FIXME: This isn't strictly required. All that's required is that they are
5341     // all scalars or splats.
5342     for (unsigned i = 1; i < FinalIndex; ++i) {
5343       auto *C = dyn_cast<Constant>(Ops[i]);
5344       if (!C)
5345         return false;
5346       if (isa<VectorType>(C->getType()))
5347         C = C->getSplatValue();
5348       auto *CI = dyn_cast_or_null<ConstantInt>(C);
5349       if (!CI || !CI->isZero())
5350         return false;
5351       // Scalarize the index if needed.
5352       Ops[i] = CI;
5353     }
5354 
5355     // Try to scalarize the final index.
5356     if (Ops[FinalIndex]->getType()->isVectorTy()) {
5357       if (Value *V = getSplatValue(Ops[FinalIndex])) {
5358         auto *C = dyn_cast<ConstantInt>(V);
5359         // Don't scalarize all zeros vector.
5360         if (!C || !C->isZero()) {
5361           Ops[FinalIndex] = V;
5362           RewriteGEP = true;
5363         }
5364       }
5365     }
5366 
    // If we made any changes or we have extra operands, we need to generate
    // new instructions.
5369     if (!RewriteGEP && Ops.size() == 2)
5370       return false;
5371 
5372     unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements();
5373 
5374     IRBuilder<> Builder(MemoryInst);
5375 
5376     Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
5377 
5378     // If the final index isn't a vector, emit a scalar GEP containing all ops
5379     // and a vector GEP with all zeroes final index.
5380     if (!Ops[FinalIndex]->getType()->isVectorTy()) {
5381       NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front());
5382       auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts);
5383       NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy));
5384     } else {
5385       Value *Base = Ops[0];
5386       Value *Index = Ops[FinalIndex];
5387 
5388       // Create a scalar GEP if there are more than 2 operands.
5389       if (Ops.size() != 2) {
5390         // Replace the last index with 0.
5391         Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy);
5392         Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front());
5393       }
5394 
5395       // Now create the GEP with scalar pointer and vector index.
5396       NewAddr = Builder.CreateGEP(Base, Index);
5397     }
5398   } else if (!isa<Constant>(Ptr)) {
    // Not a GEP, maybe it's a splat and we can create a GEP to enable
5400     // SelectionDAGBuilder to use it as a uniform base.
5401     Value *V = getSplatValue(Ptr);
5402     if (!V)
5403       return false;
5404 
5405     unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements();
5406 
5407     IRBuilder<> Builder(MemoryInst);
5408 
5409     // Emit a vector GEP with a scalar pointer and all 0s vector index.
5410     Type *ScalarIndexTy = DL->getIndexType(V->getType()->getScalarType());
5411     auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts);
5412     NewAddr = Builder.CreateGEP(V, Constant::getNullValue(IndexTy));
5413   } else {
    // Constant, SelectionDAGBuilder knows to check if it's a splat.
5415     return false;
5416   }
5417 
5418   MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
5419 
5420   // If we have no uses, recursively delete the value and all dead instructions
5421   // using it.
5422   if (Ptr->use_empty())
5423     RecursivelyDeleteTriviallyDeadInstructions(
5424         Ptr, TLInfo, nullptr,
5425         [&](Value *V) { removeAllAssertingVHReferences(V); });
5426 
5427   return true;
5428 }
5429 
/// If there are any memory operands, use OptimizeMemoryInst to sink their
/// address computation into the block when possible / profitable.
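///
/// For example (an illustrative sketch), for an inline asm with an indirect
/// memory operand such as
/// \code
/// call void asm sideeffect "str $1, $0", "=*m,r"(i32* %gep, i32 %v)
/// \endcode
/// the indirect "m" operand %gep is handed to optimizeMemoryInst so that its
/// address computation can be sunk next to the asm call.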
5432 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
5433   bool MadeChange = false;
5434 
5435   const TargetRegisterInfo *TRI =
5436       TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
5437   TargetLowering::AsmOperandInfoVector TargetConstraints =
5438       TLI->ParseConstraints(*DL, TRI, *CS);
5439   unsigned ArgNo = 0;
5440   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
5441     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
5442 
5443     // Compute the constraint code and ConstraintType to use.
5444     TLI->ComputeConstraintToUse(OpInfo, SDValue());
5445 
5446     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
5447         OpInfo.isIndirect) {
5448       Value *OpVal = CS->getArgOperand(ArgNo++);
5449       MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
5450     } else if (OpInfo.Type == InlineAsm::isInput)
5451       ArgNo++;
5452   }
5453 
5454   return MadeChange;
5455 }
5456 
5457 /// Check if all the uses of \p Val are equivalent (or free) zero or
5458 /// sign extensions.
5459 static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
5460   assert(!Val->use_empty() && "Input must have at least one use");
5461   const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
5462   bool IsSExt = isa<SExtInst>(FirstUser);
5463   Type *ExtTy = FirstUser->getType();
5464   for (const User *U : Val->users()) {
5465     const Instruction *UI = cast<Instruction>(U);
5466     if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
5467       return false;
5468     Type *CurTy = UI->getType();
5469     // Same input and output types: Same instruction after CSE.
5470     if (CurTy == ExtTy)
5471       continue;
5472 
5473     // If IsSExt is true, we are in this situation:
5474     // a = Val
5475     // b = sext ty1 a to ty2
5476     // c = sext ty1 a to ty3
5477     // Assuming ty2 is shorter than ty3, this could be turned into:
5478     // a = Val
5479     // b = sext ty1 a to ty2
5480     // c = sext ty2 b to ty3
5481     // However, the last sext is not free.
5482     if (IsSExt)
5483       return false;
5484 
5485     // This is a ZExt, maybe this is free to extend from one type to another.
5486     // In that case, we would not account for a different use.
5487     Type *NarrowTy;
5488     Type *LargeTy;
5489     if (ExtTy->getScalarType()->getIntegerBitWidth() >
5490         CurTy->getScalarType()->getIntegerBitWidth()) {
5491       NarrowTy = CurTy;
5492       LargeTy = ExtTy;
5493     } else {
5494       NarrowTy = ExtTy;
5495       LargeTy = CurTy;
5496     }
5497 
5498     if (!TLI.isZExtFree(NarrowTy, LargeTy))
5499       return false;
5500   }
5501   // All uses are the same or can be derived from one another for free.
5502   return true;
5503 }
5504 
5505 /// Try to speculatively promote extensions in \p Exts and continue
5506 /// promoting through newly promoted operands recursively as far as doing so is
5507 /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
5508 /// When some promotion happened, \p TPT contains the proper state to revert
5509 /// them.
5510 ///
5511 /// \return true if some promotion happened, false otherwise.
5512 bool CodeGenPrepare::tryToPromoteExts(
5513     TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
5514     SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
5515     unsigned CreatedInstsCost) {
5516   bool Promoted = false;
5517 
5518   // Iterate over all the extensions to try to promote them.
5519   for (auto *I : Exts) {
5520     // Early check if we directly have ext(load).
5521     if (isa<LoadInst>(I->getOperand(0))) {
5522       ProfitablyMovedExts.push_back(I);
5523       continue;
5524     }
5525 
    // Check whether or not we want to do any promotion.  The reason we have
    // this check inside the for loop is to catch the case where an extension
    // is directly fed by a load, because in such a case the extension can be
    // moved up without any promotion on its operands.
5530     if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)
5531       return false;
5532 
5533     // Get the action to perform the promotion.
5534     TypePromotionHelper::Action TPH =
5535         TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
5536     // Check if we can promote.
5537     if (!TPH) {
5538       // Save the current extension as we cannot move up through its operand.
5539       ProfitablyMovedExts.push_back(I);
5540       continue;
5541     }
5542 
5543     // Save the current state.
5544     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5545         TPT.getRestorationPoint();
5546     SmallVector<Instruction *, 4> NewExts;
5547     unsigned NewCreatedInstsCost = 0;
5548     unsigned ExtCost = !TLI->isExtFree(I);
5549     // Promote.
5550     Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
5551                              &NewExts, nullptr, *TLI);
5552     assert(PromotedVal &&
5553            "TypePromotionHelper should have filtered out those cases");
5554 
5555     // We would be able to merge only one extension in a load.
5556     // Therefore, if we have more than 1 new extension we heuristically
5557     // cut this search path, because it means we degrade the code quality.
5558     // With exactly 2, the transformation is neutral, because we will merge
5559     // one extension but leave one. However, we optimistically keep going,
5560     // because the new extension may be removed too.
5561     long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
5562     // FIXME: It would be possible to propagate a negative value instead of
5563     // conservatively ceiling it to 0.
5564     TotalCreatedInstsCost =
5565         std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
5566     if (!StressExtLdPromotion &&
5567         (TotalCreatedInstsCost > 1 ||
5568          !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
5569       // This promotion is not profitable, rollback to the previous state, and
5570       // save the current extension in ProfitablyMovedExts as the latest
5571       // speculative promotion turned out to be unprofitable.
5572       TPT.rollback(LastKnownGood);
5573       ProfitablyMovedExts.push_back(I);
5574       continue;
5575     }
5576     // Continue promoting NewExts as far as doing so is profitable.
5577     SmallVector<Instruction *, 2> NewlyMovedExts;
5578     (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
5579     bool NewPromoted = false;
5580     for (auto *ExtInst : NewlyMovedExts) {
5581       Instruction *MovedExt = cast<Instruction>(ExtInst);
5582       Value *ExtOperand = MovedExt->getOperand(0);
      // If we have reached a load, we need this extra profitability check
      // as it could potentially be merged into an ext(load).
5585       if (isa<LoadInst>(ExtOperand) &&
5586           !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
5587             (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
5588         continue;
5589 
5590       ProfitablyMovedExts.push_back(MovedExt);
5591       NewPromoted = true;
5592     }
5593 
5594     // If none of speculative promotions for NewExts is profitable, rollback
5595     // and save the current extension (I) as the last profitable extension.
5596     if (!NewPromoted) {
5597       TPT.rollback(LastKnownGood);
5598       ProfitablyMovedExts.push_back(I);
5599       continue;
5600     }
5601     // The promotion is profitable.
5602     Promoted = true;
5603   }
5604   return Promoted;
5605 }
5606 
/// Merge redundant sexts when one dominates the other.
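///
/// For example (illustrative), when both instructions extend the same value
/// \code
/// %s1 = sext i32 %v to i64
/// ...
/// %s2 = sext i32 %v to i64   ; dominated by %s1
/// \endcode
/// the uses of %s2 are rewritten to use %s1 and %s2 is removed.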
5608 bool CodeGenPrepare::mergeSExts(Function &F) {
5609   bool Changed = false;
5610   for (auto &Entry : ValToSExtendedUses) {
5611     SExts &Insts = Entry.second;
5612     SExts CurPts;
5613     for (Instruction *Inst : Insts) {
5614       if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
5615           Inst->getOperand(0) != Entry.first)
5616         continue;
5617       bool inserted = false;
5618       for (auto &Pt : CurPts) {
5619         if (getDT(F).dominates(Inst, Pt)) {
5620           Pt->replaceAllUsesWith(Inst);
5621           RemovedInsts.insert(Pt);
5622           Pt->removeFromParent();
5623           Pt = Inst;
5624           inserted = true;
5625           Changed = true;
5626           break;
5627         }
5628         if (!getDT(F).dominates(Pt, Inst))
5629           // Give up if we need to merge in a common dominator as the
5630           // experiments show it is not profitable.
5631           continue;
5632         Inst->replaceAllUsesWith(Pt);
5633         RemovedInsts.insert(Inst);
5634         Inst->removeFromParent();
5635         inserted = true;
5636         Changed = true;
5637         break;
5638       }
5639       if (!inserted)
5640         CurPts.push_back(Inst);
5641     }
5642   }
5643   return Changed;
5644 }
5645 
// Split large data structures so that the GEPs accessing them can have
// smaller offsets, which allows them to be sunk to the same blocks as their
// users.
5648 // For example, a large struct starting from %base is split into two parts
5649 // where the second part starts from %new_base.
5650 //
5651 // Before:
5652 // BB0:
5653 //   %base     =
5654 //
5655 // BB1:
5656 //   %gep0     = gep %base, off0
5657 //   %gep1     = gep %base, off1
5658 //   %gep2     = gep %base, off2
5659 //
5660 // BB2:
5661 //   %load1    = load %gep0
5662 //   %load2    = load %gep1
5663 //   %load3    = load %gep2
5664 //
5665 // After:
5666 // BB0:
5667 //   %base     =
5668 //   %new_base = gep %base, off0
5669 //
5670 // BB1:
5671 //   %new_gep0 = %new_base
5672 //   %new_gep1 = gep %new_base, off1 - off0
5673 //   %new_gep2 = gep %new_base, off2 - off0
5674 //
5675 // BB2:
5676 //   %load1    = load i32, i32* %new_gep0
5677 //   %load2    = load i32, i32* %new_gep1
5678 //   %load3    = load i32, i32* %new_gep2
5679 //
// %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because
// their offsets are small enough to fit into the addressing mode.
5682 bool CodeGenPrepare::splitLargeGEPOffsets() {
5683   bool Changed = false;
5684   for (auto &Entry : LargeOffsetGEPMap) {
5685     Value *OldBase = Entry.first;
5686     SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
5687         &LargeOffsetGEPs = Entry.second;
5688     auto compareGEPOffset =
5689         [&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
5690             const std::pair<GetElementPtrInst *, int64_t> &RHS) {
5691           if (LHS.first == RHS.first)
5692             return false;
5693           if (LHS.second != RHS.second)
5694             return LHS.second < RHS.second;
5695           return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
5696         };
    // Sort all the GEPs of the same data structure based on the offsets.
5698     llvm::sort(LargeOffsetGEPs, compareGEPOffset);
5699     LargeOffsetGEPs.erase(
5700         std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
5701         LargeOffsetGEPs.end());
5702     // Skip if all the GEPs have the same offsets.
5703     if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
5704       continue;
5705     GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
5706     int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
5707     Value *NewBaseGEP = nullptr;
5708 
5709     auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
5710     while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
5711       GetElementPtrInst *GEP = LargeOffsetGEP->first;
5712       int64_t Offset = LargeOffsetGEP->second;
5713       if (Offset != BaseOffset) {
5714         TargetLowering::AddrMode AddrMode;
5715         AddrMode.BaseOffs = Offset - BaseOffset;
5716         // The result type of the GEP might not be the type of the memory
5717         // access.
5718         if (!TLI->isLegalAddressingMode(*DL, AddrMode,
5719                                         GEP->getResultElementType(),
5720                                         GEP->getAddressSpace())) {
5721           // We need to create a new base if the offset to the current base is
5722           // too large to fit into the addressing mode. So, a very large struct
5723           // may be split into several parts.
5724           BaseGEP = GEP;
5725           BaseOffset = Offset;
5726           NewBaseGEP = nullptr;
5727         }
5728       }
5729 
5730       // Generate a new GEP to replace the current one.
5731       LLVMContext &Ctx = GEP->getContext();
5732       Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
5733       Type *I8PtrTy =
5734           Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
5735       Type *I8Ty = Type::getInt8Ty(Ctx);
5736 
5737       if (!NewBaseGEP) {
5738         // Create a new base if we don't have one yet.  Find the insertion
5739         // pointer for the new base first.
5740         BasicBlock::iterator NewBaseInsertPt;
5741         BasicBlock *NewBaseInsertBB;
5742         if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
5743           // If the base of the struct is an instruction, the new base will be
5744           // inserted close to it.
5745           NewBaseInsertBB = BaseI->getParent();
5746           if (isa<PHINode>(BaseI))
5747             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5748           else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
5749             NewBaseInsertBB =
5750                 SplitEdge(NewBaseInsertBB, Invoke->getNormalDest());
5751             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5752           } else
5753             NewBaseInsertPt = std::next(BaseI->getIterator());
5754         } else {
5755           // If the current base is an argument or global value, the new base
5756           // will be inserted to the entry block.
5757           NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
5758           NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5759         }
5760         IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
5761         // Create a new base.
5762         Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset);
5763         NewBaseGEP = OldBase;
5764         if (NewBaseGEP->getType() != I8PtrTy)
5765           NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
5766         NewBaseGEP =
5767             NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
5768         NewGEPBases.insert(NewBaseGEP);
5769       }
5770 
5771       IRBuilder<> Builder(GEP);
5772       Value *NewGEP = NewBaseGEP;
5773       if (Offset == BaseOffset) {
5774         if (GEP->getType() != I8PtrTy)
5775           NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
5776       } else {
5777         // Calculate the new offset for the new GEP.
5778         Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset);
5779         NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
5780 
5781         if (GEP->getType() != I8PtrTy)
5782           NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
5783       }
5784       GEP->replaceAllUsesWith(NewGEP);
5785       LargeOffsetGEPID.erase(GEP);
5786       LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
5787       GEP->eraseFromParent();
5788       Changed = true;
5789     }
5790   }
5791   return Changed;
5792 }
5793 
5794 bool CodeGenPrepare::optimizePhiType(
5795     PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,
5796     SmallPtrSetImpl<Instruction *> &DeletedInstrs) {
  // We are looking for a collection of interconnected phi nodes that together
  // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts
  // are of the same type. Convert the whole set of nodes to the type of the
  // bitcast.
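  //
  // For example (an illustrative sketch), given an i32 value %i that is not
  // itself a load:
  //   %d   = bitcast i32 %i to float
  //   %phi = phi float [ %d, %entry ], [ %phi, %loop ]
  //   %u   = bitcast float %phi to i32
  // the phi can be rewritten to operate on i32, removing both bitcasts,
  // provided the target's shouldConvertPhiType hook agrees.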
5801   Type *PhiTy = I->getType();
5802   Type *ConvertTy = nullptr;
5803   if (Visited.count(I) ||
5804       (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))
5805     return false;
5806 
5807   SmallVector<Instruction *, 4> Worklist;
5808   Worklist.push_back(cast<Instruction>(I));
5809   SmallPtrSet<PHINode *, 4> PhiNodes;
5810   PhiNodes.insert(I);
5811   Visited.insert(I);
5812   SmallPtrSet<Instruction *, 4> Defs;
5813   SmallPtrSet<Instruction *, 4> Uses;
  // This works by adding extra bitcasts between load/stores and removing
  // existing bitcasts. If we have a phi(bitcast(load)) or a
  // store(bitcast(phi)), we can get in the situation where we remove a bitcast
  // in one iteration just to add it again in the next. We need to ensure that
  // at least one bitcast we remove is anchored to something that will not
  // change back.
5819   bool AnyAnchored = false;
5820 
5821   while (!Worklist.empty()) {
5822     Instruction *II = Worklist.pop_back_val();
5823 
5824     if (auto *Phi = dyn_cast<PHINode>(II)) {
5825       // Handle Defs, which might also be PHI's
5826       for (Value *V : Phi->incoming_values()) {
5827         if (auto *OpPhi = dyn_cast<PHINode>(V)) {
5828           if (!PhiNodes.count(OpPhi)) {
5829             if (Visited.count(OpPhi))
5830               return false;
5831             PhiNodes.insert(OpPhi);
5832             Visited.insert(OpPhi);
5833             Worklist.push_back(OpPhi);
5834           }
5835         } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) {
5836           if (!OpLoad->isSimple())
5837             return false;
5838           if (!Defs.count(OpLoad)) {
5839             Defs.insert(OpLoad);
5840             Worklist.push_back(OpLoad);
5841           }
5842         } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) {
5843           if (!Defs.count(OpEx)) {
5844             Defs.insert(OpEx);
5845             Worklist.push_back(OpEx);
5846           }
5847         } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
5848           if (!ConvertTy)
5849             ConvertTy = OpBC->getOperand(0)->getType();
5850           if (OpBC->getOperand(0)->getType() != ConvertTy)
5851             return false;
5852           if (!Defs.count(OpBC)) {
5853             Defs.insert(OpBC);
5854             Worklist.push_back(OpBC);
5855             AnyAnchored |= !isa<LoadInst>(OpBC->getOperand(0)) &&
5856                            !isa<ExtractElementInst>(OpBC->getOperand(0));
5857           }
5858         } else if (!isa<UndefValue>(V)) {
5859           return false;
5860         }
5861       }
5862     }
5863 
5864     // Handle uses which might also be phi's
5865     for (User *V : II->users()) {
5866       if (auto *OpPhi = dyn_cast<PHINode>(V)) {
5867         if (!PhiNodes.count(OpPhi)) {
5868           if (Visited.count(OpPhi))
5869             return false;
5870           PhiNodes.insert(OpPhi);
5871           Visited.insert(OpPhi);
5872           Worklist.push_back(OpPhi);
5873         }
5874       } else if (auto *OpStore = dyn_cast<StoreInst>(V)) {
5875         if (!OpStore->isSimple() || OpStore->getOperand(0) != II)
5876           return false;
5877         Uses.insert(OpStore);
5878       } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
5879         if (!ConvertTy)
5880           ConvertTy = OpBC->getType();
5881         if (OpBC->getType() != ConvertTy)
5882           return false;
5883         Uses.insert(OpBC);
5884         AnyAnchored |=
5885             any_of(OpBC->users(), [](User *U) { return !isa<StoreInst>(U); });
5886       } else {
5887         return false;
5888       }
5889     }
5890   }
5891 
  if (!ConvertTy || !AnyAnchored ||
      !TLI->shouldConvertPhiType(PhiTy, ConvertTy))
5893     return false;
5894 
5895   LLVM_DEBUG(dbgs() << "Converting " << *I << "\n  and connected nodes to "
5896                     << *ConvertTy << "\n");
5897 
5898   // Create all the new phi nodes of the new type, and bitcast any loads to the
5899   // correct type.
5900   ValueToValueMap ValMap;
5901   ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy);
5902   for (Instruction *D : Defs) {
5903     if (isa<BitCastInst>(D)) {
5904       ValMap[D] = D->getOperand(0);
5905       DeletedInstrs.insert(D);
5906     } else {
5907       ValMap[D] =
5908           new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
5909     }
5910   }
5911   for (PHINode *Phi : PhiNodes)
5912     ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(),
5913                                   Phi->getName() + ".tc", Phi);
5914   // Pipe together all the PhiNodes.
5915   for (PHINode *Phi : PhiNodes) {
5916     PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
5917     for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)
5918       NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)],
5919                           Phi->getIncomingBlock(i));
5920     Visited.insert(NewPhi);
5921   }
5922   // And finally pipe up the stores and bitcasts
5923   for (Instruction *U : Uses) {
5924     if (isa<BitCastInst>(U)) {
5925       DeletedInstrs.insert(U);
5926       U->replaceAllUsesWith(ValMap[U->getOperand(0)]);
5927     } else {
5928       U->setOperand(0,
5929                     new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
5930     }
5931   }
5932 
5933   // Save the removed phis to be deleted later.
5934   for (PHINode *Phi : PhiNodes)
5935     DeletedInstrs.insert(Phi);
5936   return true;
5937 }
5938 
5939 bool CodeGenPrepare::optimizePhiTypes(Function &F) {
5940   if (!OptimizePhiTypes)
5941     return false;
5942 
5943   bool Changed = false;
5944   SmallPtrSet<PHINode *, 4> Visited;
5945   SmallPtrSet<Instruction *, 4> DeletedInstrs;
5946 
  // Attempt to optimize all the phis in the function to the correct type.
5948   for (auto &BB : F)
5949     for (auto &Phi : BB.phis())
5950       Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs);
5951 
5952   // Remove any old phi's that have been converted.
5953   for (auto *I : DeletedInstrs) {
5954     I->replaceAllUsesWith(UndefValue::get(I->getType()));
5955     I->eraseFromParent();
5956   }
5957 
5958   return Changed;
5959 }
5960 
/// Return true if an ext(load) can be formed from an extension in
/// \p MovedExts.
5963 bool CodeGenPrepare::canFormExtLd(
5964     const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
5965     Instruction *&Inst, bool HasPromoted) {
5966   for (auto *MovedExtInst : MovedExts) {
5967     if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
5968       LI = cast<LoadInst>(MovedExtInst->getOperand(0));
5969       Inst = MovedExtInst;
5970       break;
5971     }
5972   }
5973   if (!LI)
5974     return false;
5975 
5976   // If they're already in the same block, there's nothing to do.
5977   // Make the cheap checks first if we did not promote.
5978   // If we promoted, we need to check if it is indeed profitable.
5979   if (!HasPromoted && LI->getParent() == Inst->getParent())
5980     return false;
5981 
5982   return TLI->isExtLoad(LI, Inst, *DL);
5983 }
5984 
5985 /// Move a zext or sext fed by a load into the same basic block as the load,
5986 /// unless conditions are unfavorable. This allows SelectionDAG to fold the
5987 /// extend into the load.
5988 ///
5989 /// E.g.,
5990 /// \code
5991 /// %ld = load i32* %addr
5992 /// %add = add nuw i32 %ld, 4
5993 /// %zext = zext i32 %add to i64
/// \endcode
5995 /// =>
5996 /// \code
5997 /// %ld = load i32* %addr
5998 /// %zext = zext i32 %ld to i64
5999 /// %add = add nuw i64 %zext, 4
/// \endcode
/// Note that the promotion of %add to i64 is done in tryToPromoteExts(), which
/// allows us to match zext(load i32*) to i64.
6003 ///
/// Also, try to promote the computations used to obtain a sign extended
/// value used in memory accesses.
6006 /// E.g.,
6007 /// \code
6008 /// a = add nsw i32 b, 3
6009 /// d = sext i32 a to i64
6010 /// e = getelementptr ..., i64 d
6011 /// \endcode
6012 /// =>
6013 /// \code
6014 /// f = sext i32 b to i64
6015 /// a = add nsw i64 f, 3
6016 /// e = getelementptr ..., i64 a
6017 /// \endcode
6018 ///
6019 /// \p Inst[in/out] the extension may be modified during the process if some
6020 /// promotions apply.
6021 bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
6022   bool AllowPromotionWithoutCommonHeader = false;
  /// See if it is an interesting sext operation for the address type
6024   /// promotion before trying to promote it, e.g., the ones with the right
6025   /// type and used in memory accesses.
6026   bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
6027       *Inst, AllowPromotionWithoutCommonHeader);
6028   TypePromotionTransaction TPT(RemovedInsts);
6029   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
6030       TPT.getRestorationPoint();
6031   SmallVector<Instruction *, 1> Exts;
6032   SmallVector<Instruction *, 2> SpeculativelyMovedExts;
6033   Exts.push_back(Inst);
6034 
6035   bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
6036 
6037   // Look for a load being extended.
6038   LoadInst *LI = nullptr;
6039   Instruction *ExtFedByLoad;
6040 
  // Try to promote a chain of computation if it allows us to form an extended
  // load.
6043   if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
6044     assert(LI && ExtFedByLoad && "Expect a valid load and extension");
6045     TPT.commit();
6046     // Move the extend into the same block as the load.
6047     ExtFedByLoad->moveAfter(LI);
6048     ++NumExtsMoved;
6049     Inst = ExtFedByLoad;
6050     return true;
6051   }
6052 
  // Continue promoting SExts if the target considers address type promotion
  // worthwhile.
6054   if (ATPConsiderable &&
6055       performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
6056                                   HasPromoted, TPT, SpeculativelyMovedExts))
6057     return true;
6058 
6059   TPT.rollback(LastKnownGood);
6060   return false;
6061 }
6062 
6063 // Perform address type promotion if doing so is profitable.
6064 // If AllowPromotionWithoutCommonHeader == false, we should find other sext
6065 // instructions that sign extended the same initial value. However, if
// AllowPromotionWithoutCommonHeader == true, we expect promoting the
// extension to be profitable on its own.
6068 bool CodeGenPrepare::performAddressTypePromotion(
6069     Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
6070     bool HasPromoted, TypePromotionTransaction &TPT,
6071     SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
6072   bool Promoted = false;
6073   SmallPtrSet<Instruction *, 1> UnhandledExts;
6074   bool AllSeenFirst = true;
6075   for (auto *I : SpeculativelyMovedExts) {
6076     Value *HeadOfChain = I->getOperand(0);
6077     DenseMap<Value *, Instruction *>::iterator AlreadySeen =
6078         SeenChainsForSExt.find(HeadOfChain);
6079     // If there is an unhandled SExt which has the same header, try to promote
6080     // it as well.
6081     if (AlreadySeen != SeenChainsForSExt.end()) {
6082       if (AlreadySeen->second != nullptr)
6083         UnhandledExts.insert(AlreadySeen->second);
6084       AllSeenFirst = false;
6085     }
6086   }
6087 
6088   if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
6089                         SpeculativelyMovedExts.size() == 1)) {
6090     TPT.commit();
6091     if (HasPromoted)
6092       Promoted = true;
6093     for (auto *I : SpeculativelyMovedExts) {
6094       Value *HeadOfChain = I->getOperand(0);
6095       SeenChainsForSExt[HeadOfChain] = nullptr;
6096       ValToSExtendedUses[HeadOfChain].push_back(I);
6097     }
    // Update Inst as promotion happened.
6099     Inst = SpeculativelyMovedExts.pop_back_val();
6100   } else {
    // This is the first chain visited from the header; keep the current chain
    // as unhandled. Defer promoting it until we encounter another SExt chain
    // derived from the same header.
6104     for (auto *I : SpeculativelyMovedExts) {
6105       Value *HeadOfChain = I->getOperand(0);
6106       SeenChainsForSExt[HeadOfChain] = Inst;
6107     }
6108     return false;
6109   }
6110 
6111   if (!AllSeenFirst && !UnhandledExts.empty())
6112     for (auto *VisitedSExt : UnhandledExts) {
6113       if (RemovedInsts.count(VisitedSExt))
6114         continue;
6115       TypePromotionTransaction TPT(RemovedInsts);
6116       SmallVector<Instruction *, 1> Exts;
6117       SmallVector<Instruction *, 2> Chains;
6118       Exts.push_back(VisitedSExt);
6119       bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
6120       TPT.commit();
6121       if (HasPromoted)
6122         Promoted = true;
6123       for (auto *I : Chains) {
6124         Value *HeadOfChain = I->getOperand(0);
6125         // Mark this as handled.
6126         SeenChainsForSExt[HeadOfChain] = nullptr;
6127         ValToSExtendedUses[HeadOfChain].push_back(I);
6128       }
6129     }
6130   return Promoted;
6131 }
6132 
6133 bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
6134   BasicBlock *DefBB = I->getParent();
6135 
  // If the result of a {s|z}ext and its source are both live out, rewrite all
  // other uses of the source with the result of the extension.
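  //
  // For example (an illustrative sketch), when truncation is free:
  //   DefBB:  %src = ...
  //           %ext = zext i16 %src to i32
  //   UseBB:  use of i16 %src
  // the use in UseBB is rewritten to use a newly inserted
  //   %t = trunc i32 %ext to i16
  // so that only %ext needs to be live out of DefBB.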
6138   Value *Src = I->getOperand(0);
6139   if (Src->hasOneUse())
6140     return false;
6141 
6142   // Only do this xform if truncating is free.
6143   if (!TLI->isTruncateFree(I->getType(), Src->getType()))
6144     return false;
6145 
6146   // Only safe to perform the optimization if the source is also defined in
6147   // this block.
6148   if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
6149     return false;
6150 
6151   bool DefIsLiveOut = false;
6152   for (User *U : I->users()) {
6153     Instruction *UI = cast<Instruction>(U);
6154 
6155     // Figure out which BB this ext is used in.
6156     BasicBlock *UserBB = UI->getParent();
6157     if (UserBB == DefBB) continue;
6158     DefIsLiveOut = true;
6159     break;
6160   }
6161   if (!DefIsLiveOut)
6162     return false;
6163 
6164   // Make sure none of the uses are PHI nodes.
6165   for (User *U : Src->users()) {
6166     Instruction *UI = cast<Instruction>(U);
6167     BasicBlock *UserBB = UI->getParent();
6168     if (UserBB == DefBB) continue;
6169     // Be conservative. We don't want this xform to end up introducing
6170     // reloads just before load / store instructions.
6171     if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
6172       return false;
6173   }
6174 
6175   // InsertedTruncs - Only insert one trunc in each block once.
6176   DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
6177 
6178   bool MadeChange = false;
6179   for (Use &U : Src->uses()) {
6180     Instruction *User = cast<Instruction>(U.getUser());
6181 
6182     // Figure out which BB this ext is used in.
6183     BasicBlock *UserBB = User->getParent();
6184     if (UserBB == DefBB) continue;
6185 
6186     // Both src and def are live in this block. Rewrite the use.
6187     Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
6188 
6189     if (!InsertedTrunc) {
6190       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
6191       assert(InsertPt != UserBB->end());
6192       InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
6193       InsertedInsts.insert(InsertedTrunc);
6194     }
6195 
6196     // Replace a use of the {s|z}ext source with a use of the result.
6197     U = InsertedTrunc;
6198     ++NumExtUses;
6199     MadeChange = true;
6200   }
6201 
6202   return MadeChange;
6203 }
6204 
6205 // Find loads whose uses only use some of the loaded value's bits.  Add an "and"
6206 // just after the load if the target can fold this into one extload instruction,
6207 // with the hope of eliminating some of the other later "and" instructions using
6208 // the loaded value.  "and"s that are made trivially redundant by the insertion
6209 // of the new "and" are removed by this function, while others (e.g. those whose
6210 // path from the load goes through a phi) are left for isel to potentially
6211 // remove.
6212 //
6213 // For example:
6214 //
6215 // b0:
6216 //   x = load i32
6217 //   ...
6218 // b1:
6219 //   y = and x, 0xff
6220 //   z = use y
6221 //
6222 // becomes:
6223 //
6224 // b0:
6225 //   x = load i32
6226 //   x' = and x, 0xff
6227 //   ...
6228 // b1:
6229 //   z = use x'
6230 //
6231 // whereas:
6232 //
6233 // b0:
6234 //   x1 = load i32
6235 //   ...
6236 // b1:
6237 //   x2 = load i32
6238 //   ...
6239 // b2:
6240 //   x = phi x1, x2
6241 //   y = and x, 0xff
6242 //
6243 // becomes (after a call to optimizeLoadExt for each load):
6244 //
6245 // b0:
6246 //   x1 = load i32
6247 //   x1' = and x1, 0xff
6248 //   ...
6249 // b1:
6250 //   x2 = load i32
6251 //   x2' = and x2, 0xff
6252 //   ...
6253 // b2:
6254 //   x = phi x1', x2'
6255 //   y = and x, 0xff
6256 bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
6257   if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
6258     return false;
6259 
6260   // Skip loads we've already transformed.
6261   if (Load->hasOneUse() &&
6262       InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
6263     return false;
6264 
6265   // Look at all uses of Load, looking through phis, to determine how many bits
6266   // of the loaded value are needed.
6267   SmallVector<Instruction *, 8> WorkList;
6268   SmallPtrSet<Instruction *, 16> Visited;
6269   SmallVector<Instruction *, 8> AndsToMaybeRemove;
6270   for (auto *U : Load->users())
6271     WorkList.push_back(cast<Instruction>(U));
6272 
6273   EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
6274   unsigned BitWidth = LoadResultVT.getSizeInBits();
6275   APInt DemandBits(BitWidth, 0);
6276   APInt WidestAndBits(BitWidth, 0);
6277 
6278   while (!WorkList.empty()) {
6279     Instruction *I = WorkList.back();
6280     WorkList.pop_back();
6281 
6282     // Break use-def graph loops.
6283     if (!Visited.insert(I).second)
6284       continue;
6285 
6286     // For a PHI node, push all of its users.
6287     if (auto *Phi = dyn_cast<PHINode>(I)) {
6288       for (auto *U : Phi->users())
6289         WorkList.push_back(cast<Instruction>(U));
6290       continue;
6291     }
6292 
6293     switch (I->getOpcode()) {
6294     case Instruction::And: {
6295       auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
6296       if (!AndC)
6297         return false;
6298       APInt AndBits = AndC->getValue();
6299       DemandBits |= AndBits;
6300       // Keep track of the widest and mask we see.
6301       if (AndBits.ugt(WidestAndBits))
6302         WidestAndBits = AndBits;
6303       if (AndBits == WidestAndBits && I->getOperand(0) == Load)
6304         AndsToMaybeRemove.push_back(I);
6305       break;
6306     }
6307 
6308     case Instruction::Shl: {
6309       auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
6310       if (!ShlC)
6311         return false;
6312       uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
6313       DemandBits.setLowBits(BitWidth - ShiftAmt);
6314       break;
6315     }
6316 
6317     case Instruction::Trunc: {
6318       EVT TruncVT = TLI->getValueType(*DL, I->getType());
6319       unsigned TruncBitWidth = TruncVT.getSizeInBits();
6320       DemandBits.setLowBits(TruncBitWidth);
6321       break;
6322     }
6323 
6324     default:
6325       return false;
6326     }
6327   }
6328 
6329   uint32_t ActiveBits = DemandBits.getActiveBits();
6330   // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
6331   // target even if isLoadExtLegal says an i1 EXTLOAD is valid.  For example,
6332   // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
6333   // (and (load x) 1) is not matched as a single instruction, rather as a LDR
6334   // followed by an AND.
6335   // TODO: Look into removing this restriction by fixing backends to either
6336   // return false for isLoadExtLegal for i1 or have them select this pattern to
6337   // a single instruction.
6338   //
6339   // Also avoid hoisting if we didn't see any ands with the exact DemandBits
6340   // mask, since these are the only ands that will be removed by isel.
6341   if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
6342       WidestAndBits != DemandBits)
6343     return false;
6344 
6345   LLVMContext &Ctx = Load->getType()->getContext();
6346   Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
6347   EVT TruncVT = TLI->getValueType(*DL, TruncTy);
6348 
6349   // Reject cases that won't be matched as extloads.
6350   if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
6351       !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
6352     return false;
6353 
6354   IRBuilder<> Builder(Load->getNextNode());
6355   auto *NewAnd = cast<Instruction>(
6356       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
6357   // Mark this instruction as "inserted by CGP", so that other
6358   // optimizations don't touch it.
6359   InsertedInsts.insert(NewAnd);
6360 
6361   // Replace all uses of load with new and (except for the use of load in the
6362   // new and itself).
6363   Load->replaceAllUsesWith(NewAnd);
6364   NewAnd->setOperand(0, Load);
6365 
6366   // Remove any and instructions that are now redundant.
6367   for (auto *And : AndsToMaybeRemove)
6368     // Check that the and mask is the same as the one we decided to put on the
6369     // new and.
6370     if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
6371       And->replaceAllUsesWith(NewAnd);
6372       if (&*CurInstIterator == And)
6373         CurInstIterator = std::next(And->getIterator());
6374       And->eraseFromParent();
6375       ++NumAndUses;
6376     }
6377 
6378   ++NumAndsAdded;
6379   return true;
6380 }
6381 
6382 /// Check if V (an operand of a select instruction) is an expensive instruction
6383 /// that is only used once.
6384 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
6385   auto *I = dyn_cast<Instruction>(V);
6386   // If it's safe to speculatively execute, then it should not have side
6387   // effects; therefore, it's safe to sink and possibly *not* execute.
6388   return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
6389          TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
6390          TargetTransformInfo::TCC_Expensive;
6391 }
6392 
6393 /// Returns true if a SelectInst should be turned into an explicit branch.
6394 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
6395                                                 const TargetLowering *TLI,
6396                                                 SelectInst *SI) {
6397   // If even a predictable select is cheap, then a branch can't be cheaper.
6398   if (!TLI->isPredictableSelectExpensive())
6399     return false;
6400 
6401   // FIXME: This should use the same heuristics as IfConversion to determine
6402   // whether a select is better represented as a branch.
6403 
6404   // If metadata tells us that the select condition is obviously predictable,
6405   // then we want to replace the select with a branch.
6406   uint64_t TrueWeight, FalseWeight;
6407   if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
6408     uint64_t Max = std::max(TrueWeight, FalseWeight);
6409     uint64_t Sum = TrueWeight + FalseWeight;
6410     if (Sum != 0) {
6411       auto Probability = BranchProbability::getBranchProbability(Max, Sum);
6412       if (Probability > TLI->getPredictableBranchThreshold())
6413         return true;
6414     }
6415   }
6416 
6417   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
6418 
6419   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
6420   // comparison condition. If the compare has more than one use, there's
6421   // probably another cmov or setcc around, so it's not worth emitting a branch.
6422   if (!Cmp || !Cmp->hasOneUse())
6423     return false;
6424 
6425   // If either operand of the select is expensive and only needed on one side
6426   // of the select, we should form a branch.
6427   if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
6428       sinkSelectOperand(TTI, SI->getFalseValue()))
6429     return true;
6430 
6431   return false;
6432 }
6433 
/// If \p isTrue is true, return the true value of \p SI, otherwise return the
/// false value of \p SI. If the true/false value of \p SI is defined by any
/// select instructions in \p Selects, look through the defining select
/// instruction until the true/false value is not defined in \p Selects.
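///
/// For example (illustrative), with
/// \code
/// %s1 = select i1 %c, i32 %a, i32 %b
/// %s2 = select i1 %c, i32 %s1, i32 %d
/// \endcode
/// and \p Selects containing both selects, the true value of %s2 resolves
/// through %s1 to %a.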
6438 static Value *getTrueOrFalseValue(
6439     SelectInst *SI, bool isTrue,
6440     const SmallPtrSet<const Instruction *, 2> &Selects) {
6441   Value *V = nullptr;
6442 
6443   for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
6444        DefSI = dyn_cast<SelectInst>(V)) {
6445     assert(DefSI->getCondition() == SI->getCondition() &&
6446            "The condition of DefSI does not match with SI");
6447     V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
6448   }
6449 
6450   assert(V && "Failed to get select true/false value");
6451   return V;
6452 }
6453 
6454 bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
6455   assert(Shift->isShift() && "Expected a shift");
6456 
6457   // If this is (1) a vector shift, (2) shifts by scalars are cheaper than
6458   // general vector shifts, and (3) the shift amount is a select-of-splatted
6459   // values, hoist the shifts before the select:
6460   //   shift Op0, (select Cond, TVal, FVal) -->
6461   //   select Cond, (shift Op0, TVal), (shift Op0, FVal)
6462   //
6463   // This is inverting a generic IR transform when we know that the cost of a
6464   // general vector shift is more than the cost of 2 shift-by-scalars.
6465   // We can't do this effectively in SDAG because we may not be able to
6466   // determine if the select operands are splats from within a basic block.
6467   Type *Ty = Shift->getType();
6468   if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
6469     return false;
6470   Value *Cond, *TVal, *FVal;
6471   if (!match(Shift->getOperand(1),
6472              m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
6473     return false;
6474   if (!isSplatValue(TVal) || !isSplatValue(FVal))
6475     return false;
6476 
6477   IRBuilder<> Builder(Shift);
6478   BinaryOperator::BinaryOps Opcode = Shift->getOpcode();
6479   Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);
6480   Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);
6481   Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
6482   Shift->replaceAllUsesWith(NewSel);
6483   Shift->eraseFromParent();
6484   return true;
6485 }
6486 
6487 bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
6488   Intrinsic::ID Opcode = Fsh->getIntrinsicID();
6489   assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
6490          "Expected a funnel shift");
6491 
6492   // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
6493   // than general vector shifts, and (3) the shift amount is select-of-splatted
6494   // values, hoist the funnel shifts before the select:
6495   //   fsh Op0, Op1, (select Cond, TVal, FVal) -->
6496   //   select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
6497   //
6498   // This is inverting a generic IR transform when we know that the cost of a
6499   // general vector shift is more than the cost of 2 shift-by-scalars.
6500   // We can't do this effectively in SDAG because we may not be able to
6501   // determine if the select operands are splats from within a basic block.
6502   Type *Ty = Fsh->getType();
6503   if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
6504     return false;
6505   Value *Cond, *TVal, *FVal;
6506   if (!match(Fsh->getOperand(2),
6507              m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
6508     return false;
6509   if (!isSplatValue(TVal) || !isSplatValue(FVal))
6510     return false;
6511 
6512   IRBuilder<> Builder(Fsh);
6513   Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);
6514   Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal });
6515   Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal });
6516   Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
6517   Fsh->replaceAllUsesWith(NewSel);
6518   Fsh->eraseFromParent();
6519   return true;
6520 }
6521 
6522 /// If we have a SelectInst that will likely profit from branch prediction,
6523 /// turn it into a branch.
6524 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
6525   if (DisableSelectToBranch)
6526     return false;
6527 
6528   // Find all consecutive select instructions that share the same condition.
6529   SmallVector<SelectInst *, 2> ASI;
6530   ASI.push_back(SI);
6531   for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
6532        It != SI->getParent()->end(); ++It) {
6533     SelectInst *I = dyn_cast<SelectInst>(&*It);
6534     if (I && SI->getCondition() == I->getCondition()) {
6535       ASI.push_back(I);
6536     } else {
6537       break;
6538     }
6539   }
6540 
6541   SelectInst *LastSI = ASI.back();
  // Increment the current iterator to skip the rest of the select
  // instructions, because they will either all be lowered to branches or none
  // of them will be.
6544   CurInstIterator = std::next(LastSI->getIterator());
6545 
6546   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
6547 
  // Can we convert the 'select' to CF?
6549   if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))
6550     return false;
6551 
6552   TargetLowering::SelectSupportKind SelectKind;
6553   if (VectorCond)
6554     SelectKind = TargetLowering::VectorMaskSelect;
6555   else if (SI->getType()->isVectorTy())
6556     SelectKind = TargetLowering::ScalarCondVectorVal;
6557   else
6558     SelectKind = TargetLowering::ScalarValSelect;
6559 
6560   if (TLI->isSelectSupported(SelectKind) &&
6561       (!isFormingBranchFromSelectProfitable(TTI, TLI, SI) || OptSize ||
6562        llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())))
6563     return false;
6564 
6565   // The DominatorTree needs to be rebuilt by any consumers after this
6566   // transformation. We simply reset here rather than setting the ModifiedDT
6567   // flag to avoid restarting the function walk in runOnFunction for each
6568   // select optimized.
6569   DT.reset();
6570 
6571   // Transform a sequence like this:
6572   //    start:
6573   //       %cmp = cmp uge i32 %a, %b
6574   //       %sel = select i1 %cmp, i32 %c, i32 %d
6575   //
6576   // Into:
6577   //    start:
6578   //       %cmp = cmp uge i32 %a, %b
6579   //       %cmp.frozen = freeze %cmp
6580   //       br i1 %cmp.frozen, label %select.true, label %select.false
6581   //    select.true:
6582   //       br label %select.end
6583   //    select.false:
6584   //       br label %select.end
6585   //    select.end:
6586   //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
6587   //
6588   // %cmp should be frozen, otherwise it may introduce undefined behavior.
6589   // In addition, we may sink instructions that produce %c or %d from
6590   // the entry block into the destination(s) of the new branch.
6591   // If the true or false blocks do not contain a sunken instruction, that
6592   // block and its branch may be optimized away. In that case, one side of the
6593   // first branch will point directly to select.end, and the corresponding PHI
6594   // predecessor block will be the start block.
6595 
6596   // First, we split the block containing the select into 2 blocks.
6597   BasicBlock *StartBlock = SI->getParent();
6598   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
6599   BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
6600   BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
6601 
6602   // Delete the unconditional branch that was just created by the split.
6603   StartBlock->getTerminator()->eraseFromParent();
6604 
6605   // These are the new basic blocks for the conditional branch.
6606   // At least one will become an actual new basic block.
6607   BasicBlock *TrueBlock = nullptr;
6608   BasicBlock *FalseBlock = nullptr;
6609   BranchInst *TrueBranch = nullptr;
6610   BranchInst *FalseBranch = nullptr;
6611 
6612   // Sink expensive instructions into the conditional blocks to avoid executing
6613   // them speculatively.
6614   for (SelectInst *SI : ASI) {
6615     if (sinkSelectOperand(TTI, SI->getTrueValue())) {
6616       if (TrueBlock == nullptr) {
6617         TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
6618                                        EndBlock->getParent(), EndBlock);
6619         TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
6620         TrueBranch->setDebugLoc(SI->getDebugLoc());
6621       }
6622       auto *TrueInst = cast<Instruction>(SI->getTrueValue());
6623       TrueInst->moveBefore(TrueBranch);
6624     }
6625     if (sinkSelectOperand(TTI, SI->getFalseValue())) {
6626       if (FalseBlock == nullptr) {
6627         FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
6628                                         EndBlock->getParent(), EndBlock);
6629         FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6630         FalseBranch->setDebugLoc(SI->getDebugLoc());
6631       }
6632       auto *FalseInst = cast<Instruction>(SI->getFalseValue());
6633       FalseInst->moveBefore(FalseBranch);
6634     }
6635   }
6636 
6637   // If there was nothing to sink, then arbitrarily choose the 'false' side
6638   // for a new input value to the PHI.
6639   if (TrueBlock == FalseBlock) {
6640     assert(TrueBlock == nullptr &&
6641            "Unexpected basic block transform while optimizing select");
6642 
6643     FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
6644                                     EndBlock->getParent(), EndBlock);
6645     auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6646     FalseBranch->setDebugLoc(SI->getDebugLoc());
6647   }
6648 
6649   // Insert the real conditional branch based on the original condition.
6650   // If we did not create a new block for one of the 'true' or 'false' paths
6651   // of the condition, it means that side of the branch goes to the end block
6652   // directly and the path originates from the start block from the point of
6653   // view of the new PHI.
6654   BasicBlock *TT, *FT;
6655   if (TrueBlock == nullptr) {
6656     TT = EndBlock;
6657     FT = FalseBlock;
6658     TrueBlock = StartBlock;
6659   } else if (FalseBlock == nullptr) {
6660     TT = TrueBlock;
6661     FT = EndBlock;
6662     FalseBlock = StartBlock;
6663   } else {
6664     TT = TrueBlock;
6665     FT = FalseBlock;
6666   }
6667   IRBuilder<> IB(SI);
6668   auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
6669   IB.CreateCondBr(CondFr, TT, FT, SI);
6670 
6671   SmallPtrSet<const Instruction *, 2> INS;
6672   INS.insert(ASI.begin(), ASI.end());
6673   // Use reverse iterator because later select may use the value of the
6674   // earlier select, and we need to propagate value through earlier select
6675   // to get the PHI operand.
6676   for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
6677     SelectInst *SI = *It;
6678     // The select itself is replaced with a PHI Node.
6679     PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
6680     PN->takeName(SI);
6681     PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
6682     PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
6683     PN->setDebugLoc(SI->getDebugLoc());
6684 
6685     SI->replaceAllUsesWith(PN);
6686     SI->eraseFromParent();
6687     INS.erase(SI);
6688     ++NumSelectsExpanded;
6689   }
6690 
6691   // Instruct OptimizeBlock to skip to the next block.
6692   CurInstIterator = StartBlock->end();
6693   return true;
6694 }
6695 
/// Some targets only accept certain types for splat inputs. For example a VDUP
/// in MVE takes a GPR (integer) register, and the instructions that
/// incorporate a VDUP (such as a VADD qd, qm, rm) also require a GPR register.
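///
/// Illustrative sketch (names and types hypothetical, assuming
/// shouldConvertSplatType returns i32 for a <4 x float> splat):
///   %ins = insertelement <4 x float> undef, float %f, i32 0
///   %splat = shufflevector <4 x float> %ins, <4 x float> undef,
///            <4 x i32> zeroinitializer
/// is rewritten into
///   %bc = bitcast float %f to i32
///   %ins.i = insertelement <4 x i32> undef, i32 %bc, i32 0
///   %splat.i = shufflevector <4 x i32> %ins.i, <4 x i32> undef,
///              <4 x i32> zeroinitializer
///   %res = bitcast <4 x i32> %splat.i to <4 x float>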
6699 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
6700   if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
6701                             m_Undef(), m_ZeroMask())))
6702     return false;
6703   Type *NewType = TLI->shouldConvertSplatType(SVI);
6704   if (!NewType)
6705     return false;
6706 
6707   auto *SVIVecType = cast<FixedVectorType>(SVI->getType());
6708   assert(!NewType->isVectorTy() && "Expected a scalar type!");
6709   assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
6710          "Expected a type of the same size!");
6711   auto *NewVecType =
6712       FixedVectorType::get(NewType, SVIVecType->getNumElements());
6713 
6714   // Create a bitcast (shuffle (insert (bitcast(..))))
6715   IRBuilder<> Builder(SVI->getContext());
6716   Builder.SetInsertPoint(SVI);
6717   Value *BC1 = Builder.CreateBitCast(
6718       cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
6719   Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
6720                                               (uint64_t)0);
6721   Value *Shuffle = Builder.CreateShuffleVector(
6722       Insert, UndefValue::get(NewVecType), SVI->getShuffleMask());
6723   Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
6724 
6725   SVI->replaceAllUsesWith(BC2);
6726   RecursivelyDeleteTriviallyDeadInstructions(
6727       SVI, TLInfo, nullptr, [&](Value *V) { removeAllAssertingVHReferences(V); });
6728 
  // Also hoist the bitcast up to its operand if they are not in the same
  // block.
6731   if (auto *BCI = dyn_cast<Instruction>(BC1))
6732     if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
6733       if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
6734           !Op->isTerminator() && !Op->isEHPad())
6735         BCI->moveAfter(Op);
6736 
6737   return true;
6738 }
6739 
6740 bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
6741   // If the operands of I can be folded into a target instruction together with
6742   // I, duplicate and sink them.
6743   SmallVector<Use *, 4> OpsToSink;
6744   if (!TLI->shouldSinkOperands(I, OpsToSink))
6745     return false;
6746 
6747   // OpsToSink can contain multiple uses in a use chain (e.g.
6748   // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
6749   // uses must come first, so we process the ops in reverse order so as to not
6750   // create invalid IR.
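  //
  // Illustrative sketch (names hypothetical): if %u1 = shufflevector ... and
  // %u2 = zext %u1 live in another block, a clone of %u1 and then a clone of
  // %u2 are inserted just before I, the cloned %u2 is rewired to use the
  // cloned %u1, and the originals are erased below if they become dead.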
6751   BasicBlock *TargetBB = I->getParent();
6752   bool Changed = false;
6753   SmallVector<Use *, 4> ToReplace;
6754   for (Use *U : reverse(OpsToSink)) {
6755     auto *UI = cast<Instruction>(U->get());
6756     if (UI->getParent() == TargetBB || isa<PHINode>(UI))
6757       continue;
6758     ToReplace.push_back(U);
6759   }
6760 
6761   SetVector<Instruction *> MaybeDead;
6762   DenseMap<Instruction *, Instruction *> NewInstructions;
6763   Instruction *InsertPoint = I;
6764   for (Use *U : ToReplace) {
6765     auto *UI = cast<Instruction>(U->get());
6766     Instruction *NI = UI->clone();
6767     NewInstructions[UI] = NI;
6768     MaybeDead.insert(UI);
6769     LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
6770     NI->insertBefore(InsertPoint);
6771     InsertPoint = NI;
6772     InsertedInsts.insert(NI);
6773 
6774     // Update the use for the new instruction, making sure that we update the
6775     // sunk instruction uses, if it is part of a chain that has already been
6776     // sunk.
6777     Instruction *OldI = cast<Instruction>(U->getUser());
6778     if (NewInstructions.count(OldI))
6779       NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
6780     else
6781       U->set(NI);
6782     Changed = true;
6783   }
6784 
6785   // Remove instructions that are dead after sinking.
6786   for (auto *I : MaybeDead) {
6787     if (!I->hasNUsesOrMore(1)) {
6788       LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
6789       I->eraseFromParent();
6790     }
6791   }
6792 
6793   return Changed;
6794 }
6795 
6796 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
6797   Value *Cond = SI->getCondition();
6798   Type *OldType = Cond->getType();
6799   LLVMContext &Context = Cond->getContext();
6800   MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
6801   unsigned RegWidth = RegType.getSizeInBits();
6802 
6803   if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
6804     return false;
6805 
6806   // If the register width is greater than the type width, expand the condition
6807   // of the switch instruction and each case constant to the width of the
6808   // register. By widening the type of the switch condition, subsequent
6809   // comparisons (for case comparisons) will not need to be extended to the
6810   // preferred register width, so we will potentially eliminate N-1 extends,
6811   // where N is the number of cases in the switch.
6812   auto *NewType = Type::getIntNTy(Context, RegWidth);
6813 
6814   // Zero-extend the switch condition and case constants unless the switch
6815   // condition is a function argument that is already being sign-extended.
6816   // In that case, we can avoid an unnecessary mask/extension by sign-extending
6817   // everything instead.
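  //
  // Illustrative sketch (hypothetical target with 32-bit registers):
  //   switch i8 %c, label %def [ i8 1, label %bb1 ]
  // becomes
  //   %c.wide = zext i8 %c to i32
  //   switch i32 %c.wide, label %def [ i32 1, label %bb1 ]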
6818   Instruction::CastOps ExtType = Instruction::ZExt;
6819   if (auto *Arg = dyn_cast<Argument>(Cond))
6820     if (Arg->hasSExtAttr())
6821       ExtType = Instruction::SExt;
6822 
6823   auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
6824   ExtInst->insertBefore(SI);
6825   ExtInst->setDebugLoc(SI->getDebugLoc());
6826   SI->setCondition(ExtInst);
6827   for (auto Case : SI->cases()) {
6828     APInt NarrowConst = Case.getCaseValue()->getValue();
6829     APInt WideConst = (ExtType == Instruction::ZExt) ?
6830                       NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
6831     Case.setValue(ConstantInt::get(Context, WideConst));
6832   }
6833 
6834   return true;
6835 }
6836 
6837 
6838 namespace {
6839 
6840 /// Helper class to promote a scalar operation to a vector one.
6841 /// This class is used to move downward extractelement transition.
6842 /// E.g.,
6843 /// a = vector_op <2 x i32>
6844 /// b = extractelement <2 x i32> a, i32 0
6845 /// c = scalar_op b
6846 /// store c
6847 ///
6848 /// =>
6849 /// a = vector_op <2 x i32>
6850 /// c = vector_op a (equivalent to scalar_op on the related lane)
6851 /// * d = extractelement <2 x i32> c, i32 0
6852 /// * store d
/// Assuming both the extractelement and the store can be combined, we get rid
/// of the transition.
6855 class VectorPromoteHelper {
6856   /// DataLayout associated with the current module.
6857   const DataLayout &DL;
6858 
6859   /// Used to perform some checks on the legality of vector operations.
6860   const TargetLowering &TLI;
6861 
  /// Used to estimate the cost of the promoted chain.
6863   const TargetTransformInfo &TTI;
6864 
6865   /// The transition being moved downwards.
6866   Instruction *Transition;
6867 
6868   /// The sequence of instructions to be promoted.
6869   SmallVector<Instruction *, 4> InstsToBePromoted;
6870 
6871   /// Cost of combining a store and an extract.
6872   unsigned StoreExtractCombineCost;
6873 
6874   /// Instruction that will be combined with the transition.
6875   Instruction *CombineInst = nullptr;
6876 
6877   /// The instruction that represents the current end of the transition.
6878   /// Since we are faking the promotion until we reach the end of the chain
6879   /// of computation, we need a way to get the current end of the transition.
6880   Instruction *getEndOfTransition() const {
6881     if (InstsToBePromoted.empty())
6882       return Transition;
6883     return InstsToBePromoted.back();
6884   }
6885 
6886   /// Return the index of the original value in the transition.
6887   /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
6888   /// c, is at index 0.
6889   unsigned getTransitionOriginalValueIdx() const {
6890     assert(isa<ExtractElementInst>(Transition) &&
6891            "Other kind of transitions are not supported yet");
6892     return 0;
6893   }
6894 
6895   /// Return the index of the index in the transition.
6896   /// E.g., for "extractelement <2 x i32> c, i32 0" the index
6897   /// is at index 1.
6898   unsigned getTransitionIdx() const {
6899     assert(isa<ExtractElementInst>(Transition) &&
6900            "Other kind of transitions are not supported yet");
6901     return 1;
6902   }
6903 
6904   /// Get the type of the transition.
6905   /// This is the type of the original value.
6906   /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
6907   /// transition is <2 x i32>.
6908   Type *getTransitionType() const {
6909     return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
6910   }
6911 
  /// Promote \p ToBePromoted by moving \p Def downward through it.
6913   /// I.e., we have the following sequence:
6914   /// Def = Transition <ty1> a to <ty2>
6915   /// b = ToBePromoted <ty2> Def, ...
6916   /// =>
6917   /// b = ToBePromoted <ty1> a, ...
6918   /// Def = Transition <ty1> ToBePromoted to <ty2>
6919   void promoteImpl(Instruction *ToBePromoted);
6920 
6921   /// Check whether or not it is profitable to promote all the
6922   /// instructions enqueued to be promoted.
6923   bool isProfitableToPromote() {
6924     Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
6925     unsigned Index = isa<ConstantInt>(ValIdx)
6926                          ? cast<ConstantInt>(ValIdx)->getZExtValue()
6927                          : -1;
6928     Type *PromotedType = getTransitionType();
6929 
6930     StoreInst *ST = cast<StoreInst>(CombineInst);
6931     unsigned AS = ST->getPointerAddressSpace();
6932     unsigned Align = ST->getAlignment();
6933     // Check if this store is supported.
6934     if (!TLI.allowsMisalignedMemoryAccesses(
6935             TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
6936             Align)) {
6937       // If this is not supported, there is no way we can combine
6938       // the extract with the store.
6939       return false;
6940     }
6941 
6942     // The scalar chain of computation has to pay for the transition
6943     // scalar to vector.
6944     // The vector chain has to account for the combining cost.
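    //
    // Worked example with hypothetical costs: if the extract costs 3,
    // StoreExtractCombineCost is 1, and one enqueued add costs 1 both as a
    // scalar and as a vector op, then ScalarCost = 3 + 1 = 4 and
    // VectorCost = 1 + 1 = 2, so promotion is considered profitable.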
6945     uint64_t ScalarCost =
6946         TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
6947     uint64_t VectorCost = StoreExtractCombineCost;
6948     enum TargetTransformInfo::TargetCostKind CostKind =
6949       TargetTransformInfo::TCK_RecipThroughput;
6950     for (const auto &Inst : InstsToBePromoted) {
6951       // Compute the cost.
6952       // By construction, all instructions being promoted are arithmetic ones.
6953       // Moreover, one argument is a constant that can be viewed as a splat
6954       // constant.
6955       Value *Arg0 = Inst->getOperand(0);
6956       bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
6957                             isa<ConstantFP>(Arg0);
6958       TargetTransformInfo::OperandValueKind Arg0OVK =
6959           IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
6960                          : TargetTransformInfo::OK_AnyValue;
6961       TargetTransformInfo::OperandValueKind Arg1OVK =
6962           !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
6963                           : TargetTransformInfo::OK_AnyValue;
6964       ScalarCost += TTI.getArithmeticInstrCost(
6965           Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK);
6966       VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
6967                                                CostKind,
6968                                                Arg0OVK, Arg1OVK);
6969     }
6970     LLVM_DEBUG(
6971         dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
6972                << ScalarCost << "\nVector: " << VectorCost << '\n');
6973     return ScalarCost > VectorCost;
6974   }
6975 
6976   /// Generate a constant vector with \p Val with the same
6977   /// number of elements as the transition.
6978   /// \p UseSplat defines whether or not \p Val should be replicated
6979   /// across the whole vector.
6980   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
  /// otherwise we generate a vector with as many undefs as possible:
6982   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
6983   /// used at the index of the extract.
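  ///
  /// E.g., for a <2 x i32> transition extracting index 1 and \p Val == 7:
  ///   UseSplat == true  -> <i32 7, i32 7>
  ///   UseSplat == false -> <i32 undef, i32 7>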
6984   Value *getConstantVector(Constant *Val, bool UseSplat) const {
6985     unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
6986     if (!UseSplat) {
6987       // If we cannot determine where the constant must be, we have to
6988       // use a splat constant.
6989       Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
6990       if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
6991         ExtractIdx = CstVal->getSExtValue();
6992       else
6993         UseSplat = true;
6994     }
6995 
6996     ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount();
6997     if (UseSplat)
6998       return ConstantVector::getSplat(EC, Val);
6999 
7000     if (!EC.isScalable()) {
7001       SmallVector<Constant *, 4> ConstVec;
7002       UndefValue *UndefVal = UndefValue::get(Val->getType());
7003       for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) {
7004         if (Idx == ExtractIdx)
7005           ConstVec.push_back(Val);
7006         else
7007           ConstVec.push_back(UndefVal);
7008       }
7009       return ConstantVector::get(ConstVec);
7010     } else
7011       llvm_unreachable(
7012           "Generate scalable vector for non-splat is unimplemented");
7013   }
7014 
  /// Check if promoting the operand at \p OperandIdx of \p Use to a vector
  /// type can trigger undefined behavior.
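  /// E.g., promoting the divisor of "udiv i32 %a, %d" to a vector would place
  /// undef in the lanes other than the extracted one, potentially creating a
  /// division by undef.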
7017   static bool canCauseUndefinedBehavior(const Instruction *Use,
7018                                         unsigned OperandIdx) {
    // It is not safe to introduce undef when the operand is on
    // the right-hand side of a division-like instruction.
7021     if (OperandIdx != 1)
7022       return false;
7023     switch (Use->getOpcode()) {
7024     default:
7025       return false;
7026     case Instruction::SDiv:
7027     case Instruction::UDiv:
7028     case Instruction::SRem:
7029     case Instruction::URem:
7030       return true;
7031     case Instruction::FDiv:
7032     case Instruction::FRem:
7033       return !Use->hasNoNaNs();
7034     }
7035     llvm_unreachable(nullptr);
7036   }
7037 
7038 public:
7039   VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
7040                       const TargetTransformInfo &TTI, Instruction *Transition,
7041                       unsigned CombineCost)
7042       : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
7043         StoreExtractCombineCost(CombineCost) {
7044     assert(Transition && "Do not know how to promote null");
7045   }
7046 
  /// Check if we can promote \p ToBePromoted to the transition type.
7048   bool canPromote(const Instruction *ToBePromoted) const {
7049     // We could support CastInst too.
7050     return isa<BinaryOperator>(ToBePromoted);
7051   }
7052 
  /// Check if it is profitable to promote \p ToBePromoted
  /// by moving the transition downward through it.
7055   bool shouldPromote(const Instruction *ToBePromoted) const {
7056     // Promote only if all the operands can be statically expanded.
7057     // Indeed, we do not want to introduce any new kind of transitions.
7058     for (const Use &U : ToBePromoted->operands()) {
7059       const Value *Val = U.get();
7060       if (Val == getEndOfTransition()) {
7061         // If the use is a division and the transition is on the rhs,
7062         // we cannot promote the operation, otherwise we may create a
7063         // division by zero.
7064         if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
7065           return false;
7066         continue;
7067       }
7068       if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
7069           !isa<ConstantFP>(Val))
7070         return false;
7071     }
7072     // Check that the resulting operation is legal.
7073     int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
7074     if (!ISDOpcode)
7075       return false;
7076     return StressStoreExtract ||
7077            TLI.isOperationLegalOrCustom(
7078                ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
7079   }
7080 
7081   /// Check whether or not \p Use can be combined
7082   /// with the transition.
7083   /// I.e., is it possible to do Use(Transition) => AnotherUse?
7084   bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
7085 
7086   /// Record \p ToBePromoted as part of the chain to be promoted.
7087   void enqueueForPromotion(Instruction *ToBePromoted) {
7088     InstsToBePromoted.push_back(ToBePromoted);
7089   }
7090 
7091   /// Set the instruction that will be combined with the transition.
7092   void recordCombineInstruction(Instruction *ToBeCombined) {
7093     assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
7094     CombineInst = ToBeCombined;
7095   }
7096 
  /// Promote all the instructions enqueued for promotion if it is
  /// profitable.
7099   /// \return True if the promotion happened, false otherwise.
7100   bool promote() {
7101     // Check if there is something to promote.
7102     // Right now, if we do not have anything to combine with,
7103     // we assume the promotion is not profitable.
7104     if (InstsToBePromoted.empty() || !CombineInst)
7105       return false;
7106 
7107     // Check cost.
7108     if (!StressStoreExtract && !isProfitableToPromote())
7109       return false;
7110 
7111     // Promote.
7112     for (auto &ToBePromoted : InstsToBePromoted)
7113       promoteImpl(ToBePromoted);
7114     InstsToBePromoted.clear();
7115     return true;
7116   }
7117 };
7118 
7119 } // end anonymous namespace
7120 
7121 void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
7122   // At this point, we know that all the operands of ToBePromoted but Def
7123   // can be statically promoted.
7124   // For Def, we need to use its parameter in ToBePromoted:
7125   // b = ToBePromoted ty1 a
7126   // Def = Transition ty1 b to ty2
7127   // Move the transition down.
7128   // 1. Replace all uses of the promoted operation by the transition.
7129   // = ... b => = ... Def.
7130   assert(ToBePromoted->getType() == Transition->getType() &&
7131          "The type of the result of the transition does not match "
7132          "the final type");
7133   ToBePromoted->replaceAllUsesWith(Transition);
7134   // 2. Update the type of the uses.
7135   // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
7136   Type *TransitionTy = getTransitionType();
7137   ToBePromoted->mutateType(TransitionTy);
7138   // 3. Update all the operands of the promoted operation with promoted
7139   // operands.
7140   // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
7141   for (Use &U : ToBePromoted->operands()) {
7142     Value *Val = U.get();
7143     Value *NewVal = nullptr;
7144     if (Val == Transition)
7145       NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
7146     else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
7147              isa<ConstantFP>(Val)) {
7148       // Use a splat constant if it is not safe to use undef.
7149       NewVal = getConstantVector(
7150           cast<Constant>(Val),
7151           isa<UndefValue>(Val) ||
7152               canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
7153     } else
7154       llvm_unreachable("Did you modified shouldPromote and forgot to update "
7155                        "this?");
7156     ToBePromoted->setOperand(U.getOperandNo(), NewVal);
7157   }
7158   Transition->moveAfter(ToBePromoted);
7159   Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
7160 }
7161 
7162 /// Some targets can do store(extractelement) with one instruction.
7163 /// Try to push the extractelement towards the stores when the target
7164 /// has this feature and this is profitable.
7165 bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
7166   unsigned CombineCost = std::numeric_limits<unsigned>::max();
7167   if (DisableStoreExtract ||
7168       (!StressStoreExtract &&
7169        !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
7170                                        Inst->getOperand(1), CombineCost)))
7171     return false;
7172 
7173   // At this point we know that Inst is a vector to scalar transition.
7174   // Try to move it down the def-use chain, until:
7175   // - We can combine the transition with its single use
7176   //   => we got rid of the transition.
7177   // - We escape the current basic block
7178   //   => we would need to check that we are moving it at a cheaper place and
7179   //      we do not do that for now.
7180   BasicBlock *Parent = Inst->getParent();
7181   LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
7182   VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
7183   // If the transition has more than one use, assume this is not going to be
7184   // beneficial.
7185   while (Inst->hasOneUse()) {
7186     Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
7187     LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
7188 
7189     if (ToBePromoted->getParent() != Parent) {
7190       LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
7191                         << ToBePromoted->getParent()->getName()
7192                         << ") than the transition (" << Parent->getName()
7193                         << ").\n");
7194       return false;
7195     }
7196 
7197     if (VPH.canCombine(ToBePromoted)) {
7198       LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
7199                         << "will be combined with: " << *ToBePromoted << '\n');
7200       VPH.recordCombineInstruction(ToBePromoted);
7201       bool Changed = VPH.promote();
7202       NumStoreExtractExposed += Changed;
7203       return Changed;
7204     }
7205 
7206     LLVM_DEBUG(dbgs() << "Try promoting.\n");
7207     if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
7208       return false;
7209 
7210     LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
7211 
7212     VPH.enqueueForPromotion(ToBePromoted);
7213     Inst = ToBePromoted;
7214   }
7215   return false;
7216 }
7217 
7218 /// For the instruction sequence of store below, F and I values
7219 /// are bundled together as an i64 value before being stored into memory.
7220 /// Sometimes it is more efficient to generate separate stores for F and I,
7221 /// which can remove the bitwise instructions or sink them to colder places.
7222 ///
7223 ///   (store (or (zext (bitcast F to i32) to i64),
7224 ///              (shl (zext I to i64), 32)), addr)  -->
7225 ///   (store F, addr) and (store I, addr+4)
7226 ///
/// Similarly, splitting for other merged stores can also be beneficial, like:
7228 /// For pair of {i32, i32}, i64 store --> two i32 stores.
7229 /// For pair of {i32, i16}, i64 store --> two i32 stores.
7230 /// For pair of {i16, i16}, i32 store --> two i16 stores.
7231 /// For pair of {i16, i8},  i32 store --> two i16 stores.
7232 /// For pair of {i8, i8},   i16 store --> two i8 stores.
7233 ///
7234 /// We allow each target to determine specifically which kind of splitting is
7235 /// supported.
7236 ///
/// The store patterns are commonly seen in the simple code snippet below
/// when only std::make_pair(...) is SROA-transformed before being inlined
/// into hoo.
7239 ///   void goo(const std::pair<int, float> &);
7240 ///   hoo() {
7241 ///     ...
7242 ///     goo(std::make_pair(tmp, ftmp));
7243 ///     ...
7244 ///   }
7245 ///
/// Although we already have similar splitting in DAG Combine, we duplicate
/// it in CodeGenPrepare to catch the case in which the pattern spans
/// multiple BBs. The logic in DAG Combine is kept to catch cases generated
/// during code expansion.
7250 static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
7251                                 const TargetLowering &TLI) {
7252   // Handle simple but common cases only.
7253   Type *StoreType = SI.getValueOperand()->getType();
7254 
7255   // The code below assumes shifting a value by <number of bits>,
7256   // whereas scalable vectors would have to be shifted by
7257   // <2log(vscale) + number of bits> in order to store the
7258   // low/high parts. Bailing out for now.
7259   if (isa<ScalableVectorType>(StoreType))
7260     return false;
7261 
7262   if (!DL.typeSizeEqualsStoreSize(StoreType) ||
7263       DL.getTypeSizeInBits(StoreType) == 0)
7264     return false;
7265 
7266   unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
7267   Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
7268   if (!DL.typeSizeEqualsStoreSize(SplitStoreType))
7269     return false;
7270 
7271   // Don't split the store if it is volatile.
7272   if (SI.isVolatile())
7273     return false;
7274 
  // Match the following patterns:
  // (store (or (zext LValue to i64),
  //            (shl (zext HValue to i64), HalfValBitSize)), addr)
  //  or
  // (store (or (shl (zext HValue to i64), HalfValBitSize),
  //            (zext LValue to i64)), addr)
  // Expect both operands of the OR and the first operand of the SHL to have
  // only one use.
7283   Value *LValue, *HValue;
7284   if (!match(SI.getValueOperand(),
7285              m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
7286                     m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
7287                                    m_SpecificInt(HalfValBitSize))))))
7288     return false;
7289 
  // Check that LValue and HValue are integers with size less than or equal to
  // HalfValBitSize.
7291   if (!LValue->getType()->isIntegerTy() ||
7292       DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
7293       !HValue->getType()->isIntegerTy() ||
7294       DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
7295     return false;
7296 
7297   // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
7298   // as the input of target query.
7299   auto *LBC = dyn_cast<BitCastInst>(LValue);
7300   auto *HBC = dyn_cast<BitCastInst>(HValue);
7301   EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
7302                   : EVT::getEVT(LValue->getType());
7303   EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
7304                    : EVT::getEVT(HValue->getType());
7305   if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
7306     return false;
7307 
7308   // Start to split store.
7309   IRBuilder<> Builder(SI.getContext());
7310   Builder.SetInsertPoint(&SI);
7311 
  // If LValue/HValue is a bitcast in another BB, create a new one in the
  // current BB so it may be merged with the split stores by the DAG combiner.
7314   if (LBC && LBC->getParent() != SI.getParent())
7315     LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
7316   if (HBC && HBC->getParent() != SI.getParent())
7317     HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
7318 
7319   bool IsLE = SI.getModule()->getDataLayout().isLittleEndian();
7320   auto CreateSplitStore = [&](Value *V, bool Upper) {
7321     V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
7322     Value *Addr = Builder.CreateBitCast(
7323         SI.getOperand(1),
7324         SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
7325     Align Alignment = SI.getAlign();
7326     const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
7327     if (IsOffsetStore) {
7328       Addr = Builder.CreateGEP(
7329           SplitStoreType, Addr,
7330           ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
7331 
7332       // When splitting the store in half, naturally one half will retain the
7333       // alignment of the original wider store, regardless of whether it was
7334       // over-aligned or not, while the other will require adjustment.
7335       Alignment = commonAlignment(Alignment, HalfValBitSize / 8);
7336     }
7337     Builder.CreateAlignedStore(V, Addr, Alignment);
7338   };
7339 
7340   CreateSplitStore(LValue, false);
7341   CreateSplitStore(HValue, true);
7342 
7343   // Delete the old store.
7344   SI.eraseFromParent();
7345   return true;
7346 }
7347 
7348 // Return true if the GEP has two operands, the first operand is of a sequential
7349 // type, and the second operand is a constant.
7350 static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
7351   gep_type_iterator I = gep_type_begin(*GEP);
7352   return GEP->getNumOperands() == 2 &&
7353       I.isSequential() &&
7354       isa<ConstantInt>(GEP->getOperand(1));
7355 }
7356 
7357 // Try unmerging GEPs to reduce liveness interference (register pressure) across
7358 // IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
7359 // reducing liveness interference across those edges benefits global register
7360 // allocation. Currently handles only certain cases.
7361 //
7362 // For example, unmerge %GEPI and %UGEPI as below.
7363 //
7364 // ---------- BEFORE ----------
7365 // SrcBlock:
7366 //   ...
7367 //   %GEPIOp = ...
7368 //   ...
7369 //   %GEPI = gep %GEPIOp, Idx
7370 //   ...
7371 //   indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
7372 //   (* %GEPI is alive on the indirectbr edges due to other uses ahead)
//   (* %GEPIOp is alive on the indirectbr edges only because it's used by
//   %UGEPI)
7375 //
7376 // DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
7377 // DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
7378 // ...
7379 //
7380 // DstBi:
7381 //   ...
7382 //   %UGEPI = gep %GEPIOp, UIdx
7383 // ...
7384 // ---------------------------
7385 //
7386 // ---------- AFTER ----------
7387 // SrcBlock:
7388 //   ... (same as above)
7389 //    (* %GEPI is still alive on the indirectbr edges)
7390 //    (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
7391 //    unmerging)
7392 // ...
7393 //
7394 // DstBi:
7395 //   ...
7396 //   %UGEPI = gep %GEPI, (UIdx-Idx)
7397 //   ...
7398 // ---------------------------
7399 //
7400 // The register pressure on the IndirectBr edges is reduced because %GEPIOp is
7401 // no longer alive on them.
7402 //
// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
// of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as
// not to disable further simplifications and optimizations as a result of GEP
// merging.
7407 //
7408 // Note this unmerging may increase the length of the data flow critical path
7409 // (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
7410 // between the register pressure and the length of data-flow critical
7411 // path. Restricting this to the uncommon IndirectBr case would minimize the
7412 // impact of potentially longer critical path, if any, and the impact on compile
7413 // time.
7414 static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
7415                                              const TargetTransformInfo *TTI) {
7416   BasicBlock *SrcBlock = GEPI->getParent();
7417   // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
7418   // (non-IndirectBr) cases exit early here.
7419   if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
7420     return false;
7421   // Check that GEPI is a simple gep with a single constant index.
7422   if (!GEPSequentialConstIndexed(GEPI))
7423     return false;
7424   ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
7425   // Check that GEPI is a cheap one.
7426   if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
7427                          TargetTransformInfo::TCK_SizeAndLatency)
7428       > TargetTransformInfo::TCC_Basic)
7429     return false;
7430   Value *GEPIOp = GEPI->getOperand(0);
7431   // Check that GEPIOp is an instruction that's also defined in SrcBlock.
7432   if (!isa<Instruction>(GEPIOp))
7433     return false;
7434   auto *GEPIOpI = cast<Instruction>(GEPIOp);
7435   if (GEPIOpI->getParent() != SrcBlock)
7436     return false;
7437   // Check that GEP is used outside the block, meaning it's alive on the
7438   // IndirectBr edge(s).
7439   if (find_if(GEPI->users(), [&](User *Usr) {
7440         if (auto *I = dyn_cast<Instruction>(Usr)) {
7441           if (I->getParent() != SrcBlock) {
7442             return true;
7443           }
7444         }
7445         return false;
7446       }) == GEPI->users().end())
7447     return false;
7448   // The second elements of the GEP chains to be unmerged.
7449   std::vector<GetElementPtrInst *> UGEPIs;
  // Check each user of GEPIOp to see if unmerging would make GEPIOp not alive
  // on IndirectBr edges.
7452   for (User *Usr : GEPIOp->users()) {
7453     if (Usr == GEPI) continue;
7454     // Check if Usr is an Instruction. If not, give up.
7455     if (!isa<Instruction>(Usr))
7456       return false;
7457     auto *UI = cast<Instruction>(Usr);
    // If Usr is in the same block as GEPIOp, that is fine; skip it.
7459     if (UI->getParent() == SrcBlock)
7460       continue;
7461     // Check if Usr is a GEP. If not, give up.
7462     if (!isa<GetElementPtrInst>(Usr))
7463       return false;
7464     auto *UGEPI = cast<GetElementPtrInst>(Usr);
7465     // Check if UGEPI is a simple gep with a single constant index and GEPIOp is
7466     // the pointer operand to it. If so, record it in the vector. If not, give
7467     // up.
7468     if (!GEPSequentialConstIndexed(UGEPI))
7469       return false;
7470     if (UGEPI->getOperand(0) != GEPIOp)
7471       return false;
7472     if (GEPIIdx->getType() !=
7473         cast<ConstantInt>(UGEPI->getOperand(1))->getType())
7474       return false;
7475     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7476     if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
7477                            TargetTransformInfo::TCK_SizeAndLatency)
7478         > TargetTransformInfo::TCC_Basic)
7479       return false;
7480     UGEPIs.push_back(UGEPI);
7481   }
7482   if (UGEPIs.size() == 0)
7483     return false;
7484   // Check the materializing cost of (Uidx-Idx).
7485   for (GetElementPtrInst *UGEPI : UGEPIs) {
7486     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7487     APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
7488     unsigned ImmCost =
7489       TTI->getIntImmCost(NewIdx, GEPIIdx->getType(),
7490                          TargetTransformInfo::TCK_SizeAndLatency);
7491     if (ImmCost > TargetTransformInfo::TCC_Basic)
7492       return false;
7493   }
7494   // Now unmerge between GEPI and UGEPIs.
7495   for (GetElementPtrInst *UGEPI : UGEPIs) {
7496     UGEPI->setOperand(0, GEPI);
7497     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7498     Constant *NewUGEPIIdx =
7499         ConstantInt::get(GEPIIdx->getType(),
7500                          UGEPIIdx->getValue() - GEPIIdx->getValue());
7501     UGEPI->setOperand(1, NewUGEPIIdx);
7502     // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
7503     // inbounds to avoid UB.
7504     if (!GEPI->isInBounds()) {
7505       UGEPI->setIsInBounds(false);
7506     }
7507   }
7508   // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
7509   // alive on IndirectBr edges).
7510   assert(find_if(GEPIOp->users(), [&](User *Usr) {
7511         return cast<Instruction>(Usr)->getParent() != SrcBlock;
7512       }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
7513   return true;
7514 }
7515 
7516 bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
7517   // Bail out if we inserted the instruction to prevent optimizations from
7518   // stepping on each other's toes.
7519   if (InsertedInsts.count(I))
7520     return false;
7521 
7522   // TODO: Move into the switch on opcode below here.
7523   if (PHINode *P = dyn_cast<PHINode>(I)) {
7524     // It is possible for very late stage optimizations (such as SimplifyCFG)
7525     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
7526     // trivial PHI, go ahead and zap it here.
7527     if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
7528       LargeOffsetGEPMap.erase(P);
7529       P->replaceAllUsesWith(V);
7530       P->eraseFromParent();
7531       ++NumPHIsElim;
7532       return true;
7533     }
7534     return false;
7535   }
7536 
7537   if (CastInst *CI = dyn_cast<CastInst>(I)) {
7538     // If the source of the cast is a constant, then this should have
7539     // already been constant folded.  The only reason NOT to constant fold
7540     // it is if something (e.g. LSR) was careful to place the constant
    // evaluation in a block other than the one that uses it (e.g. to hoist
7542     // the address of globals out of a loop).  If this is the case, we don't
7543     // want to forward-subst the cast.
7544     if (isa<Constant>(CI->getOperand(0)))
7545       return false;
7546 
7547     if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
7548       return true;
7549 
7550     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
7551       /// Sink a zext or sext into its user blocks if the target type doesn't
7552       /// fit in one register
7553       if (TLI->getTypeAction(CI->getContext(),
7554                              TLI->getValueType(*DL, CI->getType())) ==
7555           TargetLowering::TypeExpandInteger) {
7556         return SinkCast(CI);
7557       } else {
7558         bool MadeChange = optimizeExt(I);
7559         return MadeChange | optimizeExtUses(I);
7560       }
7561     }
7562     return false;
7563   }
7564 
7565   if (auto *Cmp = dyn_cast<CmpInst>(I))
7566     if (optimizeCmp(Cmp, ModifiedDT))
7567       return true;
7568 
7569   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
7570     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
7571     bool Modified = optimizeLoadExt(LI);
7572     unsigned AS = LI->getPointerAddressSpace();
7573     Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
7574     return Modified;
7575   }
7576 
7577   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
7578     if (splitMergedValStore(*SI, *DL, *TLI))
7579       return true;
7580     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
7581     unsigned AS = SI->getPointerAddressSpace();
7582     return optimizeMemoryInst(I, SI->getOperand(1),
7583                               SI->getOperand(0)->getType(), AS);
7584   }
7585 
  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
    unsigned AS = RMW->getPointerAddressSpace();
    return optimizeMemoryInst(I, RMW->getPointerOperand(),
                              RMW->getType(), AS);
  }

  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
    unsigned AS = CmpX->getPointerAddressSpace();
    return optimizeMemoryInst(I, CmpX->getPointerOperand(),
                              CmpX->getCompareOperand()->getType(), AS);
  }
7597 
7598   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
7599 
7600   if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking)
7601     return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
7602 
7603   // TODO: Move this into the switch on opcode - it handles shifts already.
7604   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
7605                 BinOp->getOpcode() == Instruction::LShr)) {
7606     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
7607     if (CI && TLI->hasExtractBitsInsn())
7608       if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
7609         return true;
7610   }
7611 
7612   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
7613     if (GEPI->hasAllZeroIndices()) {
7614       /// The GEP operand must be a pointer, so must its result -> BitCast
7615       Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
7616                                         GEPI->getName(), GEPI);
7617       NC->setDebugLoc(GEPI->getDebugLoc());
7618       GEPI->replaceAllUsesWith(NC);
7619       GEPI->eraseFromParent();
7620       ++NumGEPsElim;
7621       optimizeInst(NC, ModifiedDT);
7622       return true;
7623     }
7624     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
7625       return true;
7626     }
7627     return false;
7628   }
7629 
7630   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
    // freeze(icmp a, const) -> icmp (freeze a), const
7632     // This helps generate efficient conditional jumps.
7633     Instruction *CmpI = nullptr;
7634     if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
7635       CmpI = II;
7636     else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
7637       CmpI = F->getFastMathFlags().none() ? F : nullptr;
7638 
7639     if (CmpI && CmpI->hasOneUse()) {
7640       auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
7641       bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
7642                     isa<ConstantPointerNull>(Op0);
7643       bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
7644                     isa<ConstantPointerNull>(Op1);
7645       if (Const0 || Const1) {
7646         if (!Const0 || !Const1) {
7647           auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
7648           F->takeName(FI);
7649           CmpI->setOperand(Const0 ? 1 : 0, F);
7650         }
7651         FI->replaceAllUsesWith(CmpI);
7652         FI->eraseFromParent();
7653         return true;
7654       }
7655     }
7656     return false;
7657   }
7658 
7659   if (tryToSinkFreeOperands(I))
7660     return true;
7661 
7662   switch (I->getOpcode()) {
7663   case Instruction::Shl:
7664   case Instruction::LShr:
7665   case Instruction::AShr:
7666     return optimizeShiftInst(cast<BinaryOperator>(I));
7667   case Instruction::Call:
7668     return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
7669   case Instruction::Select:
7670     return optimizeSelectInst(cast<SelectInst>(I));
7671   case Instruction::ShuffleVector:
7672     return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
7673   case Instruction::Switch:
7674     return optimizeSwitchInst(cast<SwitchInst>(I));
7675   case Instruction::ExtractElement:
7676     return optimizeExtractElementInst(cast<ExtractElementInst>(I));
7677   }
7678 
7679   return false;
7680 }
7681 
7682 /// Given an OR instruction, check to see if this is a bitreverse
7683 /// idiom. If so, insert the new intrinsic and return true.
7684 bool CodeGenPrepare::makeBitReverse(Instruction &I) {
7685   if (!I.getType()->isIntegerTy() ||
7686       !TLI->isOperationLegalOrCustom(ISD::BITREVERSE,
7687                                      TLI->getValueType(*DL, I.getType(), true)))
7688     return false;
7689 
7690   SmallVector<Instruction*, 4> Insts;
7691   if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
7692     return false;
7693   Instruction *LastInst = Insts.back();
7694   I.replaceAllUsesWith(LastInst);
7695   RecursivelyDeleteTriviallyDeadInstructions(
7696       &I, TLInfo, nullptr, [&](Value *V) { removeAllAssertingVHReferences(V); });
7697   return true;
7698 }
7699 
7700 // In this pass we look for GEP and cast instructions that are used
7701 // across basic blocks and rewrite them to improve basic-block-at-a-time
7702 // selection.
7703 bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
7704   SunkAddrs.clear();
7705   bool MadeChange = false;
7706 
7707   CurInstIterator = BB.begin();
7708   while (CurInstIterator != BB.end()) {
7709     MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
7710     if (ModifiedDT)
7711       return true;
7712   }
7713 
7714   bool MadeBitReverse = true;
7715   while (MadeBitReverse) {
7716     MadeBitReverse = false;
7717     for (auto &I : reverse(BB)) {
7718       if (makeBitReverse(I)) {
7719         MadeBitReverse = MadeChange = true;
7720         break;
7721       }
7722     }
7723   }
7724   MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);
7725 
7726   return MadeChange;
7727 }
7728 
7729 // Some CGP optimizations may move or alter what's computed in a block. Check
7730 // whether a dbg.value intrinsic could be pointed at a more appropriate operand.
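//
// Illustrative sketch (names hypothetical): if an address %addr was sunk into
// this block as %sunkaddr, then
//   call void @llvm.dbg.value(metadata i32* %addr, ...)
// is updated to refer to %sunkaddr instead.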
7731 bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
7732   assert(isa<DbgValueInst>(I));
7733   DbgValueInst &DVI = *cast<DbgValueInst>(I);
7734 
7735   // Does this dbg.value refer to a sunk address calculation?
7736   Value *Location = DVI.getVariableLocation();
7737   WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
7738   Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
7739   if (SunkAddr) {
7740     // Point dbg.value at locally computed address, which should give the best
7741     // opportunity to be accurately lowered. This update may change the type of
7742     // pointer being referred to; however this makes no difference to debugging
7743     // information, and we can't generate bitcasts that may affect codegen.
7744     DVI.setOperand(0, MetadataAsValue::get(DVI.getContext(),
7745                                            ValueAsMetadata::get(SunkAddr)));
7746     return true;
7747   }
7748   return false;
7749 }
7750 
7751 // A llvm.dbg.value may be using a value before its definition, due to
7752 // optimizations in this pass and others. Scan for such dbg.values, and rescue
7753 // them by moving the dbg.value to immediately after the value definition.
7754 // FIXME: Ideally this should never be necessary, and this has the potential
7755 // to re-order dbg.value intrinsics.
7756 bool CodeGenPrepare::placeDbgValues(Function &F) {
7757   bool MadeChange = false;
7758   DominatorTree DT(F);
7759 
7760   for (BasicBlock &BB : F) {
7761     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
7762       Instruction *Insn = &*BI++;
7763       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
7764       if (!DVI)
7765         continue;
7766 
7767       Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
7768 
7769       if (!VI || VI->isTerminator())
7770         continue;
7771 
7772       // If VI is a phi in a block with an EHPad terminator, we can't insert
7773       // after it.
7774       if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
7775         continue;
7776 
7777       // If the defining instruction dominates the dbg.value, we do not need
7778       // to move the dbg.value.
7779       if (DT.dominates(VI, DVI))
7780         continue;
7781 
7782       LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
7783                         << *DVI << ' ' << *VI);
7784       DVI->removeFromParent();
7785       if (isa<PHINode>(VI))
7786         DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
7787       else
7788         DVI->insertAfter(VI);
7789       MadeChange = true;
7790       ++NumDbgValueMoved;
7791     }
7792   }
7793   return MadeChange;
7794 }
7795 
7796 /// Scale down both weights to fit into uint32_t.
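/// E.g. (hypothetical values), NewTrue = 6e9 and NewFalse = 2e9 give
/// Scale = 2, producing 3e9 and 1e9, both of which fit in uint32_t.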
7797 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
7798   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
7799   uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
7800   NewTrue = NewTrue / Scale;
7801   NewFalse = NewFalse / Scale;
7802 }
7803 
7804 /// Some targets prefer to split a conditional branch like:
7805 /// \code
7806 ///   %0 = icmp ne i32 %a, 0
7807 ///   %1 = icmp ne i32 %b, 0
7808 ///   %or.cond = or i1 %0, %1
7809 ///   br i1 %or.cond, label %TrueBB, label %FalseBB
7810 /// \endcode
7811 /// into multiple branch instructions like:
7812 /// \code
7813 ///   bb1:
7814 ///     %0 = icmp ne i32 %a, 0
7815 ///     br i1 %0, label %TrueBB, label %bb2
7816 ///   bb2:
7817 ///     %1 = icmp ne i32 %b, 0
7818 ///     br i1 %1, label %TrueBB, label %FalseBB
7819 /// \endcode
7820 /// This usually allows instruction selection to do even further optimizations
7821 /// and combine the compare with the branch instruction. Currently this is
7822 /// applied for targets which have "cheap" jump instructions.
7823 ///
7824 /// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
7825 ///
7826 bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
7827   if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
7828     return false;
7829 
7830   bool MadeChange = false;
7831   for (auto &BB : F) {
7832     // Does this BB end with the following?
7833     //   %cond1 = icmp|fcmp|binary instruction ...
7834     //   %cond2 = icmp|fcmp|binary instruction ...
7835     //   %cond.or = or|and i1 %cond1, cond2
    //   br i1 %cond.or, label %dest1, label %dest2
7837     BinaryOperator *LogicOp;
7838     BasicBlock *TBB, *FBB;
7839     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
7840       continue;
7841 
7842     auto *Br1 = cast<BranchInst>(BB.getTerminator());
7843     if (Br1->getMetadata(LLVMContext::MD_unpredictable))
7844       continue;
7845 
    // The merging of mostly empty BBs can cause a degenerate branch.
7847     if (TBB == FBB)
7848       continue;
7849 
7850     unsigned Opc;
7851     Value *Cond1, *Cond2;
7852     if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
7853                              m_OneUse(m_Value(Cond2)))))
7854       Opc = Instruction::And;
7855     else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
7856                                  m_OneUse(m_Value(Cond2)))))
7857       Opc = Instruction::Or;
7858     else
7859       continue;
7860 
7861     if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
7862         !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp()))   )
7863       continue;
7864 
7865     LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
7866 
7867     // Create a new BB.
7868     auto *TmpBB =
7869         BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
7870                            BB.getParent(), BB.getNextNode());
7871 
    // Update the original basic block to use the first condition directly in
    // the branch instruction and remove the no-longer-needed and/or
    // instruction.
7874     Br1->setCondition(Cond1);
7875     LogicOp->eraseFromParent();
7876 
7877     // Depending on the condition we have to either replace the true or the
7878     // false successor of the original branch instruction.
7879     if (Opc == Instruction::And)
7880       Br1->setSuccessor(0, TmpBB);
7881     else
7882       Br1->setSuccessor(1, TmpBB);
7883 
7884     // Fill in the new basic block.
7885     auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
7886     if (auto *I = dyn_cast<Instruction>(Cond2)) {
7887       I->removeFromParent();
7888       I->insertBefore(Br2);
7889     }
7890 
7891     // Update PHI nodes in both successors. The original BB needs to be
    // replaced in one successor's PHI nodes, because the branch now comes from
    // the newly generated BB (TmpBB). In the other successor we need to add one
7894     // incoming edge to the PHI nodes, because both branch instructions target
7895     // now the same successor. Depending on the original branch condition
7896     // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
7897     // we perform the correct update for the PHI nodes.
7898     // This doesn't change the successor order of the just created branch
7899     // instruction (or any other instruction).
7900     if (Opc == Instruction::Or)
7901       std::swap(TBB, FBB);
7902 
7903     // Replace the old BB with the new BB.
7904     TBB->replacePhiUsesWith(&BB, TmpBB);
7905 
    // Add another incoming edge from the new BB.
7907     for (PHINode &PN : FBB->phis()) {
7908       auto *Val = PN.getIncomingValueForBlock(&BB);
7909       PN.addIncoming(Val, TmpBB);
7910     }
7911 
7912     // Update the branch weights (from SelectionDAGBuilder::
7913     // FindMergedConditions).
7914     if (Opc == Instruction::Or) {
7915       // Codegen X | Y as:
7916       // BB1:
7917       //   jmp_if_X TBB
7918       //   jmp TmpBB
7919       // TmpBB:
7920       //   jmp_if_Y TBB
7921       //   jmp FBB
7922       //
7923 
7924       // We have flexibility in setting Prob for BB1 and Prob for NewBB.
7925       // The requirement is that
7926       //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
7927       //     = TrueProb for original BB.
7928       // Assuming the original weights are A and B, one choice is to set BB1's
7929       // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
7930       // assumes that
7931       //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
      // Another choice is to assume TrueProb for BB1 equals TrueProb for
      // TmpBB, but the math is more complicated.
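      //
      // Worked example with hypothetical weights A = 1, B = 3: BB1 gets
      // weights (1, 1 + 2*3) = (1, 7) and TmpBB gets (1, 2*3) = (1, 6), so
      //   TrueProb(BB1) + FalseProb(BB1) * TrueProb(TmpBB)
      //     = 1/8 + 7/8 * 1/7 = 1/4 = TrueProb of the original BB.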
7934       uint64_t TrueWeight, FalseWeight;
7935       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
7936         uint64_t NewTrueWeight = TrueWeight;
7937         uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
7938         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7941 
7942         NewTrueWeight = TrueWeight;
7943         NewFalseWeight = 2 * FalseWeight;
7944         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7947       }
7948     } else {
7949       // Codegen X & Y as:
7950       // BB1:
7951       //   jmp_if_X TmpBB
7952       //   jmp FBB
7953       // TmpBB:
7954       //   jmp_if_Y TBB
7955       //   jmp FBB
7956       //
7957       //  This requires creation of TmpBB after CurBB.
7958 
7959       // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
7960       // The requirement is that
7961       //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
7962       //     = FalseProb for original BB.
7963       // Assuming the original weights are A and B, one choice is to set BB1's
7964       // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
7965       // assumes that
7966       //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
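      //
      // Worked example with hypothetical weights A = 1, B = 3: BB1 gets
      // weights (2*1 + 3, 3) = (5, 3) and TmpBB gets (2*1, 3) = (2, 3), so
      //   FalseProb(BB1) + TrueProb(BB1) * FalseProb(TmpBB)
      //     = 3/8 + 5/8 * 3/5 = 3/4 = FalseProb of the original BB.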
7967       uint64_t TrueWeight, FalseWeight;
7968       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
7969         uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
7970         uint64_t NewFalseWeight = FalseWeight;
7971         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7974 
7975         NewTrueWeight = 2 * TrueWeight;
7976         NewFalseWeight = FalseWeight;
7977         scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
7980       }
7981     }
7982 
7983     ModifiedDT = true;
7984     MadeChange = true;
7985 
7986     LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
7987                TmpBB->dump());
7988   }
7989   return MadeChange;
7990 }
7991