1 //===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
11 // basic-block-at-a-time approach. It should eventually be removed.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/APInt.h"
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/MapVector.h"
19 #include "llvm/ADT/PointerIntPair.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/SmallPtrSet.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/BlockFrequencyInfo.h"
25 #include "llvm/Analysis/BranchProbabilityInfo.h"
26 #include "llvm/Analysis/ConstantFolding.h"
27 #include "llvm/Analysis/InstructionSimplify.h"
28 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/Analysis/MemoryBuiltins.h"
30 #include "llvm/Analysis/ProfileSummaryInfo.h"
31 #include "llvm/Analysis/TargetLibraryInfo.h"
32 #include "llvm/Analysis/TargetTransformInfo.h"
33 #include "llvm/Analysis/ValueTracking.h"
34 #include "llvm/Analysis/VectorUtils.h"
35 #include "llvm/CodeGen/Analysis.h"
36 #include "llvm/CodeGen/ISDOpcodes.h"
37 #include "llvm/CodeGen/SelectionDAGNodes.h"
38 #include "llvm/CodeGen/TargetLowering.h"
39 #include "llvm/CodeGen/TargetPassConfig.h"
40 #include "llvm/CodeGen/TargetSubtargetInfo.h"
41 #include "llvm/CodeGen/ValueTypes.h"
42 #include "llvm/Config/llvm-config.h"
43 #include "llvm/IR/Argument.h"
44 #include "llvm/IR/Attributes.h"
45 #include "llvm/IR/BasicBlock.h"
46 #include "llvm/IR/Constant.h"
47 #include "llvm/IR/Constants.h"
48 #include "llvm/IR/DataLayout.h"
49 #include "llvm/IR/DerivedTypes.h"
50 #include "llvm/IR/Dominators.h"
51 #include "llvm/IR/Function.h"
52 #include "llvm/IR/GetElementPtrTypeIterator.h"
53 #include "llvm/IR/GlobalValue.h"
54 #include "llvm/IR/GlobalVariable.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/InlineAsm.h"
57 #include "llvm/IR/InstrTypes.h"
58 #include "llvm/IR/Instruction.h"
59 #include "llvm/IR/Instructions.h"
60 #include "llvm/IR/IntrinsicInst.h"
61 #include "llvm/IR/Intrinsics.h"
62 #include "llvm/IR/IntrinsicsAArch64.h"
63 #include "llvm/IR/LLVMContext.h"
64 #include "llvm/IR/MDBuilder.h"
65 #include "llvm/IR/Module.h"
66 #include "llvm/IR/Operator.h"
67 #include "llvm/IR/PatternMatch.h"
68 #include "llvm/IR/Statepoint.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/IR/Use.h"
71 #include "llvm/IR/User.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/IR/ValueHandle.h"
74 #include "llvm/IR/ValueMap.h"
75 #include "llvm/InitializePasses.h"
76 #include "llvm/Pass.h"
77 #include "llvm/Support/BlockFrequency.h"
78 #include "llvm/Support/BranchProbability.h"
79 #include "llvm/Support/Casting.h"
80 #include "llvm/Support/CommandLine.h"
81 #include "llvm/Support/Compiler.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/ErrorHandling.h"
84 #include "llvm/Support/MachineValueType.h"
85 #include "llvm/Support/MathExtras.h"
86 #include "llvm/Support/raw_ostream.h"
87 #include "llvm/Target/TargetMachine.h"
88 #include "llvm/Target/TargetOptions.h"
89 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
90 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
91 #include "llvm/Transforms/Utils/Local.h"
92 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
93 #include "llvm/Transforms/Utils/SizeOpts.h"
94 #include <algorithm>
95 #include <cassert>
96 #include <cstdint>
97 #include <iterator>
98 #include <limits>
99 #include <memory>
100 #include <utility>
101 #include <vector>
102 
103 using namespace llvm;
104 using namespace llvm::PatternMatch;
105 
106 #define DEBUG_TYPE "codegenprepare"
107 
108 STATISTIC(NumBlocksElim, "Number of blocks eliminated");
109 STATISTIC(NumPHIsElim,   "Number of trivial PHIs eliminated");
110 STATISTIC(NumGEPsElim,   "Number of GEPs converted to casts");
111 STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
112                       "sunken Cmps");
113 STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
114                        "of sunken Casts");
115 STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
116                           "computations were sunk");
117 STATISTIC(NumMemoryInstsPhiCreated,
118           "Number of phis created when address "
119           "computations were sunk to memory instructions");
120 STATISTIC(NumMemoryInstsSelectCreated,
          "Number of selects created when address "
122           "computations were sunk to memory instructions");
123 STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
124 STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
125 STATISTIC(NumAndsAdded,
126           "Number of and mask instructions added to form ext loads");
127 STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
128 STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
129 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
130 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
131 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
132 
133 static cl::opt<bool> DisableBranchOpts(
134   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
135   cl::desc("Disable branch optimizations in CodeGenPrepare"));
136 
137 static cl::opt<bool>
138     DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
139                   cl::desc("Disable GC optimizations in CodeGenPrepare"));
140 
141 static cl::opt<bool> DisableSelectToBranch(
142   "disable-cgp-select2branch", cl::Hidden, cl::init(false),
143   cl::desc("Disable select to branch conversion."));
144 
145 static cl::opt<bool> AddrSinkUsingGEPs(
146   "addr-sink-using-gep", cl::Hidden, cl::init(true),
147   cl::desc("Address sinking in CGP using GEPs."));
148 
149 static cl::opt<bool> EnableAndCmpSinking(
150    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
   cl::desc("Enable sinking and/cmp into branches."));
152 
153 static cl::opt<bool> DisableStoreExtract(
154     "disable-cgp-store-extract", cl::Hidden, cl::init(false),
155     cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
156 
157 static cl::opt<bool> StressStoreExtract(
158     "stress-cgp-store-extract", cl::Hidden, cl::init(false),
159     cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
160 
161 static cl::opt<bool> DisableExtLdPromotion(
162     "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
163     cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
164              "CodeGenPrepare"));
165 
166 static cl::opt<bool> StressExtLdPromotion(
167     "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
168     cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
169              "optimization in CodeGenPrepare"));
170 
171 static cl::opt<bool> DisablePreheaderProtect(
172     "disable-preheader-prot", cl::Hidden, cl::init(false),
173     cl::desc("Disable protection against removing loop preheaders"));
174 
175 static cl::opt<bool> ProfileGuidedSectionPrefix(
176     "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
177     cl::desc("Use profile info to add section prefix for hot/cold functions"));
178 
179 static cl::opt<bool> ProfileUnknownInSpecialSection(
180     "profile-unknown-in-special-section", cl::Hidden, cl::init(false),
181     cl::ZeroOrMore,
    cl::desc("In profiling modes like SampleFDO, a function without a profile "
             "is not necessarily cold; it may simply be new code that has "
             "never been sampled. With this flag enabled, the compiler can "
             "put such profile-unknown functions into a special section, so "
             "the runtime can handle them differently from the regular "
             ".text section, for example to save RAM."));
189 
190 static cl::opt<unsigned> FreqRatioToSkipMerge(
191     "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
192     cl::desc("Skip merging empty blocks if (frequency of empty block) / "
193              "(frequency of destination block) is greater than this ratio"));
194 
195 static cl::opt<bool> ForceSplitStore(
196     "force-split-store", cl::Hidden, cl::init(false),
197     cl::desc("Force store splitting no matter what the target query says."));
198 
199 static cl::opt<bool>
200 EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
    cl::desc("Enable merging of redundant sexts when one dominates"
    " the other."), cl::init(true));
203 
204 static cl::opt<bool> DisableComplexAddrModes(
205     "disable-complex-addr-modes", cl::Hidden, cl::init(false),
206     cl::desc("Disables combining addressing modes with different parts "
207              "in optimizeMemoryInst."));
208 
209 static cl::opt<bool>
210 AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
211                 cl::desc("Allow creation of Phis in Address sinking."));
212 
213 static cl::opt<bool>
214 AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true),
215                    cl::desc("Allow creation of selects in Address sinking."));
216 
217 static cl::opt<bool> AddrSinkCombineBaseReg(
218     "addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
219     cl::desc("Allow combining of BaseReg field in Address sinking."));
220 
221 static cl::opt<bool> AddrSinkCombineBaseGV(
222     "addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
223     cl::desc("Allow combining of BaseGV field in Address sinking."));
224 
225 static cl::opt<bool> AddrSinkCombineBaseOffs(
226     "addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
227     cl::desc("Allow combining of BaseOffs field in Address sinking."));
228 
229 static cl::opt<bool> AddrSinkCombineScaledReg(
230     "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
231     cl::desc("Allow combining of ScaledReg field in Address sinking."));
232 
233 static cl::opt<bool>
234     EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
235                          cl::init(true),
                         cl::desc("Enable splitting GEPs with large offsets."));
237 
238 static cl::opt<bool> EnableICMP_EQToICMP_ST(
239     "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
240     cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));
241 
242 static cl::opt<bool>
243     VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
244                      cl::desc("Enable BFI update verification for "
245                               "CodeGenPrepare."));
246 
247 static cl::opt<bool> OptimizePhiTypes(
248     "cgp-optimize-phi-types", cl::Hidden, cl::init(false),
249     cl::desc("Enable converting phi types in CodeGenPrepare"));
250 
251 namespace {
252 
253 enum ExtType {
254   ZeroExtension,   // Zero extension has been seen.
255   SignExtension,   // Sign extension has been seen.
256   BothExtension    // This extension type is used if we saw sext after
257                    // ZeroExtension had been set, or if we saw zext after
258                    // SignExtension had been set. It makes the type
259                    // information of a promoted instruction invalid.
260 };
261 
262 using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
263 using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
264 using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
265 using SExts = SmallVector<Instruction *, 16>;
266 using ValueToSExts = DenseMap<Value *, SExts>;
267 
268 class TypePromotionTransaction;
269 
270   class CodeGenPrepare : public FunctionPass {
271     const TargetMachine *TM = nullptr;
272     const TargetSubtargetInfo *SubtargetInfo;
273     const TargetLowering *TLI = nullptr;
274     const TargetRegisterInfo *TRI;
275     const TargetTransformInfo *TTI = nullptr;
276     const TargetLibraryInfo *TLInfo;
277     const LoopInfo *LI;
278     std::unique_ptr<BlockFrequencyInfo> BFI;
279     std::unique_ptr<BranchProbabilityInfo> BPI;
280     ProfileSummaryInfo *PSI;
281 
282     /// As we scan instructions optimizing them, this is the next instruction
283     /// to optimize. Transforms that can invalidate this should update it.
284     BasicBlock::iterator CurInstIterator;
285 
286     /// Keeps track of non-local addresses that have been sunk into a block.
287     /// This allows us to avoid inserting duplicate code for blocks with
288     /// multiple load/stores of the same address. The usage of WeakTrackingVH
289     /// enables SunkAddrs to be treated as a cache whose entries can be
290     /// invalidated if a sunken address computation has been erased.
291     ValueMap<Value*, WeakTrackingVH> SunkAddrs;
292 
293     /// Keeps track of all instructions inserted for the current function.
294     SetOfInstrs InsertedInsts;
295 
    /// Keeps track of each instruction's original type before its promotion
    /// for the current function.
298     InstrToOrigTy PromotedInsts;
299 
300     /// Keep track of instructions removed during promotion.
301     SetOfInstrs RemovedInsts;
302 
303     /// Keep track of sext chains based on their initial value.
304     DenseMap<Value *, Instruction *> SeenChainsForSExt;
305 
306     /// Keep track of GEPs accessing the same data structures such as structs or
307     /// arrays that are candidates to be split later because of their large
308     /// size.
309     MapVector<
310         AssertingVH<Value>,
311         SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
312         LargeOffsetGEPMap;
313 
    /// Keep track of new GEP bases after splitting GEPs with large offsets.
315     SmallSet<AssertingVH<Value>, 2> NewGEPBases;
316 
    /// Map large-offset GEPs to their serial numbers.
318     DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;
319 
    /// Keep track of promoted SExt instructions.
321     ValueToSExts ValToSExtendedUses;
322 
323     /// True if the function has the OptSize attribute.
324     bool OptSize;
325 
326     /// DataLayout for the Function being processed.
327     const DataLayout *DL = nullptr;
328 
329     /// Building the dominator tree can be expensive, so we only build it
330     /// lazily and update it when required.
331     std::unique_ptr<DominatorTree> DT;
332 
333   public:
334     static char ID; // Pass identification, replacement for typeid
335 
336     CodeGenPrepare() : FunctionPass(ID) {
337       initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
338     }
339 
340     bool runOnFunction(Function &F) override;
341 
342     StringRef getPassName() const override { return "CodeGen Prepare"; }
343 
344     void getAnalysisUsage(AnalysisUsage &AU) const override {
345       // FIXME: When we can selectively preserve passes, preserve the domtree.
346       AU.addRequired<ProfileSummaryInfoWrapperPass>();
347       AU.addRequired<TargetLibraryInfoWrapperPass>();
348       AU.addRequired<TargetPassConfig>();
349       AU.addRequired<TargetTransformInfoWrapperPass>();
350       AU.addRequired<LoopInfoWrapperPass>();
351     }
352 
353   private:
354     template <typename F>
355     void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
356       // Substituting can cause recursive simplifications, which can invalidate
357       // our iterator.  Use a WeakTrackingVH to hold onto it in case this
358       // happens.
359       Value *CurValue = &*CurInstIterator;
360       WeakTrackingVH IterHandle(CurValue);
361 
362       f();
363 
364       // If the iterator instruction was recursively deleted, start over at the
365       // start of the block.
366       if (IterHandle != CurValue) {
367         CurInstIterator = BB->begin();
368         SunkAddrs.clear();
369       }
370     }
371 
    // Get the DominatorTree, building it if necessary.
373     DominatorTree &getDT(Function &F) {
374       if (!DT)
375         DT = std::make_unique<DominatorTree>(F);
376       return *DT;
377     }
378 
379     void removeAllAssertingVHReferences(Value *V);
380     bool eliminateFallThrough(Function &F);
381     bool eliminateMostlyEmptyBlocks(Function &F);
382     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
383     bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
384     void eliminateMostlyEmptyBlock(BasicBlock *BB);
385     bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
386                                        bool isPreheader);
387     bool makeBitReverse(Instruction &I);
388     bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
389     bool optimizeInst(Instruction *I, bool &ModifiedDT);
390     bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
391                             Type *AccessTy, unsigned AddrSpace);
392     bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
393     bool optimizeInlineAsmInst(CallInst *CS);
394     bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
395     bool optimizeExt(Instruction *&I);
396     bool optimizeExtUses(Instruction *I);
397     bool optimizeLoadExt(LoadInst *Load);
398     bool optimizeShiftInst(BinaryOperator *BO);
399     bool optimizeFunnelShift(IntrinsicInst *Fsh);
400     bool optimizeSelectInst(SelectInst *SI);
401     bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
402     bool optimizeSwitchInst(SwitchInst *SI);
403     bool optimizeExtractElementInst(Instruction *Inst);
404     bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
405     bool fixupDbgValue(Instruction *I);
406     bool placeDbgValues(Function &F);
407     bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
408                       LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
409     bool tryToPromoteExts(TypePromotionTransaction &TPT,
410                           const SmallVectorImpl<Instruction *> &Exts,
411                           SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
412                           unsigned CreatedInstsCost = 0);
413     bool mergeSExts(Function &F);
414     bool splitLargeGEPOffsets();
415     bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
416                          SmallPtrSetImpl<Instruction *> &DeletedInstrs);
417     bool optimizePhiTypes(Function &F);
418     bool performAddressTypePromotion(
419         Instruction *&Inst,
420         bool AllowPromotionWithoutCommonHeader,
421         bool HasPromoted, TypePromotionTransaction &TPT,
422         SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
423     bool splitBranchCondition(Function &F, bool &ModifiedDT);
424     bool simplifyOffsetableRelocate(GCStatepointInst &I);
425 
426     bool tryToSinkFreeOperands(Instruction *I);
427     bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0,
428                                      Value *Arg1, CmpInst *Cmp,
429                                      Intrinsic::ID IID);
430     bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
431     bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
432     bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
433     void verifyBFIUpdates(Function &F);
434   };
435 
436 } // end anonymous namespace
437 
438 char CodeGenPrepare::ID = 0;
439 
440 INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
441                       "Optimize for code generation", false, false)
442 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
443 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
444 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
445 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
446 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
447 INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
448                     "Optimize for code generation", false, false)
449 
450 FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
451 
452 bool CodeGenPrepare::runOnFunction(Function &F) {
453   if (skipFunction(F))
454     return false;
455 
456   DL = &F.getParent()->getDataLayout();
457 
458   bool EverMadeChange = false;
459   // Clear per function information.
460   InsertedInsts.clear();
461   PromotedInsts.clear();
462 
463   TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
464   SubtargetInfo = TM->getSubtargetImpl(F);
465   TLI = SubtargetInfo->getTargetLowering();
466   TRI = SubtargetInfo->getRegisterInfo();
467   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
468   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
469   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
470   BPI.reset(new BranchProbabilityInfo(F, *LI));
471   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
472   PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
473   OptSize = F.hasOptSize();
474   if (ProfileGuidedSectionPrefix) {
475     if (PSI->isFunctionHotInCallGraph(&F, *BFI))
476       F.setSectionPrefix("hot");
477     else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
478       F.setSectionPrefix("unlikely");
479     else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
480              PSI->isFunctionHotnessUnknown(F))
481       F.setSectionPrefix("unknown");
482   }
483 
484   /// This optimization identifies DIV instructions that can be
485   /// profitably bypassed and carried out with a shorter, faster divide.
486   if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
487     const DenseMap<unsigned int, unsigned int> &BypassWidths =
488         TLI->getBypassSlowDivWidths();
489     BasicBlock* BB = &*F.begin();
490     while (BB != nullptr) {
491       // bypassSlowDivision may create new BBs, but we don't want to reapply the
492       // optimization to those blocks.
493       BasicBlock* Next = BB->getNextNode();
494       // F.hasOptSize is already checked in the outer if statement.
495       if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
496         EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
497       BB = Next;
498     }
499   }
500 
501   // Eliminate blocks that contain only PHI nodes and an
502   // unconditional branch.
503   EverMadeChange |= eliminateMostlyEmptyBlocks(F);
504 
505   bool ModifiedDT = false;
506   if (!DisableBranchOpts)
507     EverMadeChange |= splitBranchCondition(F, ModifiedDT);
508 
509   // Split some critical edges where one of the sources is an indirect branch,
510   // to help generate sane code for PHIs involving such edges.
511   EverMadeChange |= SplitIndirectBrCriticalEdges(F);
512 
513   bool MadeChange = true;
514   while (MadeChange) {
515     MadeChange = false;
516     DT.reset();
517     for (Function::iterator I = F.begin(); I != F.end(); ) {
518       BasicBlock *BB = &*I++;
519       bool ModifiedDTOnIteration = false;
520       MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
521 
522       // Restart BB iteration if the dominator tree of the Function was changed
523       if (ModifiedDTOnIteration)
524         break;
525     }
526     if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
527       MadeChange |= mergeSExts(F);
528     if (!LargeOffsetGEPMap.empty())
529       MadeChange |= splitLargeGEPOffsets();
530     MadeChange |= optimizePhiTypes(F);
531 
532     if (MadeChange)
533       eliminateFallThrough(F);
534 
535     // Really free removed instructions during promotion.
536     for (Instruction *I : RemovedInsts)
537       I->deleteValue();
538 
539     EverMadeChange |= MadeChange;
540     SeenChainsForSExt.clear();
541     ValToSExtendedUses.clear();
542     RemovedInsts.clear();
543     LargeOffsetGEPMap.clear();
544     LargeOffsetGEPID.clear();
545   }
546 
547   NewGEPBases.clear();
548   SunkAddrs.clear();
549 
550   if (!DisableBranchOpts) {
551     MadeChange = false;
552     // Use a set vector to get deterministic iteration order. The order the
553     // blocks are removed may affect whether or not PHI nodes in successors
554     // are removed.
555     SmallSetVector<BasicBlock*, 8> WorkList;
556     for (BasicBlock &BB : F) {
557       SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
558       MadeChange |= ConstantFoldTerminator(&BB, true);
559       if (!MadeChange) continue;
560 
561       for (SmallVectorImpl<BasicBlock*>::iterator
562              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
563         if (pred_empty(*II))
564           WorkList.insert(*II);
565     }
566 
567     // Delete the dead blocks and any of their dead successors.
568     MadeChange |= !WorkList.empty();
569     while (!WorkList.empty()) {
570       BasicBlock *BB = WorkList.pop_back_val();
571       SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
572 
573       DeleteDeadBlock(BB);
574 
575       for (SmallVectorImpl<BasicBlock*>::iterator
576              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
577         if (pred_empty(*II))
578           WorkList.insert(*II);
579     }
580 
581     // Merge pairs of basic blocks with unconditional branches, connected by
582     // a single edge.
583     if (EverMadeChange || MadeChange)
584       MadeChange |= eliminateFallThrough(F);
585 
586     EverMadeChange |= MadeChange;
587   }
588 
589   if (!DisableGCOpts) {
590     SmallVector<GCStatepointInst *, 2> Statepoints;
591     for (BasicBlock &BB : F)
592       for (Instruction &I : BB)
593         if (auto *SP = dyn_cast<GCStatepointInst>(&I))
594           Statepoints.push_back(SP);
595     for (auto &I : Statepoints)
596       EverMadeChange |= simplifyOffsetableRelocate(*I);
597   }
598 
599   // Do this last to clean up use-before-def scenarios introduced by other
600   // preparatory transforms.
601   EverMadeChange |= placeDbgValues(F);
602 
603 #ifndef NDEBUG
604   if (VerifyBFIUpdates)
605     verifyBFIUpdates(F);
606 #endif
607 
608   return EverMadeChange;
609 }
610 
611 /// An instruction is about to be deleted, so remove all references to it in our
/// GEP-tracking data structures.
613 void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {
614   LargeOffsetGEPMap.erase(V);
615   NewGEPBases.erase(V);
616 
617   auto GEP = dyn_cast<GetElementPtrInst>(V);
618   if (!GEP)
619     return;
620 
621   LargeOffsetGEPID.erase(GEP);
622 
623   auto VecI = LargeOffsetGEPMap.find(GEP->getPointerOperand());
624   if (VecI == LargeOffsetGEPMap.end())
625     return;
626 
627   auto &GEPVector = VecI->second;
628   const auto &I = std::find_if(GEPVector.begin(), GEPVector.end(),
629                                [=](auto &Elt) { return Elt.first == GEP; });
630   if (I == GEPVector.end())
631     return;
632 
633   GEPVector.erase(I);
634   if (GEPVector.empty())
635     LargeOffsetGEPMap.erase(VecI);
636 }
637 
638 // Verify BFI has been updated correctly by recomputing BFI and comparing them.
639 void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
640   DominatorTree NewDT(F);
641   LoopInfo NewLI(NewDT);
642   BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
643   BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
644   NewBFI.verifyMatch(*BFI);
645 }
646 
647 /// Merge basic blocks which are connected by a single edge, where one of the
648 /// basic blocks has a single successor pointing to the other basic block,
649 /// which has a single predecessor.
650 bool CodeGenPrepare::eliminateFallThrough(Function &F) {
651   bool Changed = false;
652   // Scan all of the blocks in the function, except for the entry block.
  // Use a temporary array to avoid the iterator being invalidated when
654   // deleting blocks.
655   SmallVector<WeakTrackingVH, 16> Blocks;
656   for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
657     Blocks.push_back(&Block);
658 
659   SmallSet<WeakTrackingVH, 16> Preds;
660   for (auto &Block : Blocks) {
661     auto *BB = cast_or_null<BasicBlock>(Block);
662     if (!BB)
663       continue;
664     // If the destination block has a single pred, then this is a trivial
665     // edge, just collapse it.
666     BasicBlock *SinglePred = BB->getSinglePredecessor();
667 
668     // Don't merge if BB's address is taken.
669     if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
670 
671     BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
672     if (Term && !Term->isConditional()) {
673       Changed = true;
674       LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");
675 
676       // Merge BB into SinglePred and delete it.
677       MergeBlockIntoPredecessor(BB);
678       Preds.insert(SinglePred);
679     }
680   }
681 
682   // (Repeatedly) merging blocks into their predecessors can create redundant
683   // debug intrinsics.
684   for (auto &Pred : Preds)
685     if (auto *BB = cast_or_null<BasicBlock>(Pred))
686       RemoveRedundantDbgInstrs(BB);
687 
688   return Changed;
689 }
690 
/// Find a destination block from BB if BB is a mergeable empty block.
692 BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
693   // If this block doesn't end with an uncond branch, ignore it.
694   BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
695   if (!BI || !BI->isUnconditional())
696     return nullptr;
697 
698   // If the instruction before the branch (skipping debug info) isn't a phi
699   // node, then other stuff is happening here.
700   BasicBlock::iterator BBI = BI->getIterator();
701   if (BBI != BB->begin()) {
702     --BBI;
703     while (isa<DbgInfoIntrinsic>(BBI)) {
704       if (BBI == BB->begin())
705         break;
706       --BBI;
707     }
708     if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
709       return nullptr;
710   }
711 
712   // Do not break infinite loops.
713   BasicBlock *DestBB = BI->getSuccessor(0);
714   if (DestBB == BB)
715     return nullptr;
716 
717   if (!canMergeBlocks(BB, DestBB))
718     DestBB = nullptr;
719 
720   return DestBB;
721 }
722 
723 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
724 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
725 /// edges in ways that are non-optimal for isel. Start by eliminating these
726 /// blocks so we can split them the way we want them.
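///
/// For illustration, a sketch of the per-block transform (block and value
/// names here are hypothetical, not taken from real IR):
///
///   bb:                                  ; preds = %a, %b
///     %p = phi i32 [ 0, %a ], [ 1, %b ]
///     br label %dest
///   dest:
///     %q = phi i32 [ %p, %bb ], [ %x, %c ]
///
/// After eliminating %bb, %dest's PHI takes the incoming values directly:
///
///   dest:
///     %q = phi i32 [ 0, %a ], [ 1, %b ], [ %x, %c ]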
727 bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
728   SmallPtrSet<BasicBlock *, 16> Preheaders;
729   SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
730   while (!LoopList.empty()) {
731     Loop *L = LoopList.pop_back_val();
732     LoopList.insert(LoopList.end(), L->begin(), L->end());
733     if (BasicBlock *Preheader = L->getLoopPreheader())
734       Preheaders.insert(Preheader);
735   }
736 
737   bool MadeChange = false;
738   // Copy blocks into a temporary array to avoid iterator invalidation issues
739   // as we remove them.
740   // Note that this intentionally skips the entry block.
741   SmallVector<WeakTrackingVH, 16> Blocks;
742   for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
743     Blocks.push_back(&Block);
744 
745   for (auto &Block : Blocks) {
746     BasicBlock *BB = cast_or_null<BasicBlock>(Block);
747     if (!BB)
748       continue;
749     BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
750     if (!DestBB ||
751         !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
752       continue;
753 
754     eliminateMostlyEmptyBlock(BB);
755     MadeChange = true;
756   }
757   return MadeChange;
758 }
759 
760 bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
761                                                    BasicBlock *DestBB,
762                                                    bool isPreheader) {
763   // Do not delete loop preheaders if doing so would create a critical edge.
764   // Loop preheaders can be good locations to spill registers. If the
765   // preheader is deleted and we create a critical edge, registers may be
766   // spilled in the loop body instead.
767   if (!DisablePreheaderProtect && isPreheader &&
768       !(BB->getSinglePredecessor() &&
769         BB->getSinglePredecessor()->getSingleSuccessor()))
770     return false;
771 
772   // Skip merging if the block's successor is also a successor to any callbr
773   // that leads to this block.
774   // FIXME: Is this really needed? Is this a correctness issue?
775   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
776     if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
777       for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
778         if (DestBB == CBI->getSuccessor(i))
779           return false;
780   }
781 
  // Try to skip merging if the unique predecessor of BB is terminated by a
  // switch or indirect branch instruction, and BB is used as an incoming block
  // of PHIs in DestBB. In such a case, merging BB and DestBB would cause ISel
  // to add COPY instructions in the predecessor of BB instead of BB (if it is
  // not merged). Note that the critical edge created by merging such blocks
  // won't be split in MachineSink because the jump table is not analyzable. By
  // keeping such an empty block (BB), ISel will place COPY instructions in BB,
  // not in the predecessor of BB.
790   BasicBlock *Pred = BB->getUniquePredecessor();
791   if (!Pred ||
792       !(isa<SwitchInst>(Pred->getTerminator()) ||
793         isa<IndirectBrInst>(Pred->getTerminator())))
794     return true;
795 
796   if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())
797     return true;
798 
  // We use a simple cost heuristic: skipping the merge is profitable if
  // Cost(skipping merging) < Cost(merging BB), where
  // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)) and
  // Cost(merging BB) is Freq(Pred) * Cost(Copy).
  // Assuming Cost(Copy) == Cost(Branch), this simplifies to:
  //   Freq(Pred) / Freq(BB) > 2.
  // Note that if there are multiple empty blocks sharing the same incoming
  // value for the PHIs in the DestBB, we consider them together. In such a
  // case, Cost(merging BB) will be the sum of their frequencies.
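  //
  // For instance (an illustrative example, not taken from real profile data):
  // with FreqRatioToSkipMerge == 2, if Freq(Pred) == 12 and Freq(BB) == 5,
  // then 12 > 5 * 2, so we skip merging; if Freq(BB) were 6, 12 <= 6 * 2
  // holds and we would merge.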
809 
810   if (!isa<PHINode>(DestBB->begin()))
811     return true;
812 
813   SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
814 
815   // Find all other incoming blocks from which incoming values of all PHIs in
816   // DestBB are the same as the ones from BB.
817   for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
818        ++PI) {
819     BasicBlock *DestBBPred = *PI;
820     if (DestBBPred == BB)
821       continue;
822 
823     if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
824           return DestPN.getIncomingValueForBlock(BB) ==
825                  DestPN.getIncomingValueForBlock(DestBBPred);
826         }))
827       SameIncomingValueBBs.insert(DestBBPred);
828   }
829 
  // See if all of BB's incoming values are the same as the value from Pred. In
  // this case, there is no reason to skip merging because COPYs are expected
  // to be placed in Pred already.
833   if (SameIncomingValueBBs.count(Pred))
834     return true;
835 
836   BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
837   BlockFrequency BBFreq = BFI->getBlockFreq(BB);
838 
839   for (auto *SameValueBB : SameIncomingValueBBs)
840     if (SameValueBB->getUniquePredecessor() == Pred &&
841         DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
842       BBFreq += BFI->getBlockFreq(SameValueBB);
843 
844   return PredFreq.getFrequency() <=
845          BBFreq.getFrequency() * FreqRatioToSkipMerge;
846 }
847 
848 /// Return true if we can merge BB into DestBB if there is a single
849 /// unconditional branch between them, and BB contains no other non-phi
850 /// instructions.
851 bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
852                                     const BasicBlock *DestBB) const {
853   // We only want to eliminate blocks whose phi nodes are used by phi nodes in
  // the successor.  If there are more complex conditions (e.g. preheaders),
855   // don't mess around with them.
856   for (const PHINode &PN : BB->phis()) {
857     for (const User *U : PN.users()) {
858       const Instruction *UI = cast<Instruction>(U);
859       if (UI->getParent() != DestBB || !isa<PHINode>(UI))
860         return false;
      // If the user is a PHINode inside DestBB, then check its incoming
      // values. If an incoming value is not from BB, then this is
      // a complex condition (e.g. preheaders) we want to avoid here.
864       if (UI->getParent() == DestBB) {
865         if (const PHINode *UPN = dyn_cast<PHINode>(UI))
866           for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
867             Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
868             if (Insn && Insn->getParent() == BB &&
869                 Insn->getParent() != UPN->getIncomingBlock(I))
870               return false;
871           }
872       }
873     }
874   }
875 
876   // If BB and DestBB contain any common predecessors, then the phi nodes in BB
877   // and DestBB may have conflicting incoming values for the block.  If so, we
878   // can't merge the block.
879   const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
880   if (!DestBBPN) return true;  // no conflict.
881 
882   // Collect the preds of BB.
883   SmallPtrSet<const BasicBlock*, 16> BBPreds;
884   if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
885     // It is faster to get preds from a PHI than with pred_iterator.
886     for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
887       BBPreds.insert(BBPN->getIncomingBlock(i));
888   } else {
889     BBPreds.insert(pred_begin(BB), pred_end(BB));
890   }
891 
892   // Walk the preds of DestBB.
893   for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
894     BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
895     if (BBPreds.count(Pred)) {   // Common predecessor?
896       for (const PHINode &PN : DestBB->phis()) {
897         const Value *V1 = PN.getIncomingValueForBlock(Pred);
898         const Value *V2 = PN.getIncomingValueForBlock(BB);
899 
900         // If V2 is a phi node in BB, look up what the mapped value will be.
901         if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
902           if (V2PN->getParent() == BB)
903             V2 = V2PN->getIncomingValueForBlock(Pred);
904 
905         // If there is a conflict, bail out.
906         if (V1 != V2) return false;
907       }
908     }
909   }
910 
911   return true;
912 }
913 
914 /// Eliminate a basic block that has only phi's and an unconditional branch in
915 /// it.
916 void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
917   BranchInst *BI = cast<BranchInst>(BB->getTerminator());
918   BasicBlock *DestBB = BI->getSuccessor(0);
919 
920   LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
921                     << *BB << *DestBB);
922 
923   // If the destination block has a single pred, then this is a trivial edge,
924   // just collapse it.
925   if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
926     if (SinglePred != DestBB) {
927       assert(SinglePred == BB &&
928              "Single predecessor not the same as predecessor");
929       // Merge DestBB into SinglePred/BB and delete it.
930       MergeBlockIntoPredecessor(DestBB);
931       // Note: BB(=SinglePred) will not be deleted on this path.
932       // DestBB(=its single successor) is the one that was deleted.
933       LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
934       return;
935     }
936   }
937 
938   // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
939   // to handle the new incoming edges it is about to have.
940   for (PHINode &PN : DestBB->phis()) {
941     // Remove the incoming value for BB, and remember it.
942     Value *InVal = PN.removeIncomingValue(BB, false);
943 
944     // Two options: either the InVal is a phi node defined in BB or it is some
945     // value that dominates BB.
946     PHINode *InValPhi = dyn_cast<PHINode>(InVal);
947     if (InValPhi && InValPhi->getParent() == BB) {
948       // Add all of the input values of the input PHI as inputs of this phi.
949       for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
950         PN.addIncoming(InValPhi->getIncomingValue(i),
951                        InValPhi->getIncomingBlock(i));
952     } else {
953       // Otherwise, add one instance of the dominating value for each edge that
954       // we will be adding.
955       if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
956         for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
957           PN.addIncoming(InVal, BBPN->getIncomingBlock(i));
958       } else {
959         for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
960           PN.addIncoming(InVal, *PI);
961       }
962     }
963   }
964 
965   // The PHIs are now updated, change everything that refers to BB to use
966   // DestBB and remove BB.
967   BB->replaceAllUsesWith(DestBB);
968   BB->eraseFromParent();
969   ++NumBlocksElim;
970 
971   LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
972 }
973 
974 // Computes a map of base pointer relocation instructions to corresponding
975 // derived pointer relocation instructions given a vector of all relocate calls
976 static void computeBaseDerivedRelocateMap(
977     const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
978     DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
979         &RelocateInstMap) {
980   // Collect information in two maps: one primarily for locating the base object
981   // while filling the second map; the second map is the final structure holding
982   // a mapping between Base and corresponding Derived relocate calls
983   DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
984   for (auto *ThisRelocate : AllRelocateCalls) {
985     auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
986                             ThisRelocate->getDerivedPtrIndex());
987     RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
988   }
989   for (auto &Item : RelocateIdxMap) {
990     std::pair<unsigned, unsigned> Key = Item.first;
991     if (Key.first == Key.second)
992       // Base relocation: nothing to insert
993       continue;
994 
995     GCRelocateInst *I = Item.second;
996     auto BaseKey = std::make_pair(Key.first, Key.first);
997 
998     // We're iterating over RelocateIdxMap so we cannot modify it.
999     auto MaybeBase = RelocateIdxMap.find(BaseKey);
1000     if (MaybeBase == RelocateIdxMap.end())
1001       // TODO: We might want to insert a new base object relocate and gep off
1002       // that, if there are enough derived object relocates.
1003       continue;
1004 
1005     RelocateInstMap[MaybeBase->second].push_back(I);
1006   }
1007 }
1008 
1009 // Accepts a GEP and extracts the operands into a vector provided they're all
1010 // small integer constants
1011 static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
1012                                           SmallVectorImpl<Value *> &OffsetV) {
1013   for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
1014     // Only accept small constant integer operands
1015     auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
1016     if (!Op || Op->getZExtValue() > 20)
1017       return false;
1018   }
1019 
1020   for (unsigned i = 1; i < GEP->getNumOperands(); i++)
1021     OffsetV.push_back(GEP->getOperand(i));
1022   return true;
1023 }
1024 
1025 // Takes a RelocatedBase (base pointer relocation instruction) and Targets to
// replace, computes a replacement, and applies it.
1027 static bool
1028 simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
1029                           const SmallVectorImpl<GCRelocateInst *> &Targets) {
1030   bool MadeChange = false;
  // We must ensure that the relocation of the derived pointer is defined after
  // the relocation of the base pointer. If we find a relocation corresponding
  // to the base that is defined earlier than the base's own relocation, we
  // move the base relocation right before that relocation. We only consider
  // relocations in the same basic block as the base relocation; relocations
  // from other basic blocks are skipped by this optimization and we do not
  // care about them.
1037   for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
1038        &*R != RelocatedBase; ++R)
1039     if (auto *RI = dyn_cast<GCRelocateInst>(R))
1040       if (RI->getStatepoint() == RelocatedBase->getStatepoint())
1041         if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
1042           RelocatedBase->moveBefore(RI);
1043           break;
1044         }
1045 
1046   for (GCRelocateInst *ToReplace : Targets) {
1047     assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
1048            "Not relocating a derived object of the original base object");
1049     if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
1050       // A duplicate relocate call. TODO: coalesce duplicates.
1051       continue;
1052     }
1053 
1054     if (RelocatedBase->getParent() != ToReplace->getParent()) {
1055       // Base and derived relocates are in different basic blocks.
      // In this case the transform is only valid when the base dominates the
      // derived relocate. However, it would be too expensive to check
      // dominance for each such relocate, so we skip the whole transformation.
1059       continue;
1060     }
1061 
1062     Value *Base = ToReplace->getBasePtr();
1063     auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
1064     if (!Derived || Derived->getPointerOperand() != Base)
1065       continue;
1066 
1067     SmallVector<Value *, 2> OffsetV;
1068     if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
1069       continue;
1070 
1071     // Create a Builder and replace the target callsite with a gep
1072     assert(RelocatedBase->getNextNode() &&
1073            "Should always have one since it's not a terminator");
1074 
1075     // Insert after RelocatedBase
1076     IRBuilder<> Builder(RelocatedBase->getNextNode());
1077     Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
1078 
1079     // If gc_relocate does not match the actual type, cast it to the right type.
1080     // In theory, there must be a bitcast after gc_relocate if the type does not
    // match, and we should reuse it to get the derived pointer. But there
    // could be cases like this:
1083     // bb1:
1084     //  ...
1085     //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
1086     //  br label %merge
1087     //
1088     // bb2:
1089     //  ...
1090     //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
1091     //  br label %merge
1092     //
1093     // merge:
1094     //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
1096     //
    // In this case, we can no longer find the bitcast. So we insert a new
    // bitcast whether there is already one or not. In this way, we can handle
    // all cases, and the extra bitcast should be optimized away in later
    // passes.
1100     Value *ActualRelocatedBase = RelocatedBase;
1101     if (RelocatedBase->getType() != Base->getType()) {
1102       ActualRelocatedBase =
1103           Builder.CreateBitCast(RelocatedBase, Base->getType());
1104     }
1105     Value *Replacement = Builder.CreateGEP(
1106         Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
1107     Replacement->takeName(ToReplace);
1108     // If the newly generated derived pointer's type does not match the original derived
1109     // pointer's type, cast the new derived pointer to match it. Same reasoning as above.
1110     Value *ActualReplacement = Replacement;
1111     if (Replacement->getType() != ToReplace->getType()) {
1112       ActualReplacement =
1113           Builder.CreateBitCast(Replacement, ToReplace->getType());
1114     }
1115     ToReplace->replaceAllUsesWith(ActualReplacement);
1116     ToReplace->eraseFromParent();
1117 
1118     MadeChange = true;
1119   }
1120   return MadeChange;
1121 }
1122 
1123 // Turns this:
1124 //
1125 // %base = ...
1126 // %ptr = gep %base + 15
1127 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1128 // %base' = relocate(%tok, i32 4, i32 4)
1129 // %ptr' = relocate(%tok, i32 4, i32 5)
1130 // %val = load %ptr'
1131 //
1132 // into this:
1133 //
1134 // %base = ...
1135 // %ptr = gep %base + 15
1136 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1137 // %base' = gc.relocate(%tok, i32 4, i32 4)
1138 // %ptr' = gep %base' + 15
1139 // %val = load %ptr'
1140 bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
1141   bool MadeChange = false;
1142   SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
1143   for (auto *U : I.users())
1144     if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
1145       // Collect all the relocate calls associated with a statepoint
1146       AllRelocateCalls.push_back(Relocate);
1147 
1148   // We need at least one base pointer relocation + one derived pointer
1149   // relocation to mangle
1150   if (AllRelocateCalls.size() < 2)
1151     return false;
1152 
1153   // RelocateInstMap is a mapping from the base relocate instruction to the
1154   // corresponding derived relocate instructions
1155   DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
1156   computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
1157   if (RelocateInstMap.empty())
1158     return false;
1159 
1160   for (auto &Item : RelocateInstMap)
1161     // Item.first is the RelocatedBase to offset against
1162     // Item.second is the vector of Targets to replace
1163     MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
1164   return MadeChange;
1165 }
1166 
1167 /// Sink the specified cast instruction into its user blocks.
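///
/// For illustration, a sketch (value names are hypothetical): a cast whose
/// only use is in another block, e.g.
///
///   entry:
///     %z = zext i8 %x to i32
///     br label %use
///   use:
///     %a = add i32 %z, 1
///
/// gets a copy inserted at the start of the user block, so the cast no longer
/// has to be kept live across the edge:
///
///   use:
///     %z.sunk = zext i8 %x to i32
///     %a = add i32 %z.sunk, 1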
1168 static bool SinkCast(CastInst *CI) {
1169   BasicBlock *DefBB = CI->getParent();
1170 
1171   /// InsertedCasts - Only insert a cast in each block once.
1172   DenseMap<BasicBlock*, CastInst*> InsertedCasts;
1173 
1174   bool MadeChange = false;
1175   for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
1176        UI != E; ) {
1177     Use &TheUse = UI.getUse();
1178     Instruction *User = cast<Instruction>(*UI);
1179 
    // Figure out which BB this cast is used in.  For PHIs this is the
1181     // appropriate predecessor block.
1182     BasicBlock *UserBB = User->getParent();
1183     if (PHINode *PN = dyn_cast<PHINode>(User)) {
1184       UserBB = PN->getIncomingBlock(TheUse);
1185     }
1186 
1187     // Preincrement use iterator so we don't invalidate it.
1188     ++UI;
1189 
1190     // The first insertion point of a block containing an EH pad is after the
1191     // pad.  If the pad is the user, we cannot sink the cast past the pad.
1192     if (User->isEHPad())
1193       continue;
1194 
1195     // If the block selected to receive the cast is an EH pad that does not
1196     // allow non-PHI instructions before the terminator, we can't sink the
1197     // cast.
1198     if (UserBB->getTerminator()->isEHPad())
1199       continue;
1200 
1201     // If this user is in the same block as the cast, don't change the cast.
1202     if (UserBB == DefBB) continue;
1203 
1204     // If we have already inserted a cast into this block, use it.
1205     CastInst *&InsertedCast = InsertedCasts[UserBB];
1206 
1207     if (!InsertedCast) {
1208       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1209       assert(InsertPt != UserBB->end());
1210       InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
1211                                       CI->getType(), "", &*InsertPt);
1212       InsertedCast->setDebugLoc(CI->getDebugLoc());
1213     }
1214 
1215     // Replace a use of the cast with a use of the new cast.
1216     TheUse = InsertedCast;
1217     MadeChange = true;
1218     ++NumCastUses;
1219   }
1220 
1221   // If we removed all uses, nuke the cast.
1222   if (CI->use_empty()) {
1223     salvageDebugInfo(*CI);
1224     CI->eraseFromParent();
1225     MadeChange = true;
1226   }
1227 
1228   return MadeChange;
1229 }
1230 
1231 /// If the specified cast instruction is a noop copy (e.g. it's casting from
1232 /// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
1233 /// reduce the number of virtual registers that must be created and coalesced.
1234 ///
1235 /// Return true if any changes are made.
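///
/// For example (a sketch, assuming a target that promotes i8 to i32):
///
///   %t = trunc i32 %x to i8
///
/// legalizes with SrcVT == DstVT == i32 after promotion, so it is treated as
/// a noop copy and sunk into its user blocks via SinkCast.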
1236 static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
1237                                        const DataLayout &DL) {
1238   // Sink only "cheap" (or nop) address-space casts.  This is a weaker condition
1239   // than sinking only nop casts, but is helpful on some platforms.
1240   if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {
1241     if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(),
1242                                  ASC->getDestAddressSpace()))
1243       return false;
1244   }
1245 
1246   // If this is a noop copy,
1247   EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
1248   EVT DstVT = TLI.getValueType(DL, CI->getType());
1249 
  // If this is an fp<->int conversion, it is not a noop copy; bail out.
1251   if (SrcVT.isInteger() != DstVT.isInteger())
1252     return false;
1253 
1254   // If this is an extension, it will be a zero or sign extension, which
1255   // isn't a noop.
1256   if (SrcVT.bitsLT(DstVT)) return false;
1257 
1258   // If these values will be promoted, find out what they will be promoted
1259   // to.  This helps us consider truncates on PPC as noop copies when they
1260   // are.
1261   if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
1262       TargetLowering::TypePromoteInteger)
1263     SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
1264   if (TLI.getTypeAction(CI->getContext(), DstVT) ==
1265       TargetLowering::TypePromoteInteger)
1266     DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
1267 
1268   // If, after promotion, these are the same types, this is a noop copy.
1269   if (SrcVT != DstVT)
1270     return false;
1271 
1272   return SinkCast(CI);
1273 }
1274 
1275 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
1276                                                  Value *Arg0, Value *Arg1,
1277                                                  CmpInst *Cmp,
1278                                                  Intrinsic::ID IID) {
1279   if (BO->getParent() != Cmp->getParent()) {
1280     // We used to use a dominator tree here to allow multi-block optimization.
1281     // But that was problematic because:
1282     // 1. It could cause a perf regression by hoisting the math op into the
1283     //    critical path.
1284     // 2. It could cause a perf regression by creating a value that was live
1285     //    across multiple blocks and increasing register pressure.
1286     // 3. Use of a dominator tree could cause large compile-time regression.
1287     //    This is because we recompute the DT on every change in the main CGP
1288     //    run-loop. The recomputing is probably unnecessary in many cases, so if
1289     //    that was fixed, using a DT here would be ok.
1290     return false;
1291   }
1292 
1293   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
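  // For example (a sketch): if the IR is "add %x, -8" and IID is
  // usub_with_overflow, Arg1 is negated so the intrinsic built below is
  // usub.with.overflow(%x, 8).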
1294   if (BO->getOpcode() == Instruction::Add &&
1295       IID == Intrinsic::usub_with_overflow) {
1296     assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
1297     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
1298   }
1299 
1300   // Insert at the first instruction of the pair.
1301   Instruction *InsertPt = nullptr;
1302   for (Instruction &Iter : *Cmp->getParent()) {
1303     // If BO is an XOR, it is not guaranteed that it comes after both inputs to
1304     // the overflow intrinsic are defined.
1305     if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
1306       InsertPt = &Iter;
1307       break;
1308     }
1309   }
1310   assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
1311 
1312   IRBuilder<> Builder(InsertPt);
1313   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
1314   if (BO->getOpcode() != Instruction::Xor) {
1315     Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
1316     BO->replaceAllUsesWith(Math);
1317   } else
1318     assert(BO->hasOneUse() &&
1319            "Patterns with XOr should use the BO only in the compare");
1320   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
1321   Cmp->replaceAllUsesWith(OV);
1322   Cmp->eraseFromParent();
1323   BO->eraseFromParent();
1324   return true;
1325 }
1326 
1327 /// Match special-case patterns that check for unsigned add overflow.
1328 static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
1329                                                    BinaryOperator *&Add) {
1330   // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
1331   // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
1332   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
1333 
1334   // We are not expecting non-canonical/degenerate code. Just bail out.
1335   if (isa<Constant>(A))
1336     return false;
1337 
1338   ICmpInst::Predicate Pred = Cmp->getPredicate();
1339   if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes()))
1340     B = ConstantInt::get(B->getType(), 1);
1341   else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt()))
1342     B = ConstantInt::get(B->getType(), -1);
1343   else
1344     return false;
1345 
1346   // Check the users of the variable operand of the compare looking for an add
1347   // with the adjusted constant.
1348   for (User *U : A->users()) {
1349     if (match(U, m_Add(m_Specific(A), m_Specific(B)))) {
1350       Add = cast<BinaryOperator>(U);
1351       return true;
1352     }
1353   }
1354   return false;
1355 }
1356 
1357 /// Try to combine the compare into a call to the llvm.uadd.with.overflow
1358 /// intrinsic. Return true if any changes were made.
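///
/// For example (illustrative IR), the canonical unsigned-add overflow check
///   %add = add i32 %a, %b
///   %ov  = icmp ult i32 %add, %a
/// can be folded into
///   %res = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
///   %add = extractvalue { i32, i1 } %res, 0
///   %ov  = extractvalue { i32, i1 } %res, 1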
1359 bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
1360                                                bool &ModifiedDT) {
1361   Value *A, *B;
1362   BinaryOperator *Add;
1363   if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
1364     if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
1365       return false;
    // Set A and B if we matched the constant edge case above.
1367     A = Add->getOperand(0);
1368     B = Add->getOperand(1);
1369   }
1370 
1371   if (!TLI->shouldFormOverflowOp(ISD::UADDO,
1372                                  TLI->getValueType(*DL, Add->getType()),
1373                                  Add->hasNUsesOrMore(2)))
1374     return false;
1375 
1376   // We don't want to move around uses of condition values this late, so we
1377   // check if it is legal to create the call to the intrinsic in the basic
1378   // block containing the icmp.
1379   if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
1380     return false;
1381 
1382   if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
1383                                    Intrinsic::uadd_with_overflow))
1384     return false;
1385 
1386   // Reset callers - do not crash by iterating over a dead instruction.
1387   ModifiedDT = true;
1388   return true;
1389 }
1390 
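/// Try to combine the compare into a call to the llvm.usub.with.overflow
/// intrinsic. For example (illustrative IR):
///   %sub = sub i32 %a, %b
///   %ov  = icmp ult i32 %a, %b
/// becomes
///   %res = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
///   %sub = extractvalue { i32, i1 } %res, 0
///   %ov  = extractvalue { i32, i1 } %res, 1
/// Return true if any changes were made.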
1391 bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
1392                                                bool &ModifiedDT) {
1393   // We are not expecting non-canonical/degenerate code. Just bail out.
1394   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
1395   if (isa<Constant>(A) && isa<Constant>(B))
1396     return false;
1397 
1398   // Convert (A u> B) to (A u< B) to simplify pattern matching.
1399   ICmpInst::Predicate Pred = Cmp->getPredicate();
1400   if (Pred == ICmpInst::ICMP_UGT) {
1401     std::swap(A, B);
1402     Pred = ICmpInst::ICMP_ULT;
1403   }
1404   // Convert special-case: (A == 0) is the same as (A u< 1).
1405   if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
1406     B = ConstantInt::get(B->getType(), 1);
1407     Pred = ICmpInst::ICMP_ULT;
1408   }
1409   // Convert special-case: (A != 0) is the same as (0 u< A).
1410   if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
1411     std::swap(A, B);
1412     Pred = ICmpInst::ICMP_ULT;
1413   }
1414   if (Pred != ICmpInst::ICMP_ULT)
1415     return false;
1416 
1417   // Walk the users of a variable operand of a compare looking for a subtract or
1418   // add with that same operand. Also match the 2nd operand of the compare to
1419   // the add/sub, but that may be a negated constant operand of an add.
1420   Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
1421   BinaryOperator *Sub = nullptr;
1422   for (User *U : CmpVariableOperand->users()) {
1423     // A - B, A u< B --> usubo(A, B)
1424     if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
1425       Sub = cast<BinaryOperator>(U);
1426       break;
1427     }
1428 
1429     // A + (-C), A u< C (canonicalized form of (sub A, C))
1430     const APInt *CmpC, *AddC;
1431     if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
1432         match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
1433       Sub = cast<BinaryOperator>(U);
1434       break;
1435     }
1436   }
1437   if (!Sub)
1438     return false;
1439 
1440   if (!TLI->shouldFormOverflowOp(ISD::USUBO,
1441                                  TLI->getValueType(*DL, Sub->getType()),
1442                                  Sub->hasNUsesOrMore(2)))
1443     return false;
1444 
1445   if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
1446                                    Cmp, Intrinsic::usub_with_overflow))
1447     return false;
1448 
1449   // Reset callers - do not crash by iterating over a dead instruction.
1450   ModifiedDT = true;
1451   return true;
1452 }
1453 
1454 /// Sink the given CmpInst into user blocks to reduce the number of virtual
1455 /// registers that must be created and coalesced. This is a clear win except on
1456 /// targets with multiple condition code registers (PowerPC), where it might
1457 /// lose; some adjustment may be wanted there.
1458 ///
1459 /// Return true if any changes are made.
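///
/// For example (illustrative IR), a compare defined in one block but branched
/// on in another:
///   BB1:
///     %c = icmp eq i32 %a, %b
///   BB2:
///     br i1 %c, label %T, label %F
/// is duplicated into the user block so the compare ends up next to the branch
/// that consumes it:
///   BB2:
///     %c.sunk = icmp eq i32 %a, %b
///     br i1 %c.sunk, label %T, label %F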
1460 static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
1461   if (TLI.hasMultipleConditionRegisters())
1462     return false;
1463 
1464   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
1465   if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
1466     return false;
1467 
1468   // Only insert a cmp in each block once.
1469   DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
1470 
1471   bool MadeChange = false;
1472   for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
1473        UI != E; ) {
1474     Use &TheUse = UI.getUse();
1475     Instruction *User = cast<Instruction>(*UI);
1476 
1477     // Preincrement use iterator so we don't invalidate it.
1478     ++UI;
1479 
1480     // Don't bother for PHI nodes.
1481     if (isa<PHINode>(User))
1482       continue;
1483 
1484     // Figure out which BB this cmp is used in.
1485     BasicBlock *UserBB = User->getParent();
1486     BasicBlock *DefBB = Cmp->getParent();
1487 
1488     // If this user is in the same block as the cmp, don't change the cmp.
1489     if (UserBB == DefBB) continue;
1490 
1491     // If we have already inserted a cmp into this block, use it.
1492     CmpInst *&InsertedCmp = InsertedCmps[UserBB];
1493 
1494     if (!InsertedCmp) {
1495       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1496       assert(InsertPt != UserBB->end());
1497       InsertedCmp =
1498           CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),
1499                           Cmp->getOperand(0), Cmp->getOperand(1), "",
1500                           &*InsertPt);
1501       // Propagate the debug info.
1502       InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
1503     }
1504 
1505     // Replace a use of the cmp with a use of the new cmp.
1506     TheUse = InsertedCmp;
1507     MadeChange = true;
1508     ++NumCmpUses;
1509   }
1510 
1511   // If we removed all uses, nuke the cmp.
1512   if (Cmp->use_empty()) {
1513     Cmp->eraseFromParent();
1514     MadeChange = true;
1515   }
1516 
1517   return MadeChange;
1518 }
1519 
1520 /// For pattern like:
1521 ///
1522 ///   DomCond = icmp sgt/slt CmpOp0, CmpOp1 (might not be in DomBB)
1523 ///   ...
1524 /// DomBB:
1525 ///   ...
1526 ///   br DomCond, TrueBB, CmpBB
1527 /// CmpBB: (with DomBB being the single predecessor)
1528 ///   ...
1529 ///   Cmp = icmp eq CmpOp0, CmpOp1
1530 ///   ...
1531 ///
/// On targets where the lowering of icmp sgt/slt differs from the lowering of
/// icmp eq (e.g. PowerPC), this pattern needs two comparisons. This function
/// tries to convert 'Cmp = icmp eq CmpOp0, CmpOp1' into
/// 'Cmp = icmp slt/sgt CmpOp0, CmpOp1' so that DomCond and Cmp can share the
/// same comparison, eliminating one of them.
1537 ///
1538 /// Return true if any changes are made.
1539 static bool foldICmpWithDominatingICmp(CmpInst *Cmp,
1540                                        const TargetLowering &TLI) {
1541   if (!EnableICMP_EQToICMP_ST && TLI.isEqualityCmpFoldedWithSignedCmp())
1542     return false;
1543 
1544   ICmpInst::Predicate Pred = Cmp->getPredicate();
1545   if (Pred != ICmpInst::ICMP_EQ)
1546     return false;
1547 
1548   // If icmp eq has users other than BranchInst and SelectInst, converting it to
1549   // icmp slt/sgt would introduce more redundant LLVM IR.
1550   for (User *U : Cmp->users()) {
1551     if (isa<BranchInst>(U))
1552       continue;
1553     if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == Cmp)
1554       continue;
1555     return false;
1556   }
1557 
1558   // This is a cheap/incomplete check for dominance - just match a single
1559   // predecessor with a conditional branch.
1560   BasicBlock *CmpBB = Cmp->getParent();
1561   BasicBlock *DomBB = CmpBB->getSinglePredecessor();
1562   if (!DomBB)
1563     return false;
1564 
1565   // We want to ensure that the only way control gets to the comparison of
1566   // interest is that a less/greater than comparison on the same operands is
1567   // false.
1568   Value *DomCond;
1569   BasicBlock *TrueBB, *FalseBB;
1570   if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
1571     return false;
1572   if (CmpBB != FalseBB)
1573     return false;
1574 
1575   Value *CmpOp0 = Cmp->getOperand(0), *CmpOp1 = Cmp->getOperand(1);
1576   ICmpInst::Predicate DomPred;
1577   if (!match(DomCond, m_ICmp(DomPred, m_Specific(CmpOp0), m_Specific(CmpOp1))))
1578     return false;
1579   if (DomPred != ICmpInst::ICMP_SGT && DomPred != ICmpInst::ICMP_SLT)
1580     return false;
1581 
1582   // Convert the equality comparison to the opposite of the dominating
1583   // comparison and swap the direction for all branch/select users.
1584   // We have conceptually converted:
1585   // Res = (a < b) ? <LT_RES> : (a == b) ? <EQ_RES> : <GT_RES>;
1586   // to
1587   // Res = (a < b) ? <LT_RES> : (a > b)  ? <GT_RES> : <EQ_RES>;
1588   // And similarly for branches.
1589   for (User *U : Cmp->users()) {
1590     if (auto *BI = dyn_cast<BranchInst>(U)) {
1591       assert(BI->isConditional() && "Must be conditional");
1592       BI->swapSuccessors();
1593       continue;
1594     }
1595     if (auto *SI = dyn_cast<SelectInst>(U)) {
      // Swap the select's true/false values and its profile metadata.
1597       SI->swapValues();
1598       SI->swapProfMetadata();
1599       continue;
1600     }
1601     llvm_unreachable("Must be a branch or a select");
1602   }
1603   Cmp->setPredicate(CmpInst::getSwappedPredicate(DomPred));
1604   return true;
1605 }
1606 
1607 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
1608   if (sinkCmpExpression(Cmp, *TLI))
1609     return true;
1610 
1611   if (combineToUAddWithOverflow(Cmp, ModifiedDT))
1612     return true;
1613 
1614   if (combineToUSubWithOverflow(Cmp, ModifiedDT))
1615     return true;
1616 
1617   if (foldICmpWithDominatingICmp(Cmp, *TLI))
1618     return true;
1619 
1620   return false;
1621 }
1622 
1623 /// Duplicate and sink the given 'and' instruction into user blocks where it is
1624 /// used in a compare to allow isel to generate better code for targets where
1625 /// this operation can be combined.
1626 ///
1627 /// Return true if any changes are made.
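///
/// For example (illustrative IR), an 'and' whose only users are icmps against
/// zero in other blocks:
///   BB1:
///     %m = and i64 %x, 255
///   BB2:
///     %c = icmp eq i64 %m, 0
/// is duplicated next to each such compare:
///   BB2:
///     %m.sunk = and i64 %x, 255
///     %c = icmp eq i64 %m.sunk, 0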
1628 static bool sinkAndCmp0Expression(Instruction *AndI,
1629                                   const TargetLowering &TLI,
1630                                   SetOfInstrs &InsertedInsts) {
1631   // Double-check that we're not trying to optimize an instruction that was
1632   // already optimized by some other part of this pass.
1633   assert(!InsertedInsts.count(AndI) &&
1634          "Attempting to optimize already optimized and instruction");
1635   (void) InsertedInsts;
1636 
1637   // Nothing to do for single use in same basic block.
1638   if (AndI->hasOneUse() &&
1639       AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
1640     return false;
1641 
1642   // Try to avoid cases where sinking/duplicating is likely to increase register
1643   // pressure.
1644   if (!isa<ConstantInt>(AndI->getOperand(0)) &&
1645       !isa<ConstantInt>(AndI->getOperand(1)) &&
1646       AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
1647     return false;
1648 
1649   for (auto *U : AndI->users()) {
1650     Instruction *User = cast<Instruction>(U);
1651 
1652     // Only sink 'and' feeding icmp with 0.
1653     if (!isa<ICmpInst>(User))
1654       return false;
1655 
1656     auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
1657     if (!CmpC || !CmpC->isZero())
1658       return false;
1659   }
1660 
1661   if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
1662     return false;
1663 
1664   LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
1665   LLVM_DEBUG(AndI->getParent()->dump());
1666 
1667   // Push the 'and' into the same block as the icmp 0.  There should only be
1668   // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
1669   // others, so we don't need to keep track of which BBs we insert into.
1670   for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
1671        UI != E; ) {
1672     Use &TheUse = UI.getUse();
1673     Instruction *User = cast<Instruction>(*UI);
1674 
1675     // Preincrement use iterator so we don't invalidate it.
1676     ++UI;
1677 
1678     LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
1679 
1680     // Keep the 'and' in the same place if the use is already in the same block.
1681     Instruction *InsertPt =
1682         User->getParent() == AndI->getParent() ? AndI : User;
1683     Instruction *InsertedAnd =
1684         BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
1685                                AndI->getOperand(1), "", InsertPt);
1686     // Propagate the debug info.
1687     InsertedAnd->setDebugLoc(AndI->getDebugLoc());
1688 
1689     // Replace a use of the 'and' with a use of the new 'and'.
1690     TheUse = InsertedAnd;
1691     ++NumAndUses;
1692     LLVM_DEBUG(User->getParent()->dump());
1693   }
1694 
1695   // We removed all uses, nuke the and.
1696   AndI->eraseFromParent();
1697   return true;
1698 }
1699 
/// Check whether the candidate use could be combined with a shift instruction.
/// Valid candidates are:
/// 1. A truncate instruction
/// 2. An 'and' instruction whose immediate is a mask of the low bits, i.e.
///    imm & (imm+1) == 0
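/// For example, imm = 0x0000FFFF satisfies imm & (imm+1) == 0 and is a valid
/// low-bit mask, while imm = 0x0000FF00 is not.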
1705 static bool isExtractBitsCandidateUse(Instruction *User) {
1706   if (!isa<TruncInst>(User)) {
1707     if (User->getOpcode() != Instruction::And ||
1708         !isa<ConstantInt>(User->getOperand(1)))
1709       return false;
1710 
1711     const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();
1712 
1713     if ((Cimm & (Cimm + 1)).getBoolValue())
1714       return false;
1715   }
1716   return true;
1717 }
1718 
/// Sink both the shift and the truncate instructions into the BB of the
/// truncate's user.
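///
/// For example (illustrative IR), with the shift and trunc defined in one
/// block and the trunc used in another:
///   BB1:
///     %s = lshr i64 %x, 32
///     %t = trunc i64 %s to i16
///   BB2:
///     %c = icmp eq i16 %t, %y
/// both instructions are duplicated next to the use:
///   BB2:
///     %s.sunk = lshr i64 %x, 32
///     %t.sunk = trunc i64 %s.sunk to i16
///     %c = icmp eq i16 %t.sunk, %y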
1720 static bool
1721 SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
1722                      DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
1723                      const TargetLowering &TLI, const DataLayout &DL) {
1724   BasicBlock *UserBB = User->getParent();
1725   DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
1726   auto *TruncI = cast<TruncInst>(User);
1727   bool MadeChange = false;
1728 
1729   for (Value::user_iterator TruncUI = TruncI->user_begin(),
1730                             TruncE = TruncI->user_end();
1731        TruncUI != TruncE;) {
1732 
1733     Use &TruncTheUse = TruncUI.getUse();
1734     Instruction *TruncUser = cast<Instruction>(*TruncUI);
    // Preincrement use iterator so we don't invalidate it.
    ++TruncUI;
1738 
1739     int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
1740     if (!ISDOpcode)
1741       continue;
1742 
1743     // If the use is actually a legal node, there will not be an
1744     // implicit truncate.
1745     // FIXME: always querying the result type is just an
1746     // approximation; some nodes' legality is determined by the
1747     // operand or other means. There's no good way to find out though.
1748     if (TLI.isOperationLegalOrCustom(
1749             ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
1750       continue;
1751 
1752     // Don't bother for PHI nodes.
1753     if (isa<PHINode>(TruncUser))
1754       continue;
1755 
1756     BasicBlock *TruncUserBB = TruncUser->getParent();
1757 
1758     if (UserBB == TruncUserBB)
1759       continue;
1760 
1761     BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
1762     CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
1763 
1764     if (!InsertedShift && !InsertedTrunc) {
1765       BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
1766       assert(InsertPt != TruncUserBB->end());
1767       // Sink the shift
1768       if (ShiftI->getOpcode() == Instruction::AShr)
1769         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1770                                                    "", &*InsertPt);
1771       else
1772         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1773                                                    "", &*InsertPt);
1774       InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
1775 
1776       // Sink the trunc
1777       BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
1778       TruncInsertPt++;
1779       assert(TruncInsertPt != TruncUserBB->end());
1780 
1781       InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
1782                                        TruncI->getType(), "", &*TruncInsertPt);
1783       InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());
1784 
1785       MadeChange = true;
1786 
1787       TruncTheUse = InsertedTrunc;
1788     }
1789   }
1790   return MadeChange;
1791 }
1792 
1793 /// Sink the shift *right* instruction into user blocks if the uses could
1794 /// potentially be combined with this shift instruction and generate BitExtract
1795 /// instruction. It will only be applied if the architecture supports BitExtract
1796 /// instruction. Here is an example:
1797 /// BB1:
1798 ///   %x.extract.shift = lshr i64 %arg1, 32
1799 /// BB2:
1800 ///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
1801 /// ==>
1802 ///
1803 /// BB2:
1804 ///   %x.extract.shift.1 = lshr i64 %arg1, 32
1805 ///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
1806 ///
1807 /// CodeGen will recognize the pattern in BB2 and generate BitExtract
1808 /// instruction.
1809 /// Return true if any changes are made.
1810 static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
1811                                 const TargetLowering &TLI,
1812                                 const DataLayout &DL) {
1813   BasicBlock *DefBB = ShiftI->getParent();
1814 
1815   /// Only insert instructions in each block once.
1816   DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
1817 
1818   bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
1819 
1820   bool MadeChange = false;
1821   for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
1822        UI != E;) {
1823     Use &TheUse = UI.getUse();
1824     Instruction *User = cast<Instruction>(*UI);
1825     // Preincrement use iterator so we don't invalidate it.
1826     ++UI;
1827 
1828     // Don't bother for PHI nodes.
1829     if (isa<PHINode>(User))
1830       continue;
1831 
1832     if (!isExtractBitsCandidateUse(User))
1833       continue;
1834 
1835     BasicBlock *UserBB = User->getParent();
1836 
1837     if (UserBB == DefBB) {
      // If the shift and truncate instructions are in the same BB, the use of
      // the truncate (TruncUse) may still introduce another truncate if its
      // type is not legal. In that case, we would like to sink both the shift
      // and the truncate into the BB of TruncUse.
1842       // for example:
1843       // BB1:
1844       // i64 shift.result = lshr i64 opnd, imm
1845       // trunc.result = trunc shift.result to i16
1846       //
1847       // BB2:
1848       //   ----> We will have an implicit truncate here if the architecture does
1849       //   not have i16 compare.
1850       // cmp i16 trunc.result, opnd2
1851       //
      // If the type of the truncate is legal, no truncate will be introduced
      // in other basic blocks.
      if (isa<TruncInst>(User) && shiftIsLegal &&
          !TLI.isTypeLegal(TLI.getValueType(DL, User->getType())))
1857         MadeChange =
1858             SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
1859 
1860       continue;
1861     }
1862     // If we have already inserted a shift into this block, use it.
1863     BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
1864 
1865     if (!InsertedShift) {
1866       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1867       assert(InsertPt != UserBB->end());
1868 
1869       if (ShiftI->getOpcode() == Instruction::AShr)
1870         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1871                                                    "", &*InsertPt);
1872       else
1873         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1874                                                    "", &*InsertPt);
1875       InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
1876 
1877       MadeChange = true;
1878     }
1879 
1880     // Replace a use of the shift with a use of the new shift.
1881     TheUse = InsertedShift;
1882   }
1883 
1884   // If we removed all uses, or there are none, nuke the shift.
1885   if (ShiftI->use_empty()) {
1886     salvageDebugInfo(*ShiftI);
1887     ShiftI->eraseFromParent();
1888     MadeChange = true;
1889   }
1890 
1891   return MadeChange;
1892 }
1893 
1894 /// If counting leading or trailing zeros is an expensive operation and a zero
1895 /// input is defined, add a check for zero to avoid calling the intrinsic.
1896 ///
1897 /// We want to transform:
1898 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
1899 ///
1900 /// into:
1901 ///   entry:
1902 ///     %cmpz = icmp eq i64 %A, 0
1903 ///     br i1 %cmpz, label %cond.end, label %cond.false
1904 ///   cond.false:
1905 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
1906 ///     br label %cond.end
1907 ///   cond.end:
1908 ///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
1909 ///
1910 /// If the transform is performed, return true and set ModifiedDT to true.
1911 static bool despeculateCountZeros(IntrinsicInst *CountZeros,
1912                                   const TargetLowering *TLI,
1913                                   const DataLayout *DL,
1914                                   bool &ModifiedDT) {
1915   // If a zero input is undefined, it doesn't make sense to despeculate that.
1916   if (match(CountZeros->getOperand(1), m_One()))
1917     return false;
1918 
1919   // If it's cheap to speculate, there's nothing to do.
1920   auto IntrinsicID = CountZeros->getIntrinsicID();
1921   if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
1922       (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
1923     return false;
1924 
1925   // Only handle legal scalar cases. Anything else requires too much work.
1926   Type *Ty = CountZeros->getType();
1927   unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
1928   if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
1929     return false;
1930 
1931   // The intrinsic will be sunk behind a compare against zero and branch.
1932   BasicBlock *StartBlock = CountZeros->getParent();
1933   BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
1934 
1935   // Create another block after the count zero intrinsic. A PHI will be added
1936   // in this block to select the result of the intrinsic or the bit-width
1937   // constant if the input to the intrinsic is zero.
1938   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
1939   BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
1940 
1941   // Set up a builder to create a compare, conditional branch, and PHI.
1942   IRBuilder<> Builder(CountZeros->getContext());
1943   Builder.SetInsertPoint(StartBlock->getTerminator());
1944   Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
1945 
1946   // Replace the unconditional branch that was created by the first split with
1947   // a compare against zero and a conditional branch.
1948   Value *Zero = Constant::getNullValue(Ty);
1949   Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
1950   Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
1951   StartBlock->getTerminator()->eraseFromParent();
1952 
1953   // Create a PHI in the end block to select either the output of the intrinsic
1954   // or the bit width of the operand.
1955   Builder.SetInsertPoint(&EndBlock->front());
1956   PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
1957   CountZeros->replaceAllUsesWith(PN);
1958   Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
1959   PN->addIncoming(BitWidth, StartBlock);
1960   PN->addIncoming(CountZeros, CallBlock);
1961 
1962   // We are explicitly handling the zero case, so we can set the intrinsic's
1963   // undefined zero argument to 'true'. This will also prevent reprocessing the
1964   // intrinsic; we only despeculate when a zero input is defined.
1965   CountZeros->setArgOperand(1, Builder.getTrue());
1966   ModifiedDT = true;
1967   return true;
1968 }
1969 
1970 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
1971   BasicBlock *BB = CI->getParent();
1972 
1973   // Lower inline assembly if we can.
  // If we found an inline asm expression, and if the target knows how to
1975   // lower it to normal LLVM code, do so now.
1976   if (CI->isInlineAsm()) {
1977     if (TLI->ExpandInlineAsm(CI)) {
1978       // Avoid invalidating the iterator.
1979       CurInstIterator = BB->begin();
1980       // Avoid processing instructions out of order, which could cause
1981       // reuse before a value is defined.
1982       SunkAddrs.clear();
1983       return true;
1984     }
1985     // Sink address computing for memory operands into the block.
1986     if (optimizeInlineAsmInst(CI))
1987       return true;
1988   }
1989 
1990   // Align the pointer arguments to this call if the target thinks it's a good
1991   // idea
1992   unsigned MinSize, PrefAlign;
1993   if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
1994     for (auto &Arg : CI->arg_operands()) {
1995       // We want to align both objects whose address is used directly and
1996       // objects whose address is used in casts and GEPs, though it only makes
1997       // sense for GEPs if the offset is a multiple of the desired alignment and
1998       // if size - offset meets the size threshold.
1999       if (!Arg->getType()->isPointerTy())
2000         continue;
2001       APInt Offset(DL->getIndexSizeInBits(
2002                        cast<PointerType>(Arg->getType())->getAddressSpace()),
2003                    0);
2004       Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
2005       uint64_t Offset2 = Offset.getLimitedValue();
2006       if ((Offset2 & (PrefAlign-1)) != 0)
2007         continue;
2008       AllocaInst *AI;
2009       if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
2010           DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
2011         AI->setAlignment(Align(PrefAlign));
2012       // Global variables can only be aligned if they are defined in this
2013       // object (i.e. they are uniquely initialized in this object), and
2014       // over-aligning global variables that have an explicit section is
2015       // forbidden.
2016       GlobalVariable *GV;
2017       if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
2018           GV->getPointerAlignment(*DL) < PrefAlign &&
2019           DL->getTypeAllocSize(GV->getValueType()) >=
2020               MinSize + Offset2)
2021         GV->setAlignment(MaybeAlign(PrefAlign));
2022     }
2023     // If this is a memcpy (or similar) then we may be able to improve the
2024     // alignment
2025     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
2026       Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
2027       MaybeAlign MIDestAlign = MI->getDestAlign();
2028       if (!MIDestAlign || DestAlign > *MIDestAlign)
2029         MI->setDestAlignment(DestAlign);
2030       if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
2031         MaybeAlign MTISrcAlign = MTI->getSourceAlign();
2032         Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
2033         if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
2034           MTI->setSourceAlignment(SrcAlign);
2035       }
2036     }
2037   }
2038 
2039   // If we have a cold call site, try to sink addressing computation into the
2040   // cold block.  This interacts with our handling for loads and stores to
2041   // ensure that we can fold all uses of a potential addressing computation
2042   // into their uses.  TODO: generalize this to work over profiling data
2043   if (CI->hasFnAttr(Attribute::Cold) &&
2044       !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
2045     for (auto &Arg : CI->arg_operands()) {
2046       if (!Arg->getType()->isPointerTy())
2047         continue;
2048       unsigned AS = Arg->getType()->getPointerAddressSpace();
2049       return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
2050     }
2051 
2052   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
2053   if (II) {
2054     switch (II->getIntrinsicID()) {
2055     default: break;
2056     case Intrinsic::assume: {
2057       Value *Operand = II->getOperand(0);
2058       II->eraseFromParent();
2059       // Prune the operand, it's most likely dead.
2060       resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
2061         RecursivelyDeleteTriviallyDeadInstructions(
2062             Operand, TLInfo, nullptr,
2063             [&](Value *V) { removeAllAssertingVHReferences(V); });
2064       });
2065       return true;
2066     }
2067 
2068     case Intrinsic::experimental_widenable_condition: {
      // Give up on future widening opportunities so that we can fold away dead
2070       // paths and merge blocks before going into block-local instruction
2071       // selection.
2072       if (II->use_empty()) {
2073         II->eraseFromParent();
2074         return true;
2075       }
2076       Constant *RetVal = ConstantInt::getTrue(II->getContext());
2077       resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
2078         replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
2079       });
2080       return true;
2081     }
2082     case Intrinsic::objectsize:
2083       llvm_unreachable("llvm.objectsize.* should have been lowered already");
2084     case Intrinsic::is_constant:
2085       llvm_unreachable("llvm.is.constant.* should have been lowered already");
2086     case Intrinsic::aarch64_stlxr:
2087     case Intrinsic::aarch64_stxr: {
2088       ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
2089       if (!ExtVal || !ExtVal->hasOneUse() ||
2090           ExtVal->getParent() == CI->getParent())
2091         return false;
2092       // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
2093       ExtVal->moveBefore(CI);
2094       // Mark this instruction as "inserted by CGP", so that other
2095       // optimizations don't touch it.
2096       InsertedInsts.insert(ExtVal);
2097       return true;
2098     }
2099 
2100     case Intrinsic::launder_invariant_group:
2101     case Intrinsic::strip_invariant_group: {
2102       Value *ArgVal = II->getArgOperand(0);
2103       auto it = LargeOffsetGEPMap.find(II);
2104       if (it != LargeOffsetGEPMap.end()) {
        // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
        // Make sure not to have to deal with iterator invalidation
        // after possibly adding ArgVal to LargeOffsetGEPMap.
        auto GEPs = std::move(it->second);
        LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
        LargeOffsetGEPMap.erase(II);
2111       }
2112 
2113       II->replaceAllUsesWith(ArgVal);
2114       II->eraseFromParent();
2115       return true;
2116     }
2117     case Intrinsic::cttz:
2118     case Intrinsic::ctlz:
2119       // If counting zeros is expensive, try to avoid it.
2120       return despeculateCountZeros(II, TLI, DL, ModifiedDT);
2121     case Intrinsic::fshl:
2122     case Intrinsic::fshr:
2123       return optimizeFunnelShift(II);
2124     case Intrinsic::dbg_value:
2125       return fixupDbgValue(II);
2126     case Intrinsic::vscale: {
2127       // If datalayout has no special restrictions on vector data layout,
2128       // replace `llvm.vscale` by an equivalent constant expression
2129       // to benefit from cheap constant propagation.
2130       Type *ScalableVectorTy =
2131           VectorType::get(Type::getInt8Ty(II->getContext()), 1, true);
2132       if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) {
2133         auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo());
2134         auto *One = ConstantInt::getSigned(II->getType(), 1);
2135         auto *CGep =
2136             ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One);
2137         II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType()));
2138         II->eraseFromParent();
2139         return true;
2140       }
2141       break;
2142     }
2143     case Intrinsic::masked_gather:
2144       return optimizeGatherScatterInst(II, II->getArgOperand(0));
2145     case Intrinsic::masked_scatter:
2146       return optimizeGatherScatterInst(II, II->getArgOperand(1));
2147     }
2148 
2149     SmallVector<Value *, 2> PtrOps;
2150     Type *AccessTy;
2151     if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
2152       while (!PtrOps.empty()) {
2153         Value *PtrVal = PtrOps.pop_back_val();
2154         unsigned AS = PtrVal->getType()->getPointerAddressSpace();
2155         if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
2156           return true;
2157       }
2158   }
2159 
2160   // From here on out we're working with named functions.
2161   if (!CI->getCalledFunction()) return false;
2162 
2163   // Lower all default uses of _chk calls.  This is very similar
2164   // to what InstCombineCalls does, but here we are only lowering calls
2165   // to fortified library functions (e.g. __memcpy_chk) that have the default
2166   // "don't know" as the objectsize.  Anything else should be left alone.
2167   FortifiedLibCallSimplifier Simplifier(TLInfo, true);
2168   IRBuilder<> Builder(CI);
2169   if (Value *V = Simplifier.optimizeCall(CI, Builder)) {
2170     CI->replaceAllUsesWith(V);
2171     CI->eraseFromParent();
2172     return true;
2173   }
2174 
2175   return false;
2176 }
2177 
2178 /// Look for opportunities to duplicate return instructions to the predecessor
2179 /// to enable tail call optimizations. The case it is currently looking for is:
2180 /// @code
2181 /// bb0:
2182 ///   %tmp0 = tail call i32 @f0()
2183 ///   br label %return
2184 /// bb1:
2185 ///   %tmp1 = tail call i32 @f1()
2186 ///   br label %return
2187 /// bb2:
2188 ///   %tmp2 = tail call i32 @f2()
2189 ///   br label %return
2190 /// return:
2191 ///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
2192 ///   ret i32 %retval
2193 /// @endcode
2194 ///
2195 /// =>
2196 ///
2197 /// @code
2198 /// bb0:
2199 ///   %tmp0 = tail call i32 @f0()
2200 ///   ret i32 %tmp0
2201 /// bb1:
2202 ///   %tmp1 = tail call i32 @f1()
2203 ///   ret i32 %tmp1
2204 /// bb2:
2205 ///   %tmp2 = tail call i32 @f2()
2206 ///   ret i32 %tmp2
2207 /// @endcode
bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
                                                bool &ModifiedDT) {
2209   ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
2210   if (!RetI)
2211     return false;
2212 
2213   PHINode *PN = nullptr;
2214   ExtractValueInst *EVI = nullptr;
2215   BitCastInst *BCI = nullptr;
2216   Value *V = RetI->getReturnValue();
2217   if (V) {
2218     BCI = dyn_cast<BitCastInst>(V);
2219     if (BCI)
2220       V = BCI->getOperand(0);
2221 
2222     EVI = dyn_cast<ExtractValueInst>(V);
2223     if (EVI) {
2224       V = EVI->getOperand(0);
2225       if (!std::all_of(EVI->idx_begin(), EVI->idx_end(),
2226                        [](unsigned idx) { return idx == 0; }))
2227         return false;
2228     }
2229 
2230     PN = dyn_cast<PHINode>(V);
2231     if (!PN)
2232       return false;
2233   }
2234 
2235   if (PN && PN->getParent() != BB)
2236     return false;
2237 
2238   // Make sure there are no instructions between the PHI and return, or that the
2239   // return is the first instruction in the block.
2240   if (PN) {
2241     BasicBlock::iterator BI = BB->begin();
2242     // Skip over debug and the bitcast.
2243     do {
2244       ++BI;
2245     } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI ||
2246              isa<PseudoProbeInst>(BI));
2247     if (&*BI != RetI)
2248       return false;
2249   } else {
2250     if (BB->getFirstNonPHIOrDbg(true) != RetI)
2251       return false;
2252   }
2253 
2254   /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
2255   /// call.
2256   const Function *F = BB->getParent();
2257   SmallVector<BasicBlock*, 4> TailCallBBs;
2258   if (PN) {
2259     for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
2260       // Look through bitcasts.
2261       Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();
2262       CallInst *CI = dyn_cast<CallInst>(IncomingVal);
2263       BasicBlock *PredBB = PN->getIncomingBlock(I);
2264       // Make sure the phi value is indeed produced by the tail call.
2265       if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&
2266           TLI->mayBeEmittedAsTailCall(CI) &&
2267           attributesPermitTailCall(F, CI, RetI, *TLI))
2268         TailCallBBs.push_back(PredBB);
2269     }
2270   } else {
2271     SmallPtrSet<BasicBlock*, 4> VisitedBBs;
2272     for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
2273       if (!VisitedBBs.insert(*PI).second)
2274         continue;
2275       if (Instruction *I = (*PI)->rbegin()->getPrevNonDebugInstruction(true)) {
2276         CallInst *CI = dyn_cast<CallInst>(I);
2277         if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
2278             attributesPermitTailCall(F, CI, RetI, *TLI))
2279           TailCallBBs.push_back(*PI);
2280       }
2281     }
2282   }
2283 
2284   bool Changed = false;
2285   for (auto const &TailCallBB : TailCallBBs) {
2286     // Make sure the call instruction is followed by an unconditional branch to
2287     // the return block.
2288     BranchInst *BI = dyn_cast<BranchInst>(TailCallBB->getTerminator());
2289     if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
2290       continue;
2291 
2292     // Duplicate the return into TailCallBB.
2293     (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
2294     assert(!VerifyBFIUpdates ||
2295            BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
2296     BFI->setBlockFreq(
2297         BB,
2298         (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
2299     ModifiedDT = Changed = true;
2300     ++NumRetsDup;
2301   }
2302 
2303   // If we eliminated all predecessors of the block, delete the block now.
2304   if (Changed && !BB->hasAddressTaken() && pred_empty(BB))
2305     BB->eraseFromParent();
2306 
2307   return Changed;
2308 }
2309 
2310 //===----------------------------------------------------------------------===//
2311 // Memory Optimization
2312 //===----------------------------------------------------------------------===//
2313 
2314 namespace {
2315 
2316 /// This is an extended version of TargetLowering::AddrMode
2317 /// which holds actual Value*'s for register values.
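/// The address it describes is conceptually
///   BaseGV + BaseOffs + BaseReg + Scale * ScaledReg
/// (see the comment in isTrivial() below).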
2318 struct ExtAddrMode : public TargetLowering::AddrMode {
2319   Value *BaseReg = nullptr;
2320   Value *ScaledReg = nullptr;
2321   Value *OriginalValue = nullptr;
2322   bool InBounds = true;
2323 
2324   enum FieldName {
2325     NoField        = 0x00,
2326     BaseRegField   = 0x01,
2327     BaseGVField    = 0x02,
2328     BaseOffsField  = 0x04,
2329     ScaledRegField = 0x08,
2330     ScaleField     = 0x10,
2331     MultipleFields = 0xff
2332   };
2333 
2334 
2335   ExtAddrMode() = default;
2336 
2337   void print(raw_ostream &OS) const;
2338   void dump() const;
2339 
2340   FieldName compare(const ExtAddrMode &other) {
2341     // First check that the types are the same on each field, as differing types
2342     // is something we can't cope with later on.
2343     if (BaseReg && other.BaseReg &&
2344         BaseReg->getType() != other.BaseReg->getType())
2345       return MultipleFields;
2346     if (BaseGV && other.BaseGV &&
2347         BaseGV->getType() != other.BaseGV->getType())
2348       return MultipleFields;
2349     if (ScaledReg && other.ScaledReg &&
2350         ScaledReg->getType() != other.ScaledReg->getType())
2351       return MultipleFields;
2352 
2353     // Conservatively reject 'inbounds' mismatches.
2354     if (InBounds != other.InBounds)
2355       return MultipleFields;
2356 
2357     // Check each field to see if it differs.
2358     unsigned Result = NoField;
2359     if (BaseReg != other.BaseReg)
2360       Result |= BaseRegField;
2361     if (BaseGV != other.BaseGV)
2362       Result |= BaseGVField;
2363     if (BaseOffs != other.BaseOffs)
2364       Result |= BaseOffsField;
2365     if (ScaledReg != other.ScaledReg)
2366       Result |= ScaledRegField;
2367     // Don't count 0 as being a different scale, because that actually means
2368     // unscaled (which will already be counted by having no ScaledReg).
2369     if (Scale && other.Scale && Scale != other.Scale)
2370       Result |= ScaleField;
2371 
2372     if (countPopulation(Result) > 1)
2373       return MultipleFields;
2374     else
2375       return static_cast<FieldName>(Result);
2376   }
2377 
  // An AddrMode is trivial if it involves no calculation, i.e. it is just a
  // base with no offset.
2380   bool isTrivial() {
2381     // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
2382     // trivial if at most one of these terms is nonzero, except that BaseGV and
2383     // BaseReg both being zero actually means a null pointer value, which we
2384     // consider to be 'non-zero' here.
2385     return !BaseOffs && !Scale && !(BaseGV && BaseReg);
2386   }
2387 
2388   Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {
2389     switch (Field) {
2390     default:
2391       return nullptr;
2392     case BaseRegField:
2393       return BaseReg;
2394     case BaseGVField:
2395       return BaseGV;
2396     case ScaledRegField:
2397       return ScaledReg;
2398     case BaseOffsField:
2399       return ConstantInt::get(IntPtrTy, BaseOffs);
2400     }
2401   }
2402 
2403   void SetCombinedField(FieldName Field, Value *V,
2404                         const SmallVectorImpl<ExtAddrMode> &AddrModes) {
2405     switch (Field) {
2406     default:
2407       llvm_unreachable("Unhandled fields are expected to be rejected earlier");
2408       break;
2409     case ExtAddrMode::BaseRegField:
2410       BaseReg = V;
2411       break;
2412     case ExtAddrMode::BaseGVField:
2413       // A combined BaseGV is an Instruction, not a GlobalValue, so it goes
2414       // in the BaseReg field.
2415       assert(BaseReg == nullptr);
2416       BaseReg = V;
2417       BaseGV = nullptr;
2418       break;
2419     case ExtAddrMode::ScaledRegField:
2420       ScaledReg = V;
2421       // If we have a mix of scaled and unscaled addrmodes then we want scale
2422       // to be the scale and not zero.
2423       if (!Scale)
2424         for (const ExtAddrMode &AM : AddrModes)
2425           if (AM.Scale) {
2426             Scale = AM.Scale;
2427             break;
2428           }
2429       break;
2430     case ExtAddrMode::BaseOffsField:
2431       // The offset is no longer a constant, so it goes in ScaledReg with a
2432       // scale of 1.
2433       assert(ScaledReg == nullptr);
2434       ScaledReg = V;
2435       Scale = 1;
2436       BaseOffs = 0;
2437       break;
2438     }
2439   }
2440 };
2441 
2442 } // end anonymous namespace
2443 
2444 #ifndef NDEBUG
2445 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
2446   AM.print(OS);
2447   return OS;
2448 }
2449 #endif
2450 
2451 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2452 void ExtAddrMode::print(raw_ostream &OS) const {
2453   bool NeedPlus = false;
2454   OS << "[";
2455   if (InBounds)
2456     OS << "inbounds ";
2457   if (BaseGV) {
2458     OS << (NeedPlus ? " + " : "")
2459        << "GV:";
2460     BaseGV->printAsOperand(OS, /*PrintType=*/false);
2461     NeedPlus = true;
2462   }
2463 
2464   if (BaseOffs) {
2465     OS << (NeedPlus ? " + " : "")
2466        << BaseOffs;
2467     NeedPlus = true;
2468   }
2469 
2470   if (BaseReg) {
2471     OS << (NeedPlus ? " + " : "")
2472        << "Base:";
2473     BaseReg->printAsOperand(OS, /*PrintType=*/false);
2474     NeedPlus = true;
2475   }
2476   if (Scale) {
2477     OS << (NeedPlus ? " + " : "")
2478        << Scale << "*";
2479     ScaledReg->printAsOperand(OS, /*PrintType=*/false);
2480   }
2481 
2482   OS << ']';
2483 }
2484 
2485 LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
2486   print(dbgs());
2487   dbgs() << '\n';
2488 }
2489 #endif
2490 
2491 namespace {
2492 
2493 /// This class provides transaction based operation on the IR.
2494 /// Every change made through this class is recorded in the internal state and
2495 /// can be undone (rollback) until commit is called.
2496 /// CGP does not check if instructions could be speculatively executed when
2497 /// moved. Preserving the original location would pessimize the debugging
2498 /// experience, as well as negatively impact the quality of sample PGO.
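///
/// A typical client looks roughly like the sketch below. The constructor
/// argument and the restoration-point handling are illustrative assumptions,
/// not quotes from this class:
/// @code
///   TypePromotionTransaction TPT(RemovedInsts);
///   auto RestorePoint = TPT.getRestorationPoint();
///   // ... apply speculative promotions through TPT ...
///   if (!Profitable)
///     TPT.rollback(RestorePoint);
///   else
///     TPT.commit();
/// @endcode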
2499 class TypePromotionTransaction {
2500   /// This represents the common interface of the individual transaction.
2501   /// Each class implements the logic for doing one specific modification on
2502   /// the IR via the TypePromotionTransaction.
2503   class TypePromotionAction {
2504   protected:
2505     /// The Instruction modified.
2506     Instruction *Inst;
2507 
2508   public:
2509     /// Constructor of the action.
2510     /// The constructor performs the related action on the IR.
2511     TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
2512 
2513     virtual ~TypePromotionAction() = default;
2514 
2515     /// Undo the modification done by this action.
2516     /// When this method is called, the IR must be in the same state as it was
2517     /// before this action was applied.
2518     /// \pre Undoing the action works if and only if the IR is in the exact same
2519     /// state as it was directly after this action was applied.
2520     virtual void undo() = 0;
2521 
    /// Commit every change made by this action.
2523     /// When the results on the IR of the action are to be kept, it is important
2524     /// to call this function, otherwise hidden information may be kept forever.
2525     virtual void commit() {
2526       // Nothing to be done, this action is not doing anything.
2527     }
2528   };
2529 
2530   /// Utility to remember the position of an instruction.
2531   class InsertionHandler {
2532     /// Position of an instruction.
2533     /// Either an instruction:
2534     /// - Is the first in a basic block: BB is used.
2535     /// - Has a previous instruction: PrevInst is used.
2536     union {
2537       Instruction *PrevInst;
2538       BasicBlock *BB;
2539     } Point;
2540 
2541     /// Remember whether or not the instruction had a previous instruction.
2542     bool HasPrevInstruction;
2543 
2544   public:
2545     /// Record the position of \p Inst.
2546     InsertionHandler(Instruction *Inst) {
2547       BasicBlock::iterator It = Inst->getIterator();
2548       HasPrevInstruction = (It != (Inst->getParent()->begin()));
2549       if (HasPrevInstruction)
2550         Point.PrevInst = &*--It;
2551       else
2552         Point.BB = Inst->getParent();
2553     }
2554 
2555     /// Insert \p Inst at the recorded position.
2556     void insert(Instruction *Inst) {
2557       if (HasPrevInstruction) {
2558         if (Inst->getParent())
2559           Inst->removeFromParent();
2560         Inst->insertAfter(Point.PrevInst);
2561       } else {
2562         Instruction *Position = &*Point.BB->getFirstInsertionPt();
2563         if (Inst->getParent())
2564           Inst->moveBefore(Position);
2565         else
2566           Inst->insertBefore(Position);
2567       }
2568     }
2569   };
2570 
2571   /// Move an instruction before another.
2572   class InstructionMoveBefore : public TypePromotionAction {
2573     /// Original position of the instruction.
2574     InsertionHandler Position;
2575 
2576   public:
2577     /// Move \p Inst before \p Before.
2578     InstructionMoveBefore(Instruction *Inst, Instruction *Before)
2579         : TypePromotionAction(Inst), Position(Inst) {
2580       LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before
2581                         << "\n");
2582       Inst->moveBefore(Before);
2583     }
2584 
2585     /// Move the instruction back to its original position.
2586     void undo() override {
2587       LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
2588       Position.insert(Inst);
2589     }
2590   };
2591 
2592   /// Set the operand of an instruction with a new value.
2593   class OperandSetter : public TypePromotionAction {
2594     /// Original operand of the instruction.
2595     Value *Origin;
2596 
    /// Index of the modified operand.
2598     unsigned Idx;
2599 
2600   public:
2601     /// Set \p Idx operand of \p Inst with \p NewVal.
2602     OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
2603         : TypePromotionAction(Inst), Idx(Idx) {
2604       LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
2605                         << "for:" << *Inst << "\n"
2606                         << "with:" << *NewVal << "\n");
2607       Origin = Inst->getOperand(Idx);
2608       Inst->setOperand(Idx, NewVal);
2609     }
2610 
2611     /// Restore the original value of the instruction.
2612     void undo() override {
2613       LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
2614                         << "for: " << *Inst << "\n"
2615                         << "with: " << *Origin << "\n");
2616       Inst->setOperand(Idx, Origin);
2617     }
2618   };
2619 
2620   /// Hide the operands of an instruction.
2621   /// Do as if this instruction was not using any of its operands.
2622   class OperandsHider : public TypePromotionAction {
2623     /// The list of original operands.
2624     SmallVector<Value *, 4> OriginalValues;
2625 
2626   public:
2627     /// Remove \p Inst from the uses of the operands of \p Inst.
2628     OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
2629       LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
2630       unsigned NumOpnds = Inst->getNumOperands();
2631       OriginalValues.reserve(NumOpnds);
2632       for (unsigned It = 0; It < NumOpnds; ++It) {
2633         // Save the current operand.
2634         Value *Val = Inst->getOperand(It);
2635         OriginalValues.push_back(Val);
2636         // Set a dummy one.
2637         // We could use OperandSetter here, but that would imply an overhead
2638         // that we are not willing to pay.
2639         Inst->setOperand(It, UndefValue::get(Val->getType()));
2640       }
2641     }
2642 
2643     /// Restore the original list of uses.
2644     void undo() override {
2645       LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
2646       for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
2647         Inst->setOperand(It, OriginalValues[It]);
2648     }
2649   };
2650 
2651   /// Build a truncate instruction.
2652   class TruncBuilder : public TypePromotionAction {
2653     Value *Val;
2654 
2655   public:
2656     /// Build a truncate instruction of \p Opnd producing a \p Ty
2657     /// result.
2658     /// trunc Opnd to Ty.
2659     TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
2660       IRBuilder<> Builder(Opnd);
2661       Builder.SetCurrentDebugLocation(DebugLoc());
2662       Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
2663       LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
2664     }
2665 
2666     /// Get the built value.
2667     Value *getBuiltValue() { return Val; }
2668 
2669     /// Remove the built instruction.
2670     void undo() override {
2671       LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
2672       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2673         IVal->eraseFromParent();
2674     }
2675   };
2676 
2677   /// Build a sign extension instruction.
2678   class SExtBuilder : public TypePromotionAction {
2679     Value *Val;
2680 
2681   public:
2682     /// Build a sign extension instruction of \p Opnd producing a \p Ty
2683     /// result.
2684     /// sext Opnd to Ty.
2685     SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2686         : TypePromotionAction(InsertPt) {
2687       IRBuilder<> Builder(InsertPt);
2688       Val = Builder.CreateSExt(Opnd, Ty, "promoted");
2689       LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
2690     }
2691 
2692     /// Get the built value.
2693     Value *getBuiltValue() { return Val; }
2694 
2695     /// Remove the built instruction.
2696     void undo() override {
2697       LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
2698       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2699         IVal->eraseFromParent();
2700     }
2701   };
2702 
2703   /// Build a zero extension instruction.
2704   class ZExtBuilder : public TypePromotionAction {
2705     Value *Val;
2706 
2707   public:
2708     /// Build a zero extension instruction of \p Opnd producing a \p Ty
2709     /// result.
2710     /// zext Opnd to Ty.
2711     ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2712         : TypePromotionAction(InsertPt) {
2713       IRBuilder<> Builder(InsertPt);
2714       Builder.SetCurrentDebugLocation(DebugLoc());
2715       Val = Builder.CreateZExt(Opnd, Ty, "promoted");
2716       LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
2717     }
2718 
2719     /// Get the built value.
2720     Value *getBuiltValue() { return Val; }
2721 
2722     /// Remove the built instruction.
2723     void undo() override {
2724       LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
2725       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2726         IVal->eraseFromParent();
2727     }
2728   };
2729 
2730   /// Mutate an instruction to another type.
2731   class TypeMutator : public TypePromotionAction {
2732     /// Record the original type.
2733     Type *OrigTy;
2734 
2735   public:
2736     /// Mutate the type of \p Inst into \p NewTy.
2737     TypeMutator(Instruction *Inst, Type *NewTy)
2738         : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
2739       LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
2740                         << "\n");
2741       Inst->mutateType(NewTy);
2742     }
2743 
2744     /// Mutate the instruction back to its original type.
2745     void undo() override {
2746       LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
2747                         << "\n");
2748       Inst->mutateType(OrigTy);
2749     }
2750   };
2751 
  /// Replace the uses of an instruction with another value.
2753   class UsesReplacer : public TypePromotionAction {
2754     /// Helper structure to keep track of the replaced uses.
2755     struct InstructionAndIdx {
      /// The instruction that uses the replaced value.
      Instruction *Inst;

      /// The operand index at which the replaced value is used by Inst.
      unsigned Idx;
2761 
2762       InstructionAndIdx(Instruction *Inst, unsigned Idx)
2763           : Inst(Inst), Idx(Idx) {}
2764     };
2765 
2766     /// Keep track of the original uses (pair Instruction, Index).
2767     SmallVector<InstructionAndIdx, 4> OriginalUses;
2768     /// Keep track of the debug users.
2769     SmallVector<DbgValueInst *, 1> DbgValues;
2770 
2771     using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;
2772 
2773   public:
    /// Replace all the uses of \p Inst with \p New.
2775     UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
2776       LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
2777                         << "\n");
2778       // Record the original uses.
2779       for (Use &U : Inst->uses()) {
2780         Instruction *UserI = cast<Instruction>(U.getUser());
2781         OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
2782       }
2783       // Record the debug uses separately. They are not in the instruction's
2784       // use list, but they are replaced by RAUW.
2785       findDbgValues(DbgValues, Inst);
2786 
2787       // Now, we can replace the uses.
2788       Inst->replaceAllUsesWith(New);
2789     }
2790 
    /// Reassign the original uses of Inst back to Inst.
2792     void undo() override {
2793       LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
2794       for (use_iterator UseIt = OriginalUses.begin(),
2795                         EndIt = OriginalUses.end();
2796            UseIt != EndIt; ++UseIt) {
2797         UseIt->Inst->setOperand(UseIt->Idx, Inst);
2798       }
2799       // RAUW has replaced all original uses with references to the new value,
2800       // including the debug uses. Since we are undoing the replacements,
2801       // the original debug uses must also be reinstated to maintain the
2802       // correctness and utility of debug value instructions.
2803       for (auto *DVI: DbgValues) {
2804         LLVMContext &Ctx = Inst->getType()->getContext();
2805         auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst));
2806         DVI->setOperand(0, MV);
2807       }
2808     }
2809   };
2810 
2811   /// Remove an instruction from the IR.
2812   class InstructionRemover : public TypePromotionAction {
2813     /// Original position of the instruction.
2814     InsertionHandler Inserter;
2815 
    /// Helper structure to hide all the links to the instruction. In other
    /// words, this helps to act as if the instruction were removed.
2818     OperandsHider Hider;
2819 
2820     /// Keep track of the uses replaced, if any.
2821     UsesReplacer *Replacer = nullptr;
2822 
2823     /// Keep track of instructions removed.
2824     SetOfInstrs &RemovedInsts;
2825 
2826   public:
    /// Remove all references to \p Inst and optionally replace all its
    /// uses with \p New.
    /// \p RemovedInsts keeps track of the instructions removed by this Action.
2830     /// \pre If !Inst->use_empty(), then New != nullptr
2831     InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
2832                        Value *New = nullptr)
2833         : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
2834           RemovedInsts(RemovedInsts) {
2835       if (New)
2836         Replacer = new UsesReplacer(Inst, New);
2837       LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
2838       RemovedInsts.insert(Inst);
2839       /// The instructions removed here will be freed after completing
2840       /// optimizeBlock() for all blocks as we need to keep track of the
2841       /// removed instructions during promotion.
2842       Inst->removeFromParent();
2843     }
2844 
2845     ~InstructionRemover() override { delete Replacer; }
2846 
    /// Resurrect the instruction and reassign it to the proper uses if
    /// a new value was provided when building this action.
2849     void undo() override {
2850       LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
2851       Inserter.insert(Inst);
2852       if (Replacer)
2853         Replacer->undo();
2854       Hider.undo();
2855       RemovedInsts.erase(Inst);
2856     }
2857   };
2858 
2859 public:
2860   /// Restoration point.
2861   /// The restoration point is a pointer to an action instead of an iterator
2862   /// because the iterator may be invalidated but not the pointer.
2863   using ConstRestorationPt = const TypePromotionAction *;
2864 
2865   TypePromotionTransaction(SetOfInstrs &RemovedInsts)
2866       : RemovedInsts(RemovedInsts) {}
2867 
  /// Commit every change made in this transaction. Return true if any change
  /// happened.
2870   bool commit();
2871 
2872   /// Undo all the changes made after the given point.
2873   void rollback(ConstRestorationPt Point);
2874 
2875   /// Get the current restoration point.
2876   ConstRestorationPt getRestorationPoint() const;
2877 
2878   /// \name API for IR modification with state keeping to support rollback.
2879   /// @{
2880   /// Same as Instruction::setOperand.
2881   void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
2882 
2883   /// Same as Instruction::eraseFromParent.
2884   void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
2885 
2886   /// Same as Value::replaceAllUsesWith.
2887   void replaceAllUsesWith(Instruction *Inst, Value *New);
2888 
2889   /// Same as Value::mutateType.
2890   void mutateType(Instruction *Inst, Type *NewTy);
2891 
  /// Same as IRBuilder::CreateTrunc.
2893   Value *createTrunc(Instruction *Opnd, Type *Ty);
2894 
  /// Same as IRBuilder::CreateSExt.
2896   Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
2897 
  /// Same as IRBuilder::CreateZExt.
2899   Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
2900 
2901   /// Same as Instruction::moveBefore.
2902   void moveBefore(Instruction *Inst, Instruction *Before);
2903   /// @}
2904 
2905 private:
2906   /// The ordered list of actions made so far.
2907   SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
2908 
  using CommitPt =
      SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;
2910 
2911   SetOfInstrs &RemovedInsts;
2912 };
2913 
2914 } // end anonymous namespace
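
// Illustrative use of the transaction API (a sketch, not code from this pass;
// the names I, Opnd, NewTy, and the IsProfitable predicate are hypothetical):
// callers take a restoration point, apply speculative rewrites through the
// TPT wrappers, and then either commit or roll back.
//
//   TypePromotionTransaction TPT(RemovedInsts);
//   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
//       TPT.getRestorationPoint();
//   TPT.mutateType(I, NewTy);                 // recorded as a TypeMutator
//   Value *Promoted = TPT.createSExt(I, Opnd, NewTy); // recorded as well
//   if (!IsProfitable)                        // hypothetical predicate
//     TPT.rollback(LastKnownGood);            // undo past the point, LIFO
//   else
//     TPT.commit();                           // drop the undo information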
2915 
2916 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
2917                                           Value *NewVal) {
2918   Actions.push_back(std::make_unique<TypePromotionTransaction::OperandSetter>(
2919       Inst, Idx, NewVal));
2920 }
2921 
2922 void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
2923                                                 Value *NewVal) {
2924   Actions.push_back(
2925       std::make_unique<TypePromotionTransaction::InstructionRemover>(
2926           Inst, RemovedInsts, NewVal));
2927 }
2928 
2929 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
2930                                                   Value *New) {
2931   Actions.push_back(
2932       std::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
2933 }
2934 
2935 void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
2936   Actions.push_back(
2937       std::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
2938 }
2939 
Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, Type *Ty) {
2942   std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
2943   Value *Val = Ptr->getBuiltValue();
2944   Actions.push_back(std::move(Ptr));
2945   return Val;
2946 }
2947 
2948 Value *TypePromotionTransaction::createSExt(Instruction *Inst,
2949                                             Value *Opnd, Type *Ty) {
2950   std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
2951   Value *Val = Ptr->getBuiltValue();
2952   Actions.push_back(std::move(Ptr));
2953   return Val;
2954 }
2955 
2956 Value *TypePromotionTransaction::createZExt(Instruction *Inst,
2957                                             Value *Opnd, Type *Ty) {
2958   std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
2959   Value *Val = Ptr->getBuiltValue();
2960   Actions.push_back(std::move(Ptr));
2961   return Val;
2962 }
2963 
2964 void TypePromotionTransaction::moveBefore(Instruction *Inst,
2965                                           Instruction *Before) {
2966   Actions.push_back(
2967       std::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
2968           Inst, Before));
2969 }
2970 
2971 TypePromotionTransaction::ConstRestorationPt
2972 TypePromotionTransaction::getRestorationPoint() const {
2973   return !Actions.empty() ? Actions.back().get() : nullptr;
2974 }
2975 
2976 bool TypePromotionTransaction::commit() {
2977   for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
2978        ++It)
2979     (*It)->commit();
2980   bool Modified = !Actions.empty();
2981   Actions.clear();
2982   return Modified;
2983 }
2984 
2985 void TypePromotionTransaction::rollback(
2986     TypePromotionTransaction::ConstRestorationPt Point) {
2987   while (!Actions.empty() && Point != Actions.back().get()) {
2988     std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
2989     Curr->undo();
2990   }
2991 }
2992 
2993 namespace {
2994 
2995 /// A helper class for matching addressing modes.
2996 ///
2997 /// This encapsulates the logic for matching the target-legal addressing modes.
2998 class AddressingModeMatcher {
2999   SmallVectorImpl<Instruction*> &AddrModeInsts;
3000   const TargetLowering &TLI;
3001   const TargetRegisterInfo &TRI;
3002   const DataLayout &DL;
3003 
3004   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
3005   /// the memory instruction that we're computing this address for.
3006   Type *AccessTy;
3007   unsigned AddrSpace;
3008   Instruction *MemoryInst;
3009 
3010   /// This is the addressing mode that we're building up. This is
3011   /// part of the return value of this addressing mode matching stuff.
3012   ExtAddrMode &AddrMode;
3013 
3014   /// The instructions inserted by other CodeGenPrepare optimizations.
3015   const SetOfInstrs &InsertedInsts;
3016 
3017   /// A map from the instructions to their type before promotion.
3018   InstrToOrigTy &PromotedInsts;
3019 
3020   /// The ongoing transaction where every action should be registered.
3021   TypePromotionTransaction &TPT;
3022 
  // A GEP whose offset is too large to be folded into the addressing mode.
3024   std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;
3025 
3026   /// This is set to true when we should not do profitability checks.
  /// When true, isProfitableToFoldIntoAddressingMode always returns true.
3028   bool IgnoreProfitability;
3029 
3030   /// True if we are optimizing for size.
3031   bool OptSize;
3032 
3033   ProfileSummaryInfo *PSI;
3034   BlockFrequencyInfo *BFI;
3035 
3036   AddressingModeMatcher(
3037       SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
3038       const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
3039       ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
3040       InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
3041       std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
3042       bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
3043       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
3044         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
3045         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
3046         PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
3047         OptSize(OptSize), PSI(PSI), BFI(BFI) {
3048     IgnoreProfitability = false;
3049   }
3050 
3051 public:
3052   /// Find the maximal addressing mode that a load/store of V can fold,
  /// given an access type of AccessTy.  This returns a list of involved
3054   /// instructions in AddrModeInsts.
3055   /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
3056   /// optimizations.
3057   /// \p PromotedInsts maps the instructions to their type before promotion.
  /// \p TPT The ongoing transaction where every action should be registered.
3059   static ExtAddrMode
3060   Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
3061         SmallVectorImpl<Instruction *> &AddrModeInsts,
3062         const TargetLowering &TLI, const TargetRegisterInfo &TRI,
3063         const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
3064         TypePromotionTransaction &TPT,
3065         std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
3066         bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
3067     ExtAddrMode Result;
3068 
3069     bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
3070                                          MemoryInst, Result, InsertedInsts,
3071                                          PromotedInsts, TPT, LargeOffsetGEP,
3072                                          OptSize, PSI, BFI)
3073                        .matchAddr(V, 0);
    (void)Success;
    assert(Success && "Couldn't select *anything*?");
3075     return Result;
3076   }
3077 
3078 private:
3079   bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
3080   bool matchAddr(Value *Addr, unsigned Depth);
3081   bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
3082                           bool *MovedAway = nullptr);
3083   bool isProfitableToFoldIntoAddressingMode(Instruction *I,
3084                                             ExtAddrMode &AMBefore,
3085                                             ExtAddrMode &AMAfter);
3086   bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
3087   bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
3088                              Value *PromotedOperand) const;
3089 };
3090 
3091 class PhiNodeSet;
3092 
3093 /// An iterator for PhiNodeSet.
3094 class PhiNodeSetIterator {
3095   PhiNodeSet * const Set;
3096   size_t CurrentIndex = 0;
3097 
3098 public:
3099   /// The constructor. Start should point to either a valid element, or be equal
3100   /// to the size of the underlying SmallVector of the PhiNodeSet.
3101   PhiNodeSetIterator(PhiNodeSet * const Set, size_t Start);
3102   PHINode * operator*() const;
3103   PhiNodeSetIterator& operator++();
3104   bool operator==(const PhiNodeSetIterator &RHS) const;
3105   bool operator!=(const PhiNodeSetIterator &RHS) const;
3106 };
3107 
3108 /// Keeps a set of PHINodes.
3109 ///
3110 /// This is a minimal set implementation for a specific use case:
3111 /// It is very fast when there are very few elements, but also provides good
3112 /// performance when there are many. It is similar to SmallPtrSet, but also
3113 /// provides iteration by insertion order, which is deterministic and stable
3114 /// across runs. It is also similar to SmallSetVector, but provides removing
3115 /// elements in O(1) time. This is achieved by not actually removing the element
3116 /// from the underlying vector, so comes at the cost of using more memory, but
3117 /// that is fine, since PhiNodeSets are used as short lived objects.
3118 class PhiNodeSet {
3119   friend class PhiNodeSetIterator;
3120 
3121   using MapType = SmallDenseMap<PHINode *, size_t, 32>;
  using iterator = PhiNodeSetIterator;
3123 
3124   /// Keeps the elements in the order of their insertion in the underlying
3125   /// vector. To achieve constant time removal, it never deletes any element.
3126   SmallVector<PHINode *, 32> NodeList;
3127 
3128   /// Keeps the elements in the underlying set implementation. This (and not the
3129   /// NodeList defined above) is the source of truth on whether an element
3130   /// is actually in the collection.
3131   MapType NodeMap;
3132 
3133   /// Points to the first valid (not deleted) element when the set is not empty
  /// and the value is not zero. Equals the size of the underlying vector
3135   /// when the set is empty. When the value is 0, as in the beginning, the
3136   /// first element may or may not be valid.
3137   size_t FirstValidElement = 0;
3138 
3139 public:
3140   /// Inserts a new element to the collection.
3141   /// \returns true if the element is actually added, i.e. was not in the
3142   /// collection before the operation.
3143   bool insert(PHINode *Ptr) {
3144     if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) {
3145       NodeList.push_back(Ptr);
3146       return true;
3147     }
3148     return false;
3149   }
3150 
3151   /// Removes the element from the collection.
3152   /// \returns whether the element is actually removed, i.e. was in the
3153   /// collection before the operation.
3154   bool erase(PHINode *Ptr) {
3155     auto it = NodeMap.find(Ptr);
3156     if (it != NodeMap.end()) {
3157       NodeMap.erase(Ptr);
3158       SkipRemovedElements(FirstValidElement);
3159       return true;
3160     }
3161     return false;
3162   }
3163 
3164   /// Removes all elements and clears the collection.
3165   void clear() {
3166     NodeMap.clear();
3167     NodeList.clear();
3168     FirstValidElement = 0;
3169   }
3170 
3171   /// \returns an iterator that will iterate the elements in the order of
3172   /// insertion.
3173   iterator begin() {
3174     if (FirstValidElement == 0)
3175       SkipRemovedElements(FirstValidElement);
3176     return PhiNodeSetIterator(this, FirstValidElement);
3177   }
3178 
3179   /// \returns an iterator that points to the end of the collection.
3180   iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }
3181 
3182   /// Returns the number of elements in the collection.
3183   size_t size() const {
3184     return NodeMap.size();
3185   }
3186 
  /// \returns 1 if the given element is in the collection, and 0 otherwise.
3188   size_t count(PHINode *Ptr) const {
3189     return NodeMap.count(Ptr);
3190   }
3191 
3192 private:
3193   /// Updates the CurrentIndex so that it will point to a valid element.
3194   ///
3195   /// If the element of NodeList at CurrentIndex is valid, it does not
3196   /// change it. If there are no more valid elements, it updates CurrentIndex
3197   /// to point to the end of the NodeList.
3198   void SkipRemovedElements(size_t &CurrentIndex) {
3199     while (CurrentIndex < NodeList.size()) {
3200       auto it = NodeMap.find(NodeList[CurrentIndex]);
3201       // If the element has been deleted and added again later, NodeMap will
3202       // point to a different index, so CurrentIndex will still be invalid.
3203       if (it != NodeMap.end() && it->second == CurrentIndex)
3204         break;
3205       ++CurrentIndex;
3206     }
3207   }
3208 };
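
// Illustrative usage of PhiNodeSet (a sketch, not part of this pass's logic;
// P0, P1, and Visit are placeholder names):
//
//   PhiNodeSet Phis;
//   Phis.insert(P0);
//   Phis.insert(P1);
//   Phis.erase(P0);           // constant time: P0 leaves NodeMap but stays in
//                             // NodeList as a skipped slot
//   for (PHINode *P : Phis)   // iteration skips erased slots and preserves
//     Visit(P);               // insertion order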
3209 
3210 PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
3211     : Set(Set), CurrentIndex(Start) {}
3212 
3213 PHINode * PhiNodeSetIterator::operator*() const {
3214   assert(CurrentIndex < Set->NodeList.size() &&
3215          "PhiNodeSet access out of range");
3216   return Set->NodeList[CurrentIndex];
3217 }
3218 
3219 PhiNodeSetIterator& PhiNodeSetIterator::operator++() {
3220   assert(CurrentIndex < Set->NodeList.size() &&
3221          "PhiNodeSet access out of range");
3222   ++CurrentIndex;
3223   Set->SkipRemovedElements(CurrentIndex);
3224   return *this;
3225 }
3226 
3227 bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
3228   return CurrentIndex == RHS.CurrentIndex;
3229 }
3230 
3231 bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
3232   return !((*this) == RHS);
3233 }
3234 
/// Keeps track of the simplification of Phi nodes.
/// It accepts the set of all phi nodes and erases a phi node from this set
/// if it is simplified.
3238 class SimplificationTracker {
3239   DenseMap<Value *, Value *> Storage;
3240   const SimplifyQuery &SQ;
3241   // Tracks newly created Phi nodes. The elements are iterated by insertion
3242   // order.
3243   PhiNodeSet AllPhiNodes;
3244   // Tracks newly created Select nodes.
3245   SmallPtrSet<SelectInst *, 32> AllSelectNodes;
3246 
3247 public:
  SimplificationTracker(const SimplifyQuery &sq) : SQ(sq) {}
3250 
3251   Value *Get(Value *V) {
3252     do {
3253       auto SV = Storage.find(V);
3254       if (SV == Storage.end())
3255         return V;
3256       V = SV->second;
3257     } while (true);
3258   }
3259 
3260   Value *Simplify(Value *Val) {
3261     SmallVector<Value *, 32> WorkList;
3262     SmallPtrSet<Value *, 32> Visited;
3263     WorkList.push_back(Val);
3264     while (!WorkList.empty()) {
3265       auto *P = WorkList.pop_back_val();
3266       if (!Visited.insert(P).second)
3267         continue;
3268       if (auto *PI = dyn_cast<Instruction>(P))
        if (Value *V = SimplifyInstruction(PI, SQ)) {
3270           for (auto *U : PI->users())
3271             WorkList.push_back(cast<Value>(U));
3272           Put(PI, V);
3273           PI->replaceAllUsesWith(V);
3274           if (auto *PHI = dyn_cast<PHINode>(PI))
3275             AllPhiNodes.erase(PHI);
3276           if (auto *Select = dyn_cast<SelectInst>(PI))
3277             AllSelectNodes.erase(Select);
3278           PI->eraseFromParent();
3279         }
3280     }
3281     return Get(Val);
3282   }
3283 
3284   void Put(Value *From, Value *To) {
3285     Storage.insert({ From, To });
3286   }
3287 
3288   void ReplacePhi(PHINode *From, PHINode *To) {
3289     Value* OldReplacement = Get(From);
3290     while (OldReplacement != From) {
3291       From = To;
3292       To = dyn_cast<PHINode>(OldReplacement);
3293       OldReplacement = Get(From);
3294     }
3295     assert(To && Get(To) == To && "Replacement PHI node is already replaced.");
3296     Put(From, To);
3297     From->replaceAllUsesWith(To);
3298     AllPhiNodes.erase(From);
3299     From->eraseFromParent();
3300   }
3301 
3302   PhiNodeSet& newPhiNodes() { return AllPhiNodes; }
3303 
3304   void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }
3305 
3306   void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }
3307 
3308   unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }
3309 
3310   unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }
3311 
3312   void destroyNewNodes(Type *CommonType) {
3313     // For safe erasing, replace the uses with dummy value first.
3314     auto *Dummy = UndefValue::get(CommonType);
3315     for (auto *I : AllPhiNodes) {
3316       I->replaceAllUsesWith(Dummy);
3317       I->eraseFromParent();
3318     }
3319     AllPhiNodes.clear();
3320     for (auto *I : AllSelectNodes) {
3321       I->replaceAllUsesWith(Dummy);
3322       I->eraseFromParent();
3323     }
3324     AllSelectNodes.clear();
3325   }
3326 };
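
// Illustrative note on the tracker's replacement chains (a sketch): Storage
// records From -> To replacements, and Get follows the chain to the final
// value. For example, after Put(A, B) and Put(B, C), Get(A) returns C; this is
// what lets Simplify report the up-to-date value even when a replacement is
// itself later replaced.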
3327 
3328 /// A helper class for combining addressing modes.
3329 class AddressingModeCombiner {
  using FoldAddrToValueMapping = DenseMap<Value *, Value *>;
  using PHIPair = std::pair<PHINode *, PHINode *>;
3332 
3333 private:
3334   /// The addressing modes we've collected.
3335   SmallVector<ExtAddrMode, 16> AddrModes;
3336 
3337   /// The field in which the AddrModes differ, when we have more than one.
3338   ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;
3339 
3340   /// Are the AddrModes that we have all just equal to their original values?
3341   bool AllAddrModesTrivial = true;
3342 
3343   /// Common Type for all different fields in addressing modes.
3344   Type *CommonType;
3345 
3346   /// SimplifyQuery for simplifyInstruction utility.
3347   const SimplifyQuery &SQ;
3348 
3349   /// Original Address.
3350   Value *Original;
3351 
3352 public:
3353   AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue)
3354       : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}
3355 
3356   /// Get the combined AddrMode
3357   const ExtAddrMode &getAddrMode() const {
3358     return AddrModes[0];
3359   }
3360 
3361   /// Add a new AddrMode if it's compatible with the AddrModes we already
3362   /// have.
3363   /// \return True iff we succeeded in doing so.
3364   bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
    // Take note of whether we have any non-trivial AddrModes, as we need to
    // detect when all AddrModes are trivial, since then we would introduce a
    // phi or select which just duplicates what's already there.
3368     AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();
3369 
3370     // If this is the first addrmode then everything is fine.
3371     if (AddrModes.empty()) {
3372       AddrModes.emplace_back(NewAddrMode);
3373       return true;
3374     }
3375 
3376     // Figure out how different this is from the other address modes, which we
3377     // can do just by comparing against the first one given that we only care
3378     // about the cumulative difference.
3379     ExtAddrMode::FieldName ThisDifferentField =
3380       AddrModes[0].compare(NewAddrMode);
3381     if (DifferentField == ExtAddrMode::NoField)
3382       DifferentField = ThisDifferentField;
3383     else if (DifferentField != ThisDifferentField)
3384       DifferentField = ExtAddrMode::MultipleFields;
3385 
3386     // If NewAddrMode differs in more than one dimension we cannot handle it.
3387     bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;
3388 
    // If the Scale field is different then we reject.
    CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;

    // We must also reject the case when the base offset is different and the
    // scale reg is not null; we cannot handle this case because the merge of
    // the different offsets would have to be used as the ScaleReg.
    CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||
                              !NewAddrMode.ScaledReg);

    // We must also reject the case when the GV is different and a BaseReg is
    // installed, because we want to use the base reg as a merge of the GV
    // values.
    CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||
                              !NewAddrMode.HasBaseReg);
3402 
    // Even if NewAddrMode is the same we still need to collect it, because the
    // original value is different. Later we will need all the original values
    // as anchors when finding the common Phi node.
3406     if (CanHandle)
3407       AddrModes.emplace_back(NewAddrMode);
3408     else
3409       AddrModes.clear();
3410 
3411     return CanHandle;
3412   }
3413 
3414   /// Combine the addressing modes we've collected into a single
3415   /// addressing mode.
3416   /// \return True iff we successfully combined them or we only had one so
3417   /// didn't need to combine them anyway.
3418   bool combineAddrModes() {
3419     // If we have no AddrModes then they can't be combined.
3420     if (AddrModes.size() == 0)
3421       return false;
3422 
3423     // A single AddrMode can trivially be combined.
3424     if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
3425       return true;
3426 
3427     // If the AddrModes we collected are all just equal to the value they are
3428     // derived from then combining them wouldn't do anything useful.
3429     if (AllAddrModesTrivial)
3430       return false;
3431 
3432     if (!addrModeCombiningAllowed())
3433       return false;
3434 
    // Build a map from each original address value to the value of the
    // differing field in its addressing mode.
    // Bail out if there is no common type.
3438     FoldAddrToValueMapping Map;
3439     if (!initializeMap(Map))
3440       return false;
3441 
3442     Value *CommonValue = findCommon(Map);
3443     if (CommonValue)
3444       AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);
3445     return CommonValue != nullptr;
3446   }
3447 
3448 private:
  /// Initialize Map with anchor values. For each address seen,
  /// we set the value of the differing field seen in this address.
  /// At the same time we find a common type for the differing fields that we
  /// will use to create new Phi/Select nodes. Keep it in the CommonType field.
  /// Return false if there is no common type found.
3454   bool initializeMap(FoldAddrToValueMapping &Map) {
3455     // Keep track of keys where the value is null. We will need to replace it
3456     // with constant null when we know the common type.
3457     SmallVector<Value *, 2> NullValue;
3458     Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
3459     for (auto &AM : AddrModes) {
3460       Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
3461       if (DV) {
        auto *Ty = DV->getType();
        if (CommonType && CommonType != Ty)
          return false;
        CommonType = Ty;
3466         Map[AM.OriginalValue] = DV;
3467       } else {
3468         NullValue.push_back(AM.OriginalValue);
3469       }
3470     }
3471     assert(CommonType && "At least one non-null value must be!");
3472     for (auto *V : NullValue)
3473       Map[V] = Constant::getNullValue(CommonType);
3474     return true;
3475   }
3476 
  /// We have a mapping between value A and other value B where B was a field
  /// in the addressing mode represented by A. Also we have an original value C
  /// representing the address we start with. Traversing from C through phis
  /// and selects we ended up with the A's in the map. This utility function
  /// tries to find a value V which is a field in addressing mode C such that,
  /// traversing through phi nodes and selects from it, we end up with the
  /// corresponding B values from the map.
  /// The utility will create new Phis/Selects if needed.
3484   // The simple example looks as follows:
3485   // BB1:
3486   //   p1 = b1 + 40
3487   //   br cond BB2, BB3
3488   // BB2:
3489   //   p2 = b2 + 40
3490   //   br BB3
3491   // BB3:
3492   //   p = phi [p1, BB1], [p2, BB2]
3493   //   v = load p
3494   // Map is
3495   //   p1 -> b1
3496   //   p2 -> b2
3497   // Request is
3498   //   p -> ?
3499   // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
3500   Value *findCommon(FoldAddrToValueMapping &Map) {
    // Tracks the simplification of newly created phi nodes. The reason we use
    // this mapping is that we will add newly created Phi nodes to the
    // addr-to-value map. Simplification of Phi nodes is recursive, so some
    // Phi node may be simplified after we added it to the map. In reality
    // this simplification is possible only if the original phis/selects were
    // not simplified yet.
    // Using this mapping we can find the current value in the map.
3508     SimplificationTracker ST(SQ);
3509 
3510     // First step, DFS to create PHI nodes for all intermediate blocks.
3511     // Also fill traverse order for the second step.
3512     SmallVector<Value *, 32> TraverseOrder;
3513     InsertPlaceholders(Map, TraverseOrder, ST);
3514 
3515     // Second Step, fill new nodes by merged values and simplify if possible.
3516     FillPlaceholders(Map, TraverseOrder, ST);
3517 
3518     if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) {
3519       ST.destroyNewNodes(CommonType);
3520       return nullptr;
3521     }
3522 
    // Now we'd like to match the new Phi nodes to existing ones.
3524     unsigned PhiNotMatchedCount = 0;
3525     if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
3526       ST.destroyNewNodes(CommonType);
3527       return nullptr;
3528     }
3529 
3530     auto *Result = ST.Get(Map.find(Original)->second);
3531     if (Result) {
3532       NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
3533       NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
3534     }
3535     return Result;
3536   }
3537 
3538   /// Try to match PHI node to Candidate.
3539   /// Matcher tracks the matched Phi nodes.
3540   bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
3541                     SmallSetVector<PHIPair, 8> &Matcher,
3542                     PhiNodeSet &PhiNodesToMatch) {
3543     SmallVector<PHIPair, 8> WorkList;
3544     Matcher.insert({ PHI, Candidate });
3545     SmallSet<PHINode *, 8> MatchedPHIs;
3546     MatchedPHIs.insert(PHI);
3547     WorkList.push_back({ PHI, Candidate });
3548     SmallSet<PHIPair, 8> Visited;
3549     while (!WorkList.empty()) {
3550       auto Item = WorkList.pop_back_val();
3551       if (!Visited.insert(Item).second)
3552         continue;
      // We iterate over all incoming values of Phi to compare them.
      // If the values are different, both of them are Phis, the first one is
      // a Phi we added (subject to match), and both of them are in the same
      // basic block, then we can match our pair if their values match. So we
      // state that these values match and add them to the work list to verify
      // that.
3558       for (auto B : Item.first->blocks()) {
3559         Value *FirstValue = Item.first->getIncomingValueForBlock(B);
3560         Value *SecondValue = Item.second->getIncomingValueForBlock(B);
3561         if (FirstValue == SecondValue)
3562           continue;
3563 
3564         PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
3565         PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);
3566 
        // If one of them is not a Phi, or
        // the first one is not a Phi node from the set we'd like to match, or
        // the Phi nodes are from different basic blocks, then
        // we will not be able to match.
3571         if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
3572             FirstPhi->getParent() != SecondPhi->getParent())
3573           return false;
3574 
3575         // If we already matched them then continue.
3576         if (Matcher.count({ FirstPhi, SecondPhi }))
3577           continue;
        // So the values are different and do not match. So we need them to
3579         // match. (But we register no more than one match per PHI node, so that
3580         // we won't later try to replace them twice.)
3581         if (MatchedPHIs.insert(FirstPhi).second)
3582           Matcher.insert({ FirstPhi, SecondPhi });
        // But we must check it.
3584         WorkList.push_back({ FirstPhi, SecondPhi });
3585       }
3586     }
3587     return true;
3588   }
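
  // Illustrative example for MatchPhiNode (a sketch; the value names are
  // placeholders): given
  //   %sunk_phi = phi i64 [ %a, %bb1 ], [ %b, %bb2 ]
  //   %existing = phi i64 [ %a, %bb1 ], [ %b, %bb2 ]
  // the pair (%sunk_phi, %existing) matches because every incoming value is
  // equal block by block; when incoming values are themselves phis from the
  // set being matched, the pair is pushed on the work list and verified
  // recursively.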
3589 
3590   /// For the given set of PHI nodes (in the SimplificationTracker) try
3591   /// to find their equivalents.
  /// Returns false if this matching fails and creation of new Phis is disabled.
3593   bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
3594                    unsigned &PhiNotMatchedCount) {
3595     // Matched and PhiNodesToMatch iterate their elements in a deterministic
3596     // order, so the replacements (ReplacePhi) are also done in a deterministic
3597     // order.
3598     SmallSetVector<PHIPair, 8> Matched;
3599     SmallPtrSet<PHINode *, 8> WillNotMatch;
3600     PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
3601     while (PhiNodesToMatch.size()) {
3602       PHINode *PHI = *PhiNodesToMatch.begin();
3603 
      // Add ourselves: if no Phi node in the basic block matches, we do not
      // match.
3605       WillNotMatch.clear();
3606       WillNotMatch.insert(PHI);
3607 
      // Traverse all Phis until we find an equivalent or fail to do so.
3609       bool IsMatched = false;
3610       for (auto &P : PHI->getParent()->phis()) {
3611         if (&P == PHI)
3612           continue;
3613         if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
3614           break;
        // If it does not match, collect all Phi nodes from the matcher.
        // If we end up with no match, then all these Phi nodes will not match
        // later.
3618         for (auto M : Matched)
3619           WillNotMatch.insert(M.first);
3620         Matched.clear();
3621       }
3622       if (IsMatched) {
3623         // Replace all matched values and erase them.
3624         for (auto MV : Matched)
3625           ST.ReplacePhi(MV.first, MV.second);
3626         Matched.clear();
3627         continue;
3628       }
3629       // If we are not allowed to create new nodes then bail out.
3630       if (!AllowNewPhiNodes)
3631         return false;
3632       // Just remove all seen values in matcher. They will not match anything.
3633       PhiNotMatchedCount += WillNotMatch.size();
3634       for (auto *P : WillNotMatch)
3635         PhiNodesToMatch.erase(P);
3636     }
3637     return true;
  }

  /// Fill the placeholders with values from predecessors and simplify them.
3640   void FillPlaceholders(FoldAddrToValueMapping &Map,
3641                         SmallVectorImpl<Value *> &TraverseOrder,
3642                         SimplificationTracker &ST) {
3643     while (!TraverseOrder.empty()) {
3644       Value *Current = TraverseOrder.pop_back_val();
3645       assert(Map.find(Current) != Map.end() && "No node to fill!!!");
3646       Value *V = Map[Current];
3647 
3648       if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
3649         // CurrentValue also must be Select.
3650         auto *CurrentSelect = cast<SelectInst>(Current);
3651         auto *TrueValue = CurrentSelect->getTrueValue();
3652         assert(Map.find(TrueValue) != Map.end() && "No True Value!");
3653         Select->setTrueValue(ST.Get(Map[TrueValue]));
3654         auto *FalseValue = CurrentSelect->getFalseValue();
3655         assert(Map.find(FalseValue) != Map.end() && "No False Value!");
3656         Select->setFalseValue(ST.Get(Map[FalseValue]));
3657       } else {
3658         // Must be a Phi node then.
3659         auto *PHI = cast<PHINode>(V);
3660         // Fill the Phi node with values from predecessors.
3661         for (auto *B : predecessors(PHI->getParent())) {
3662           Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);
3663           assert(Map.find(PV) != Map.end() && "No predecessor Value!");
3664           PHI->addIncoming(ST.Get(Map[PV]), B);
3665         }
3666       }
3667       Map[Current] = ST.Simplify(V);
3668     }
3669   }
3670 
  /// Starting from the original value, recursively iterates over the def-use
  /// chain up to known ending values represented in the map. For each
  /// traversed phi/select it inserts a placeholder Phi or Select.
  /// Reports all newly created Phi/Select nodes by adding them to the set.
  /// Also reports the order in which the values have been traversed.
3676   void InsertPlaceholders(FoldAddrToValueMapping &Map,
3677                           SmallVectorImpl<Value *> &TraverseOrder,
3678                           SimplificationTracker &ST) {
3679     SmallVector<Value *, 32> Worklist;
3680     assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
3681            "Address must be a Phi or Select node");
3682     auto *Dummy = UndefValue::get(CommonType);
3683     Worklist.push_back(Original);
3684     while (!Worklist.empty()) {
3685       Value *Current = Worklist.pop_back_val();
      // If it is already visited or it is an ending value then skip it.
3687       if (Map.find(Current) != Map.end())
3688         continue;
3689       TraverseOrder.push_back(Current);
3690 
3691       // CurrentValue must be a Phi node or select. All others must be covered
3692       // by anchors.
3693       if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
3694         // Is it OK to get metadata from OrigSelect?!
3695         // Create a Select placeholder with dummy value.
3696         SelectInst *Select = SelectInst::Create(
3697             CurrentSelect->getCondition(), Dummy, Dummy,
3698             CurrentSelect->getName(), CurrentSelect, CurrentSelect);
3699         Map[Current] = Select;
3700         ST.insertNewSelect(Select);
3701         // We are interested in True and False values.
3702         Worklist.push_back(CurrentSelect->getTrueValue());
3703         Worklist.push_back(CurrentSelect->getFalseValue());
3704       } else {
3705         // It must be a Phi node then.
3706         PHINode *CurrentPhi = cast<PHINode>(Current);
3707         unsigned PredCount = CurrentPhi->getNumIncomingValues();
3708         PHINode *PHI =
3709             PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi);
3710         Map[Current] = PHI;
3711         ST.insertNewPhi(PHI);
3712         for (Value *P : CurrentPhi->incoming_values())
3713           Worklist.push_back(P);
3714       }
3715     }
3716   }
3717 
3718   bool addrModeCombiningAllowed() {
3719     if (DisableComplexAddrModes)
3720       return false;
3721     switch (DifferentField) {
3722     default:
3723       return false;
3724     case ExtAddrMode::BaseRegField:
3725       return AddrSinkCombineBaseReg;
3726     case ExtAddrMode::BaseGVField:
3727       return AddrSinkCombineBaseGV;
3728     case ExtAddrMode::BaseOffsField:
3729       return AddrSinkCombineBaseOffs;
3730     case ExtAddrMode::ScaledRegField:
3731       return AddrSinkCombineScaledReg;
3732     }
3733   }
3734 };
3735 } // end anonymous namespace
3736 
3737 /// Try adding ScaleReg*Scale to the current addressing mode.
3738 /// Return true and update AddrMode if this addr mode is legal for the target,
3739 /// false if not.
3740 bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
3741                                              unsigned Depth) {
3742   // If Scale is 1, then this is the same as adding ScaleReg to the addressing
3743   // mode.  Just process that directly.
3744   if (Scale == 1)
3745     return matchAddr(ScaleReg, Depth);
3746 
3747   // If the scale is 0, it takes nothing to add this.
3748   if (Scale == 0)
3749     return true;
3750 
3751   // If we already have a scale of this value, we can add to it, otherwise, we
3752   // need an available scale field.
3753   if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
3754     return false;
3755 
3756   ExtAddrMode TestAddrMode = AddrMode;
3757 
3758   // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
3759   // [A+B + A*7] -> [B+A*8].
3760   TestAddrMode.Scale += Scale;
3761   TestAddrMode.ScaledReg = ScaleReg;
3762 
3763   // If the new address isn't legal, bail out.
3764   if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
3765     return false;
3766 
3767   // It was legal, so commit it.
3768   AddrMode = TestAddrMode;
3769 
3770   // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
3771   // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
3772   // X*Scale + C*Scale to addr mode.
3773   ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
3774   if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
3775       match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
3776       CI->getValue().isSignedIntN(64)) {
3777     TestAddrMode.InBounds = false;
3778     TestAddrMode.ScaledReg = AddLHS;
3779     TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
3780 
3781     // If this addressing mode is legal, commit it and remember that we folded
3782     // this instruction.
3783     if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
3784       AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
3785       AddrMode = TestAddrMode;
3786       return true;
3787     }
3788   }
3789 
3790   // Otherwise, not (x+c)*scale, just return what we have.
3791   return true;
3792 }
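
// Illustrative example for matchScaledValue (a sketch; %idx and %idx2 are
// placeholder names): if ScaleReg is "%idx2 = add i64 %idx, 7" and Scale is 4,
// the mode first becomes {ScaledReg = %idx2, Scale = 4}; the (x + c) * scale
// fold then yields {ScaledReg = %idx, Scale = 4, BaseOffs += 28}, provided the
// target reports that form as legal.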
3793 
3794 /// This is a little filter, which returns true if an addressing computation
3795 /// involving I might be folded into a load/store accessing it.
3796 /// This doesn't need to be perfect, but needs to accept at least
/// the set of instructions that matchOperationAddr can.
3798 static bool MightBeFoldableInst(Instruction *I) {
3799   switch (I->getOpcode()) {
3800   case Instruction::BitCast:
3801   case Instruction::AddrSpaceCast:
3802     // Don't touch identity bitcasts.
3803     if (I->getType() == I->getOperand(0)->getType())
3804       return false;
3805     return I->getType()->isIntOrPtrTy();
3806   case Instruction::PtrToInt:
3807     // PtrToInt is always a noop, as we know that the int type is pointer sized.
3808     return true;
3809   case Instruction::IntToPtr:
3810     // We know the input is intptr_t, so this is foldable.
3811     return true;
3812   case Instruction::Add:
3813     return true;
3814   case Instruction::Mul:
3815   case Instruction::Shl:
3816     // Can only handle X*C and X << C.
3817     return isa<ConstantInt>(I->getOperand(1));
3818   case Instruction::GetElementPtr:
3819     return true;
3820   default:
3821     return false;
3822   }
3823 }
3824 
3825 /// Check whether or not \p Val is a legal instruction for \p TLI.
3826 /// \note \p Val is assumed to be the product of some type promotion.
3827 /// Therefore if \p Val has an undefined state in \p TLI, this is assumed
3828 /// to be legal, as the non-promoted value would have had the same state.
3829 static bool isPromotedInstructionLegal(const TargetLowering &TLI,
3830                                        const DataLayout &DL, Value *Val) {
3831   Instruction *PromotedInst = dyn_cast<Instruction>(Val);
3832   if (!PromotedInst)
3833     return false;
3834   int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
3835   // If the ISDOpcode is undefined, it was undefined before the promotion.
3836   if (!ISDOpcode)
3837     return true;
3838   // Otherwise, check if the promoted instruction is legal or not.
3839   return TLI.isOperationLegalOrCustom(
3840       ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
3841 }
3842 
3843 namespace {
3844 
/// Helper class to perform type promotion.
3846 class TypePromotionHelper {
3847   /// Utility function to add a promoted instruction \p ExtOpnd to
3848   /// \p PromotedInsts and record the type of extension we have seen.
3849   static void addPromotedInst(InstrToOrigTy &PromotedInsts,
3850                               Instruction *ExtOpnd,
3851                               bool IsSExt) {
3852     ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
3853     InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd);
3854     if (It != PromotedInsts.end()) {
      // If the new extension is the same as the original, the information in
      // PromotedInsts[ExtOpnd] is still correct.
      if (It->second.getInt() == ExtTy)
        return;

      // Now that the new extension is different from the old extension, we
      // invalidate the type information by setting the extension type to
      // BothExtension.
3863       ExtTy = BothExtension;
3864     }
3865     PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy);
3866   }
3867 
3868   /// Utility function to query the original type of instruction \p Opnd
3869   /// with a matched extension type. If the extension doesn't match, we
3870   /// cannot use the information we had on the original type.
3871   /// BothExtension doesn't match any extension type.
3872   static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,
3873                                  Instruction *Opnd,
3874                                  bool IsSExt) {
3875     ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
3876     InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
3877     if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)
3878       return It->second.getPointer();
3879     return nullptr;
3880   }
3881 
3882   /// Utility function to check whether or not a sign or zero extension
3883   /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
3884   /// either using the operands of \p Inst or promoting \p Inst.
3885   /// The type of the extension is defined by \p IsSExt.
3886   /// In other words, check if:
3887   /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
3888   /// #1 Promotion applies:
3889   /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
3890   /// #2 Operand reuses:
3891   /// ext opnd1 to ConsideredExtType.
3892   /// \p PromotedInsts maps the instructions to their type before promotion.
3893   static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
3894                             const InstrToOrigTy &PromotedInsts, bool IsSExt);
3895 
3896   /// Utility function to determine if \p OpIdx should be promoted when
3897   /// promoting \p Inst.
3898   static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
3899     return !(isa<SelectInst>(Inst) && OpIdx == 0);
3900   }
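
  // For example (illustrative), in "select i1 %c, i32 %a, i32 %b" operand 0 is
  // the i1 condition %c and must keep its type, so only the true/false
  // operands are considered for promotion.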
3901 
3902   /// Utility function to promote the operand of \p Ext when this
3903   /// operand is a promotable trunc or sext or zext.
3904   /// \p PromotedInsts maps the instructions to their type before promotion.
3905   /// \p CreatedInstsCost[out] contains the cost of all instructions
3906   /// created to promote the operand of Ext.
3907   /// Newly added extensions are inserted in \p Exts.
3908   /// Newly added truncates are inserted in \p Truncs.
3909   /// Should never be called directly.
3910   /// \return The promoted value which is used instead of Ext.
3911   static Value *promoteOperandForTruncAndAnyExt(
3912       Instruction *Ext, TypePromotionTransaction &TPT,
3913       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3914       SmallVectorImpl<Instruction *> *Exts,
3915       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
3916 
3917   /// Utility function to promote the operand of \p Ext when this
3918   /// operand is promotable and is not a supported trunc or sext.
3919   /// \p PromotedInsts maps the instructions to their type before promotion.
3920   /// \p CreatedInstsCost[out] contains the cost of all the instructions
3921   /// created to promote the operand of Ext.
3922   /// Newly added extensions are inserted in \p Exts.
3923   /// Newly added truncates are inserted in \p Truncs.
3924   /// Should never be called directly.
3925   /// \return The promoted value which is used instead of Ext.
3926   static Value *promoteOperandForOther(Instruction *Ext,
3927                                        TypePromotionTransaction &TPT,
3928                                        InstrToOrigTy &PromotedInsts,
3929                                        unsigned &CreatedInstsCost,
3930                                        SmallVectorImpl<Instruction *> *Exts,
3931                                        SmallVectorImpl<Instruction *> *Truncs,
3932                                        const TargetLowering &TLI, bool IsSExt);
3933 
3934   /// \see promoteOperandForOther.
3935   static Value *signExtendOperandForOther(
3936       Instruction *Ext, TypePromotionTransaction &TPT,
3937       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3938       SmallVectorImpl<Instruction *> *Exts,
3939       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3940     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3941                                   Exts, Truncs, TLI, true);
3942   }
3943 
3944   /// \see promoteOperandForOther.
3945   static Value *zeroExtendOperandForOther(
3946       Instruction *Ext, TypePromotionTransaction &TPT,
3947       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3948       SmallVectorImpl<Instruction *> *Exts,
3949       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3950     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3951                                   Exts, Truncs, TLI, false);
3952   }
3953 
3954 public:
3955   /// Type for the utility function that promotes the operand of Ext.
3956   using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
3957                             InstrToOrigTy &PromotedInsts,
3958                             unsigned &CreatedInstsCost,
3959                             SmallVectorImpl<Instruction *> *Exts,
3960                             SmallVectorImpl<Instruction *> *Truncs,
3961                             const TargetLowering &TLI);
3962 
3963   /// Given a sign/zero extend instruction \p Ext, return the appropriate
3964   /// action to promote the operand of \p Ext instead of using Ext.
3965   /// \return NULL if no promotable action is possible with the current
3966   /// sign extension.
3967   /// \p InsertedInsts keeps track of all the instructions inserted by the
3968   /// other CodeGenPrepare optimizations. This information is important
3969   /// because we do not want to promote these instructions as CodeGenPrepare
3970   /// will reinsert them later. Thus creating an infinite loop: create/remove.
3971   /// \p PromotedInsts maps the instructions to their type before promotion.
3972   static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
3973                           const TargetLowering &TLI,
3974                           const InstrToOrigTy &PromotedInsts);
3975 };
3976 
3977 } // end anonymous namespace
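
// Illustrative use of the Action callback (a sketch, not code from this file;
// Ext, TPT, Exts, Truncs and the other surrounding variables are assumed to
// exist at the call site):
//
//   TypePromotionHelper::Action Promote =
//       TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
//   if (Promote) {
//     unsigned CreatedInstsCost = 0;
//     Value *PromotedVal = Promote(Ext, TPT, PromotedInsts, CreatedInstsCost,
//                                  &Exts, &Truncs, TLI);
//   }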
3978 
3979 bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
3980                                         Type *ConsideredExtType,
3981                                         const InstrToOrigTy &PromotedInsts,
3982                                         bool IsSExt) {
3983   // The promotion helper does not know how to deal with vector types yet.
3984   // To be able to fix that, we would need to fix the places where we
3985   // statically extend, e.g., constants and such.
3986   if (Inst->getType()->isVectorTy())
3987     return false;
3988 
3989   // We can always get through zext.
3990   if (isa<ZExtInst>(Inst))
3991     return true;
3992 
3993   // sext(sext) is ok too.
3994   if (IsSExt && isa<SExtInst>(Inst))
3995     return true;
3996 
  // We can get through a binary operator if it is legal. In other words, the
  // binary operator must have a nuw or nsw flag.
3999   const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
4000   if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) &&
4001       ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
4002        (IsSExt && BinOp->hasNoSignedWrap())))
4003     return true;
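
  // For instance (illustrative), "sext i64 (add nsw i8 %a, %b)" can become
  // "add nsw i64 (sext %a), (sext %b)": nsw guarantees the narrow add did not
  // wrap, so sign-extending the operands first yields the same value.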
4004 
  // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
  // ext(or(opnd, cst))  --> or(ext(opnd), ext(cst))
  if (Inst->getOpcode() == Instruction::And ||
      Inst->getOpcode() == Instruction::Or)
4008     return true;
4009 
4010   // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
4011   if (Inst->getOpcode() == Instruction::Xor) {
4012     const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
4013     // Make sure it is not a NOT.
4014     if (Cst && !Cst->getValue().isAllOnesValue())
4015       return true;
4016   }
4017 
  // zext(lshr(opnd, cst)) --> lshr(zext(opnd), zext(cst))
  // It may change a poisoned value into a regular value, like
  //     zext i32 (lshr i8 %val, 12)  -->  lshr i32 (zext i8 %val), 12
  //          poisoned value                    regular value
  // It should be OK since undef covers any valid value.
4023   if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
4024     return true;
4025 
  // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
  // This may turn a poison value into a well-defined value, e.g.,
  //     zext i32 (shl i8 %val, 12)  -->  shl i32 (zext i8 %val), 12
  //          poison value                     well-defined value
  // That is fine, since poison may be refined to any valid value.
4031   if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
4032     const auto *ExtInst = cast<const Instruction>(*Inst->user_begin());
4033     if (ExtInst->hasOneUse()) {
4034       const auto *AndInst = dyn_cast<const Instruction>(*ExtInst->user_begin());
4035       if (AndInst && AndInst->getOpcode() == Instruction::And) {
4036         const auto *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
4037         if (Cst &&
4038             Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
4039           return true;
4040       }
4041     }
4042   }
4043 
4044   // Check if we can do the following simplification.
4045   // ext(trunc(opnd)) --> ext(opnd)
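  // For illustration (made-up IR): if %w = sext i8 %b to i32, then
  //     sext i64 (trunc i16 (%w))  -->  sext i64 %w
  // because the truncate only drops bits that are sign-extension bits anyway.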
4046   if (!isa<TruncInst>(Inst))
4047     return false;
4048 
4049   Value *OpndVal = Inst->getOperand(0);
4050   // Check if we can use this operand in the extension.
4051   // If the type is larger than the result type of the extension, we cannot.
4052   if (!OpndVal->getType()->isIntegerTy() ||
4053       OpndVal->getType()->getIntegerBitWidth() >
4054           ConsideredExtType->getIntegerBitWidth())
4055     return false;
4056 
4057   // If the operand of the truncate is not an instruction, we will not have
4058   // any information on the dropped bits.
  // (Actually we could for constants, but it is not worth the extra logic.)
4060   Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
4061   if (!Opnd)
4062     return false;
4063 
  // Check if the source of the truncate is narrow enough.
  // I.e., check that the truncate just drops extended bits of the same kind
  // as the extension.
4067   // #1 get the type of the operand and check the kind of the extended bits.
4068   const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);
  if (!OpndType) {
    if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
      OpndType = Opnd->getOperand(0)->getType();
    else
      return false;
  }
4075 
4076   // #2 check that the truncate just drops extended bits.
4077   return Inst->getType()->getIntegerBitWidth() >=
4078          OpndType->getIntegerBitWidth();
4079 }
4080 
4081 TypePromotionHelper::Action TypePromotionHelper::getAction(
4082     Instruction *Ext, const SetOfInstrs &InsertedInsts,
4083     const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
4084   assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4085          "Unexpected instruction type");
4086   Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
4087   Type *ExtTy = Ext->getType();
4088   bool IsSExt = isa<SExtInst>(Ext);
  // If the operand of the extension is not an instruction, we cannot
  // get through it.
  // If it is, check whether we can get through it.
4092   if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
4093     return nullptr;
4094 
4095   // Do not promote if the operand has been added by codegenprepare.
4096   // Otherwise, it means we are undoing an optimization that is likely to be
4097   // redone, thus causing potential infinite loop.
4098   if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
4099     return nullptr;
4100 
  // SExt, ZExt or Trunc instructions.
  // Return the related handler.
4103   if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
4104       isa<ZExtInst>(ExtOpnd))
4105     return promoteOperandForTruncAndAnyExt;
4106 
4107   // Regular instruction.
4108   // Abort early if we will have to insert non-free instructions.
4109   if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
4110     return nullptr;
4111   return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
4112 }
4113 
4114 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
4115     Instruction *SExt, TypePromotionTransaction &TPT,
4116     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4117     SmallVectorImpl<Instruction *> *Exts,
4118     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
4119   // By construction, the operand of SExt is an instruction. Otherwise we cannot
4120   // get through it and this method should not be called.
4121   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
4122   Value *ExtVal = SExt;
4123   bool HasMergedNonFreeExt = false;
4124   if (isa<ZExtInst>(SExtOpnd)) {
4125     // Replace s|zext(zext(opnd))
4126     // => zext(opnd).
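    // For illustration (made-up IR):
    //     sext i64 (zext i16 %v to i32)  -->  zext i16 %v to i64
    // This holds because the zext result is known non-negative, so a further
    // sext and zext agree on it.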
4127     HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
4128     Value *ZExt =
4129         TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
4130     TPT.replaceAllUsesWith(SExt, ZExt);
4131     TPT.eraseInstruction(SExt);
4132     ExtVal = ZExt;
4133   } else {
4134     // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
4135     // => z|sext(opnd).
4136     TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
4137   }
4138   CreatedInstsCost = 0;
4139 
4140   // Remove dead code.
4141   if (SExtOpnd->use_empty())
4142     TPT.eraseInstruction(SExtOpnd);
4143 
4144   // Check if the extension is still needed.
4145   Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
4146   if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
4147     if (ExtInst) {
4148       if (Exts)
4149         Exts->push_back(ExtInst);
4150       CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
4151     }
4152     return ExtVal;
4153   }
4154 
4155   // At this point we have: ext ty opnd to ty.
4156   // Reassign the uses of ExtInst to the opnd and remove ExtInst.
4157   Value *NextVal = ExtInst->getOperand(0);
4158   TPT.eraseInstruction(ExtInst, NextVal);
4159   return NextVal;
4160 }
4161 
4162 Value *TypePromotionHelper::promoteOperandForOther(
4163     Instruction *Ext, TypePromotionTransaction &TPT,
4164     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4165     SmallVectorImpl<Instruction *> *Exts,
4166     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
4167     bool IsSExt) {
4168   // By construction, the operand of Ext is an instruction. Otherwise we cannot
4169   // get through it and this method should not be called.
4170   Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
4171   CreatedInstsCost = 0;
4172   if (!ExtOpnd->hasOneUse()) {
4173     // ExtOpnd will be promoted.
    // All its uses, except Ext, will need to use a truncated value of the
4175     // promoted version.
4176     // Create the truncate now.
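    // Sketch of the intended end state (made-up IR): if ExtOpnd also feeds
    //     %other = mul i32 %opnd, 3
    // then, after promotion, %opnd becomes an i64 instruction, %other is
    // rewritten to consume "trunc i64 %opnd to i32", and the operand of Ext is
    // restored below so we do not create a trunc <-> ext cycle.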
4177     Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
4178     if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
4179       // Insert it just after the definition.
4180       ITrunc->moveAfter(ExtOpnd);
4181       if (Truncs)
4182         Truncs->push_back(ITrunc);
4183     }
4184 
4185     TPT.replaceAllUsesWith(ExtOpnd, Trunc);
4186     // Restore the operand of Ext (which has been replaced by the previous call
4187     // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
4188     TPT.setOperand(Ext, 0, ExtOpnd);
4189   }
4190 
4191   // Get through the Instruction:
4192   // 1. Update its type.
4193   // 2. Replace the uses of Ext by Inst.
4194   // 3. Extend each operand that needs to be extended.
4195 
  // Remember the original type of the instruction before promotion.
  // This is useful to know whether the high bits are sign- or zero-extended
  // bits.
4198   addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);
4199   // Step #1.
4200   TPT.mutateType(ExtOpnd, Ext->getType());
4201   // Step #2.
4202   TPT.replaceAllUsesWith(Ext, ExtOpnd);
4203   // Step #3.
4204   Instruction *ExtForOpnd = Ext;
4205 
4206   LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
4207   for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
4208        ++OpIdx) {
4209     LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
4210     if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
4211         !shouldExtOperand(ExtOpnd, OpIdx)) {
4212       LLVM_DEBUG(dbgs() << "No need to propagate\n");
4213       continue;
4214     }
4215     // Check if we can statically extend the operand.
4216     Value *Opnd = ExtOpnd->getOperand(OpIdx);
4217     if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
4218       LLVM_DEBUG(dbgs() << "Statically extend\n");
4219       unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
4220       APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
4221                             : Cst->getValue().zext(BitWidth);
4222       TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
4223       continue;
4224     }
    // UndefValues are typed, so we have to statically extend them.
4226     if (isa<UndefValue>(Opnd)) {
4227       LLVM_DEBUG(dbgs() << "Statically extend\n");
4228       TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
4229       continue;
4230     }
4231 
    // Otherwise we have to explicitly extend the operand.
    // Check whether Ext has already been reused to extend an operand.
    if (!ExtForOpnd) {
      // If so, create a new one.
4236       LLVM_DEBUG(dbgs() << "More operands to ext\n");
      Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
                                    : TPT.createZExt(Ext, Opnd, Ext->getType());
4239       if (!isa<Instruction>(ValForExtOpnd)) {
4240         TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
4241         continue;
4242       }
4243       ExtForOpnd = cast<Instruction>(ValForExtOpnd);
4244     }
4245     if (Exts)
4246       Exts->push_back(ExtForOpnd);
4247     TPT.setOperand(ExtForOpnd, 0, Opnd);
4248 
    // Move the extension before the insertion point.
4250     TPT.moveBefore(ExtForOpnd, ExtOpnd);
4251     TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
4252     CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
    // If more extensions are required, new instructions will have to be
    // created.
4254     ExtForOpnd = nullptr;
4255   }
4256   if (ExtForOpnd == Ext) {
4257     LLVM_DEBUG(dbgs() << "Extension is useless now\n");
4258     TPT.eraseInstruction(Ext);
4259   }
4260   return ExtOpnd;
4261 }
4262 
4263 /// Check whether or not promoting an instruction to a wider type is profitable.
4264 /// \p NewCost gives the cost of extension instructions created by the
4265 /// promotion.
/// \p OldCost gives the cost of extension instructions before the promotion
/// plus the number of instructions that have been matched in the addressing
/// mode thanks to the promotion.
4269 /// \p PromotedOperand is the value that has been promoted.
4270 /// \return True if the promotion is profitable, false otherwise.
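///
/// For instance (illustrative numbers only): if the promotion creates two
/// non-free extensions (NewCost == 2) while the old extension cost 1 and one
/// extra instruction became foldable into the addressing mode (OldCost == 2),
/// the costs tie and the promotion is only accepted if the promoted
/// instruction is legal for the target.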
4271 bool AddressingModeMatcher::isPromotionProfitable(
4272     unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
4273   LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
4274                     << '\n');
4275   // The cost of the new extensions is greater than the cost of the
4276   // old extension plus what we folded.
4277   // This is not profitable.
4278   if (NewCost > OldCost)
4279     return false;
4280   if (NewCost < OldCost)
4281     return true;
4282   // The promotion is neutral but it may help folding the sign extension in
4283   // loads for instance.
4284   // Check that we did not create an illegal instruction.
4285   return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
4286 }
4287 
4288 /// Given an instruction or constant expr, see if we can fold the operation
4289 /// into the addressing mode. If so, update the addressing mode and return
4290 /// true, otherwise return false without modifying AddrMode.
/// If \p MovedAway is not NULL, it reports whether \p AddrInst has been moved
/// away rather than folded into the addressing mode on success.
/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
/// mode because it has been moved away.
/// Thus AddrInst must not be added to the matched instructions.
4296 /// This state can happen when AddrInst is a sext, since it may be moved away.
4297 /// Therefore, AddrInst may not be valid when MovedAway is true and it must
4298 /// not be referenced anymore.
4299 bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
4300                                                unsigned Depth,
4301                                                bool *MovedAway) {
4302   // Avoid exponential behavior on extremely deep expression trees.
4303   if (Depth >= 5) return false;
4304 
4305   // By default, all matched instructions stay in place.
4306   if (MovedAway)
4307     *MovedAway = false;
4308 
4309   switch (Opcode) {
4310   case Instruction::PtrToInt:
4311     // PtrToInt is always a noop, as we know that the int type is pointer sized.
4312     return matchAddr(AddrInst->getOperand(0), Depth);
4313   case Instruction::IntToPtr: {
4314     auto AS = AddrInst->getType()->getPointerAddressSpace();
4315     auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
4316     // This inttoptr is a no-op if the integer type is pointer sized.
4317     if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
4318       return matchAddr(AddrInst->getOperand(0), Depth);
4319     return false;
4320   }
4321   case Instruction::BitCast:
4322     // BitCast is always a noop, and we can handle it as long as it is
4323     // int->int or pointer->pointer (we don't want int<->fp or something).
4324     if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&
4325         // Don't touch identity bitcasts.  These were probably put here by LSR,
4326         // and we don't want to mess around with them.  Assume it knows what it
4327         // is doing.
4328         AddrInst->getOperand(0)->getType() != AddrInst->getType())
4329       return matchAddr(AddrInst->getOperand(0), Depth);
4330     return false;
4331   case Instruction::AddrSpaceCast: {
    unsigned SrcAS =
        AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
4334     unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
4335     if (TLI.getTargetMachine().isNoopAddrSpaceCast(SrcAS, DestAS))
4336       return matchAddr(AddrInst->getOperand(0), Depth);
4337     return false;
4338   }
4339   case Instruction::Add: {
4340     // Check to see if we can merge in the RHS then the LHS.  If so, we win.
4341     ExtAddrMode BackupAddrMode = AddrMode;
4342     unsigned OldSize = AddrModeInsts.size();
4343     // Start a transaction at this point.
4344     // The LHS may match but not the RHS.
    // Therefore, we need a higher level restoration point to undo a partially
    // matched operation.
4347     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4348         TPT.getRestorationPoint();
4349 
4350     AddrMode.InBounds = false;
4351     if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
4352         matchAddr(AddrInst->getOperand(0), Depth+1))
4353       return true;
4354 
4355     // Restore the old addr mode info.
4356     AddrMode = BackupAddrMode;
4357     AddrModeInsts.resize(OldSize);
4358     TPT.rollback(LastKnownGood);
4359 
4360     // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
4361     if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
4362         matchAddr(AddrInst->getOperand(1), Depth+1))
4363       return true;
4364 
4365     // Otherwise we definitely can't merge the ADD in.
4366     AddrMode = BackupAddrMode;
4367     AddrModeInsts.resize(OldSize);
4368     TPT.rollback(LastKnownGood);
4369     break;
4370   }
4371   //case Instruction::Or:
4372   // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
4373   //break;
4374   case Instruction::Mul:
4375   case Instruction::Shl: {
4376     // Can only handle X*C and X << C.
4377     AddrMode.InBounds = false;
4378     ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
4379     if (!RHS || RHS->getBitWidth() > 64)
4380       return false;
4381     int64_t Scale = RHS->getSExtValue();
4382     if (Opcode == Instruction::Shl)
4383       Scale = 1LL << Scale;
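    // e.g., "%x << 3" is handled as "%x * 8" for addressing-mode purposes.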
4384 
4385     return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
4386   }
4387   case Instruction::GetElementPtr: {
    // Scan the GEP.  We check whether it contains constant offsets and at most
    // one variable offset.
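    // For illustration (hypothetical struct layout):
    //     getelementptr %struct.S, %struct.S* %p, i64 0, i32 2, i64 %i
    // where field 2 starts at byte 8 and indexes an array of i32 gives
    // ConstantOffset = 8 and records %i as the single variable index with
    // VariableScale = 4.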
4390     int VariableOperand = -1;
4391     unsigned VariableScale = 0;
4392 
4393     int64_t ConstantOffset = 0;
4394     gep_type_iterator GTI = gep_type_begin(AddrInst);
4395     for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
4396       if (StructType *STy = GTI.getStructTypeOrNull()) {
4397         const StructLayout *SL = DL.getStructLayout(STy);
4398         unsigned Idx =
4399           cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
4400         ConstantOffset += SL->getElementOffset(Idx);
4401       } else {
4402         TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType());
4403         if (TS.isNonZero()) {
4404           // The optimisations below currently only work for fixed offsets.
4405           if (TS.isScalable())
4406             return false;
4407           int64_t TypeSize = TS.getFixedSize();
4408           if (ConstantInt *CI =
4409                   dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
4410             const APInt &CVal = CI->getValue();
4411             if (CVal.getMinSignedBits() <= 64) {
4412               ConstantOffset += CVal.getSExtValue() * TypeSize;
4413               continue;
4414             }
4415           }
4416           // We only allow one variable index at the moment.
4417           if (VariableOperand != -1)
4418             return false;
4419 
4420           // Remember the variable index.
4421           VariableOperand = i;
4422           VariableScale = TypeSize;
4423         }
4424       }
4425     }
4426 
4427     // A common case is for the GEP to only do a constant offset.  In this case,
4428     // just add it to the disp field and check validity.
4429     if (VariableOperand == -1) {
4430       AddrMode.BaseOffs += ConstantOffset;
4431       if (ConstantOffset == 0 ||
4432           TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
4433         // Check to see if we can fold the base pointer in too.
4434         if (matchAddr(AddrInst->getOperand(0), Depth+1)) {
4435           if (!cast<GEPOperator>(AddrInst)->isInBounds())
4436             AddrMode.InBounds = false;
4437           return true;
4438         }
4439       } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
4440                  TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
4441                  ConstantOffset > 0) {
4442         // Record GEPs with non-zero offsets as candidates for splitting in the
4443         // event that the offset cannot fit into the r+i addressing mode.
4444         // Simple and common case that only one GEP is used in calculating the
4445         // address for the memory access.
4446         Value *Base = AddrInst->getOperand(0);
4447         auto *BaseI = dyn_cast<Instruction>(Base);
4448         auto *GEP = cast<GetElementPtrInst>(AddrInst);
4449         if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
4450             (BaseI && !isa<CastInst>(BaseI) &&
4451              !isa<GetElementPtrInst>(BaseI))) {
4452           // Make sure the parent block allows inserting non-PHI instructions
4453           // before the terminator.
4454           BasicBlock *Parent =
4455               BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
4456           if (!Parent->getTerminator()->isEHPad())
4457             LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
4458         }
4459       }
4460       AddrMode.BaseOffs -= ConstantOffset;
4461       return false;
4462     }
4463 
4464     // Save the valid addressing mode in case we can't match.
4465     ExtAddrMode BackupAddrMode = AddrMode;
4466     unsigned OldSize = AddrModeInsts.size();
4467 
4468     // See if the scale and offset amount is valid for this target.
4469     AddrMode.BaseOffs += ConstantOffset;
4470     if (!cast<GEPOperator>(AddrInst)->isInBounds())
4471       AddrMode.InBounds = false;
4472 
4473     // Match the base operand of the GEP.
4474     if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
4475       // If it couldn't be matched, just stuff the value in a register.
4476       if (AddrMode.HasBaseReg) {
4477         AddrMode = BackupAddrMode;
4478         AddrModeInsts.resize(OldSize);
4479         return false;
4480       }
4481       AddrMode.HasBaseReg = true;
4482       AddrMode.BaseReg = AddrInst->getOperand(0);
4483     }
4484 
4485     // Match the remaining variable portion of the GEP.
4486     if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
4487                           Depth)) {
4488       // If it couldn't be matched, try stuffing the base into a register
4489       // instead of matching it, and retrying the match of the scale.
4490       AddrMode = BackupAddrMode;
4491       AddrModeInsts.resize(OldSize);
4492       if (AddrMode.HasBaseReg)
4493         return false;
4494       AddrMode.HasBaseReg = true;
4495       AddrMode.BaseReg = AddrInst->getOperand(0);
4496       AddrMode.BaseOffs += ConstantOffset;
4497       if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
4498                             VariableScale, Depth)) {
4499         // If even that didn't work, bail.
4500         AddrMode = BackupAddrMode;
4501         AddrModeInsts.resize(OldSize);
4502         return false;
4503       }
4504     }
4505 
4506     return true;
4507   }
4508   case Instruction::SExt:
4509   case Instruction::ZExt: {
4510     Instruction *Ext = dyn_cast<Instruction>(AddrInst);
4511     if (!Ext)
4512       return false;
4513 
4514     // Try to move this ext out of the way of the addressing mode.
4515     // Ask for a method for doing so.
4516     TypePromotionHelper::Action TPH =
4517         TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
4518     if (!TPH)
4519       return false;
4520 
4521     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4522         TPT.getRestorationPoint();
4523     unsigned CreatedInstsCost = 0;
4524     unsigned ExtCost = !TLI.isExtFree(Ext);
4525     Value *PromotedOperand =
4526         TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
4527     // SExt has been moved away.
4528     // Thus either it will be rematched later in the recursive calls or it is
4529     // gone. Anyway, we must not fold it into the addressing mode at this point.
4530     // E.g.,
4531     // op = add opnd, 1
4532     // idx = ext op
4533     // addr = gep base, idx
4534     // is now:
4535     // promotedOpnd = ext opnd            <- no match here
4536     // op = promoted_add promotedOpnd, 1  <- match (later in recursive calls)
4537     // addr = gep base, op                <- match
4538     if (MovedAway)
4539       *MovedAway = true;
4540 
4541     assert(PromotedOperand &&
4542            "TypePromotionHelper should have filtered out those cases");
4543 
4544     ExtAddrMode BackupAddrMode = AddrMode;
4545     unsigned OldSize = AddrModeInsts.size();
4546 
4547     if (!matchAddr(PromotedOperand, Depth) ||
4548         // The total of the new cost is equal to the cost of the created
4549         // instructions.
4550         // The total of the old cost is equal to the cost of the extension plus
4551         // what we have saved in the addressing mode.
4552         !isPromotionProfitable(CreatedInstsCost,
4553                                ExtCost + (AddrModeInsts.size() - OldSize),
4554                                PromotedOperand)) {
4555       AddrMode = BackupAddrMode;
4556       AddrModeInsts.resize(OldSize);
4557       LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
4558       TPT.rollback(LastKnownGood);
4559       return false;
4560     }
4561     return true;
4562   }
4563   }
4564   return false;
4565 }
4566 
4567 /// If we can, try to add the value of 'Addr' into the current addressing mode.
4568 /// If Addr can't be added to AddrMode this returns false and leaves AddrMode
4569 /// unmodified. This assumes that Addr is either a pointer type or intptr_t
4570 /// for the target.
4571 ///
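/// For illustration (made-up IR): matching "%a = add i64 %base, %off" as an
/// address can, on a target that accepts an [r + r] mode, end up with one
/// operand as AddrMode.BaseReg and the other as AddrMode.ScaledReg with
/// Scale == 1.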
4572 bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
4573   // Start a transaction at this point that we will rollback if the matching
4574   // fails.
4575   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4576       TPT.getRestorationPoint();
4577   if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
4578     if (CI->getValue().isSignedIntN(64)) {
4579       // Fold in immediates if legal for the target.
4580       AddrMode.BaseOffs += CI->getSExtValue();
4581       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4582         return true;
4583       AddrMode.BaseOffs -= CI->getSExtValue();
4584     }
4585   } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
4586     // If this is a global variable, try to fold it into the addressing mode.
4587     if (!AddrMode.BaseGV) {
4588       AddrMode.BaseGV = GV;
4589       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4590         return true;
4591       AddrMode.BaseGV = nullptr;
4592     }
4593   } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
4594     ExtAddrMode BackupAddrMode = AddrMode;
4595     unsigned OldSize = AddrModeInsts.size();
4596 
4597     // Check to see if it is possible to fold this operation.
4598     bool MovedAway = false;
4599     if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
4600       // This instruction may have been moved away. If so, there is nothing
4601       // to check here.
4602       if (MovedAway)
4603         return true;
4604       // Okay, it's possible to fold this.  Check to see if it is actually
4605       // *profitable* to do so.  We use a simple cost model to avoid increasing
4606       // register pressure too much.
4607       if (I->hasOneUse() ||
4608           isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
4609         AddrModeInsts.push_back(I);
4610         return true;
4611       }
4612 
4613       // It isn't profitable to do this, roll back.
4614       //cerr << "NOT FOLDING: " << *I;
4615       AddrMode = BackupAddrMode;
4616       AddrModeInsts.resize(OldSize);
4617       TPT.rollback(LastKnownGood);
4618     }
4619   } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
4620     if (matchOperationAddr(CE, CE->getOpcode(), Depth))
4621       return true;
4622     TPT.rollback(LastKnownGood);
4623   } else if (isa<ConstantPointerNull>(Addr)) {
4624     // Null pointer gets folded without affecting the addressing mode.
4625     return true;
4626   }
4627 
  // Worst case, the target should support [reg] addressing modes. :)
4629   if (!AddrMode.HasBaseReg) {
4630     AddrMode.HasBaseReg = true;
4631     AddrMode.BaseReg = Addr;
4632     // Still check for legality in case the target supports [imm] but not [i+r].
4633     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4634       return true;
4635     AddrMode.HasBaseReg = false;
4636     AddrMode.BaseReg = nullptr;
4637   }
4638 
4639   // If the base register is already taken, see if we can do [r+r].
4640   if (AddrMode.Scale == 0) {
4641     AddrMode.Scale = 1;
4642     AddrMode.ScaledReg = Addr;
4643     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
4644       return true;
4645     AddrMode.Scale = 0;
4646     AddrMode.ScaledReg = nullptr;
4647   }
4648   // Couldn't match.
4649   TPT.rollback(LastKnownGood);
4650   return false;
4651 }
4652 
4653 /// Check to see if all uses of OpVal by the specified inline asm call are due
4654 /// to memory operands. If so, return true, otherwise return false.
4655 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
4656                                     const TargetLowering &TLI,
4657                                     const TargetRegisterInfo &TRI) {
4658   const Function *F = CI->getFunction();
4659   TargetLowering::AsmOperandInfoVector TargetConstraints =
4660       TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI);
4661 
4662   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
4663     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
4664 
4665     // Compute the constraint code and ConstraintType to use.
4666     TLI.ComputeConstraintToUse(OpInfo, SDValue());
4667 
4668     // If this asm operand is our Value*, and if it isn't an indirect memory
4669     // operand, we can't fold it!
4670     if (OpInfo.CallOperandVal == OpVal &&
4671         (OpInfo.ConstraintType != TargetLowering::C_Memory ||
4672          !OpInfo.isIndirect))
4673       return false;
4674   }
4675 
4676   return true;
4677 }
4678 
4679 // Max number of memory uses to look at before aborting the search to conserve
4680 // compile time.
4681 static constexpr int MaxMemoryUsesToScan = 20;
4682 
4683 /// Recursively walk all the uses of I until we find a memory use.
4684 /// If we find an obviously non-foldable instruction, return true.
4685 /// Add the ultimately found memory instructions to MemoryUses.
4686 static bool FindAllMemoryUses(
4687     Instruction *I,
4688     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
4689     SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
4690     const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
4691     BlockFrequencyInfo *BFI, int SeenInsts = 0) {
4692   // If we already considered this instruction, we're done.
4693   if (!ConsideredInsts.insert(I).second)
4694     return false;
4695 
4696   // If this is an obviously unfoldable instruction, bail out.
4697   if (!MightBeFoldableInst(I))
4698     return true;
4699 
4700   // Loop over all the uses, recursively processing them.
4701   for (Use &U : I->uses()) {
4702     // Conservatively return true if we're seeing a large number or a deep chain
4703     // of users. This avoids excessive compilation times in pathological cases.
4704     if (SeenInsts++ >= MaxMemoryUsesToScan)
4705       return true;
4706 
4707     Instruction *UserI = cast<Instruction>(U.getUser());
4708     if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
4709       MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
4710       continue;
4711     }
4712 
4713     if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
4714       unsigned opNo = U.getOperandNo();
4715       if (opNo != StoreInst::getPointerOperandIndex())
4716         return true; // Storing addr, not into addr.
4717       MemoryUses.push_back(std::make_pair(SI, opNo));
4718       continue;
4719     }
4720 
4721     if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
4722       unsigned opNo = U.getOperandNo();
4723       if (opNo != AtomicRMWInst::getPointerOperandIndex())
4724         return true; // Storing addr, not into addr.
4725       MemoryUses.push_back(std::make_pair(RMW, opNo));
4726       continue;
4727     }
4728 
4729     if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
4730       unsigned opNo = U.getOperandNo();
4731       if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
4732         return true; // Storing addr, not into addr.
4733       MemoryUses.push_back(std::make_pair(CmpX, opNo));
4734       continue;
4735     }
4736 
4737     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
4738       if (CI->hasFnAttr(Attribute::Cold)) {
4739         // If this is a cold call, we can sink the addressing calculation into
4740         // the cold path.  See optimizeCallInst
4741         bool OptForSize = OptSize ||
4742           llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
4743         if (!OptForSize)
4744           continue;
4745       }
4746 
4747       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand());
4748       if (!IA) return true;
4749 
4750       // If this is a memory operand, we're cool, otherwise bail out.
4751       if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
4752         return true;
4753       continue;
4754     }
4755 
4756     if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
4757                           PSI, BFI, SeenInsts))
4758       return true;
4759   }
4760 
4761   return false;
4762 }
4763 
4764 /// Return true if Val is already known to be live at the use site that we're
4765 /// folding it into. If so, there is no cost to include it in the addressing
4766 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
4767 /// instruction already.
bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,
                                                   Value *KnownLive1,
                                                   Value *KnownLive2) {
4770   // If Val is either of the known-live values, we know it is live!
4771   if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
4772     return true;
4773 
4774   // All values other than instructions and arguments (e.g. constants) are live.
4775   if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
4776 
  // If Val is a constant-sized alloca in the entry block, it is live; this is
  // true because it is just a reference to the stack/frame pointer, which is
  // live for the whole function.
4780   if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
4781     if (AI->isStaticAlloca())
4782       return true;
4783 
4784   // Check to see if this value is already used in the memory instruction's
4785   // block.  If so, it's already live into the block at the very least, so we
4786   // can reasonably fold it.
4787   return Val->isUsedInBasicBlock(MemoryInst->getParent());
4788 }
4789 
/// It is possible for the addressing mode of the machine to fold the specified
/// instruction into a load or store that ultimately uses it.
/// However, if the specified instruction has multiple uses, folding it into
/// the load may actually increase register pressure. For example, consider
/// this code:
4795 ///
4796 ///     X = ...
4797 ///     Y = X+1
4798 ///     use(Y)   -> nonload/store
4799 ///     Z = Y+1
4800 ///     load Z
4801 ///
4802 /// In this case, Y has multiple uses, and can be folded into the load of Z
4803 /// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
4804 /// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
4805 /// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
4806 /// number of computations either.
4807 ///
4808 /// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
4809 /// X was live across 'load Z' for other reasons, we actually *would* want to
4810 /// fold the addressing mode in the Z case.  This would make Y die earlier.
4811 bool AddressingModeMatcher::
4812 isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
4813                                      ExtAddrMode &AMAfter) {
4814   if (IgnoreProfitability) return true;
4815 
4816   // AMBefore is the addressing mode before this instruction was folded into it,
4817   // and AMAfter is the addressing mode after the instruction was folded.  Get
4818   // the set of registers referenced by AMAfter and subtract out those
4819   // referenced by AMBefore: this is the set of values which folding in this
4820   // address extends the lifetime of.
4821   //
4822   // Note that there are only two potential values being referenced here,
4823   // BaseReg and ScaleReg (global addresses are always available, as are any
4824   // folded immediates).
4825   Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
4826 
4827   // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
4828   // lifetime wasn't extended by adding this instruction.
4829   if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4830     BaseReg = nullptr;
4831   if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4832     ScaledReg = nullptr;
4833 
  // If folding this instruction (and its subexprs) didn't extend any live
4835   // ranges, we're ok with it.
4836   if (!BaseReg && !ScaledReg)
4837     return true;
4838 
4839   // If all uses of this instruction can have the address mode sunk into them,
4840   // we can remove the addressing mode and effectively trade one live register
4841   // for another (at worst.)  In this context, folding an addressing mode into
4842   // the use is just a particularly nice way of sinking it.
4843   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
4844   SmallPtrSet<Instruction*, 16> ConsideredInsts;
4845   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
4846                         PSI, BFI))
4847     return false;  // Has a non-memory, non-foldable use!
4848 
4849   // Now that we know that all uses of this instruction are part of a chain of
4850   // computation involving only operations that could theoretically be folded
4851   // into a memory use, loop over each of these memory operation uses and see
  // if they could *actually* fold the instruction.  The assumption is that
  // addressing modes are cheap and that duplicating the computation involved
  // many times is worthwhile, even on a fastpath. For sinking candidates
  // (i.e. cold call sites), this serves as a way to prevent excessive code
  // growth since most architectures have some reasonably small and fast way to
  // compute an effective address (e.g., LEA on x86).
4858   SmallVector<Instruction*, 32> MatchedAddrModeInsts;
4859   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
4860     Instruction *User = MemoryUses[i].first;
4861     unsigned OpNo = MemoryUses[i].second;
4862 
4863     // Get the access type of this use.  If the use isn't a pointer, we don't
4864     // know what it accesses.
4865     Value *Address = User->getOperand(OpNo);
4866     PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
4867     if (!AddrTy)
4868       return false;
4869     Type *AddressAccessTy = AddrTy->getElementType();
4870     unsigned AS = AddrTy->getAddressSpace();
4871 
4872     // Do a match against the root of this address, ignoring profitability. This
4873     // will tell us if the addressing mode for the memory operation will
4874     // *actually* cover the shared instruction.
4875     ExtAddrMode Result;
4876     std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
4877                                                                       0);
4878     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4879         TPT.getRestorationPoint();
4880     AddressingModeMatcher Matcher(
4881         MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
4882         InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
4883     Matcher.IgnoreProfitability = true;
4884     bool Success = Matcher.matchAddr(Address, 0);
4885     (void)Success; assert(Success && "Couldn't select *anything*?");
4886 
4887     // The match was to check the profitability, the changes made are not
4888     // part of the original matcher. Therefore, they should be dropped
4889     // otherwise the original matcher will not present the right state.
4890     TPT.rollback(LastKnownGood);
4891 
4892     // If the match didn't cover I, then it won't be shared by it.
4893     if (!is_contained(MatchedAddrModeInsts, I))
4894       return false;
4895 
4896     MatchedAddrModeInsts.clear();
4897   }
4898 
4899   return true;
4900 }
4901 
4902 /// Return true if the specified values are defined in a
4903 /// different basic block than BB.
4904 static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
4905   if (Instruction *I = dyn_cast<Instruction>(V))
4906     return I->getParent() != BB;
4907   return false;
4908 }
4909 
/// Sink addressing mode computation immediately before MemoryInst if doing so
4911 /// can be done without increasing register pressure.  The need for the
4912 /// register pressure constraint means this can end up being an all or nothing
4913 /// decision for all uses of the same addressing computation.
4914 ///
4915 /// Load and Store Instructions often have addressing modes that can do
4916 /// significant amounts of computation. As such, instruction selection will try
4917 /// to get the load or store to do as much computation as possible for the
4918 /// program. The problem is that isel can only see within a single block. As
4919 /// such, we sink as much legal addressing mode work into the block as possible.
4920 ///
4921 /// This method is used to optimize both load/store and inline asms with memory
4922 /// operands.  It's also used to sink addressing computations feeding into cold
4923 /// call sites into their (cold) basic block.
4924 ///
4925 /// The motivation for handling sinking into cold blocks is that doing so can
4926 /// both enable other address mode sinking (by satisfying the register pressure
4927 /// constraint above), and reduce register pressure globally (by removing the
/// addressing mode computation from the fast path entirely).
4929 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
4930                                         Type *AccessTy, unsigned AddrSpace) {
4931   Value *Repl = Addr;
4932 
4933   // Try to collapse single-value PHI nodes.  This is necessary to undo
4934   // unprofitable PRE transformations.
4935   SmallVector<Value*, 8> worklist;
4936   SmallPtrSet<Value*, 16> Visited;
4937   worklist.push_back(Addr);
4938 
4939   // Use a worklist to iteratively look through PHI and select nodes, and
4940   // ensure that the addressing mode obtained from the non-PHI/select roots of
4941   // the graph are compatible.
4942   bool PhiOrSelectSeen = false;
4943   SmallVector<Instruction*, 16> AddrModeInsts;
4944   const SimplifyQuery SQ(*DL, TLInfo);
4945   AddressingModeCombiner AddrModes(SQ, Addr);
4946   TypePromotionTransaction TPT(RemovedInsts);
4947   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4948       TPT.getRestorationPoint();
4949   while (!worklist.empty()) {
4950     Value *V = worklist.back();
4951     worklist.pop_back();
4952 
    // We allow traversing cyclic Phi nodes.
    // In case of success after this loop we ensure that traversing through
    // Phi nodes ends up, in all cases, computing an address of the form
    //    BaseGV + Base + Scale * Index + Offset
    // where Scale and Offset are constants and BaseGV, Base and Index
    // are exactly the same Values in all cases.
    // It means that BaseGV, Scale and Offset dominate our memory instruction
    // and have the same value as they had in the address computation
    // represented as the Phi. So we can safely sink the address computation to
    // the memory instruction.
4962     if (!Visited.insert(V).second)
4963       continue;
4964 
4965     // For a PHI node, push all of its incoming values.
4966     if (PHINode *P = dyn_cast<PHINode>(V)) {
4967       for (Value *IncValue : P->incoming_values())
4968         worklist.push_back(IncValue);
4969       PhiOrSelectSeen = true;
4970       continue;
4971     }
4972     // Similar for select.
4973     if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
4974       worklist.push_back(SI->getFalseValue());
4975       worklist.push_back(SI->getTrueValue());
4976       PhiOrSelectSeen = true;
4977       continue;
4978     }
4979 
4980     // For non-PHIs, determine the addressing mode being computed.  Note that
4981     // the result may differ depending on what other uses our candidate
4982     // addressing instructions might have.
4983     AddrModeInsts.clear();
4984     std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
4985                                                                       0);
4986     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
4987         V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
4988         InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
4989         BFI.get());
4990 
4991     GetElementPtrInst *GEP = LargeOffsetGEP.first;
4992     if (GEP && !NewGEPBases.count(GEP)) {
4993       // If splitting the underlying data structure can reduce the offset of a
4994       // GEP, collect the GEP.  Skip the GEPs that are the new bases of
4995       // previously split data structures.
4996       LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
4997       if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end())
4998         LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size();
4999     }
5000 
5001     NewAddrMode.OriginalValue = V;
5002     if (!AddrModes.addNewAddrMode(NewAddrMode))
5003       break;
5004   }
5005 
5006   // Try to combine the AddrModes we've collected. If we couldn't collect any,
5007   // or we have multiple but either couldn't combine them or combining them
5008   // wouldn't do anything useful, bail out now.
5009   if (!AddrModes.combineAddrModes()) {
5010     TPT.rollback(LastKnownGood);
5011     return false;
5012   }
5013   bool Modified = TPT.commit();
5014 
5015   // Get the combined AddrMode (or the only AddrMode, if we only had one).
5016   ExtAddrMode AddrMode = AddrModes.getAddrMode();
5017 
5018   // If all the instructions matched are already in this BB, don't do anything.
  // If we saw a Phi node then it is definitely not local, and if we saw a
  // select then we want to push the address calculation past it even if it's
  // already in this BB.
  if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
        return IsNonLocalValue(V, MemoryInst->getParent());
      })) {
5025     LLVM_DEBUG(dbgs() << "CGP: Found      local addrmode: " << AddrMode
5026                       << "\n");
5027     return Modified;
5028   }
5029 
5030   // Insert this computation right after this user.  Since our caller is
  // scanning from the top of the BB to the bottom, reuses of the expression
  // are guaranteed to happen later.
5033   IRBuilder<> Builder(MemoryInst);
5034 
  // Now that we have determined the addressing expression we want to use and
  // know that we have to sink it into this block, check to see if we have
  // already done this for some other load/store instr in this block.  If so,
  // reuse the computation.  Before attempting reuse, check if the address is
  // valid as it may have been erased.
5040 
5041   WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];
5042 
  Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
5044   if (SunkAddr) {
5045     LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
5046                       << " for " << *MemoryInst << "\n");
5047     if (SunkAddr->getType() != Addr->getType())
5048       SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
5049   } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
5050                                    SubtargetInfo->addrSinkUsingGEPs())) {
5051     // By default, we use the GEP-based method when AA is used later. This
5052     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
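    // For illustration (made-up values): with AddrMode = {BaseReg = %p,
    // BaseOffs = 16} and %p of pointer type, the code below emits roughly
    //     %sunkaddr = getelementptr i8, i8* %p, i64 16
    // right before MemoryInst and the memory access is rewritten to use it.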
5053     LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
5054                       << " for " << *MemoryInst << "\n");
5055     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
5056     Value *ResultPtr = nullptr, *ResultIndex = nullptr;
5057 
5058     // First, find the pointer.
5059     if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
5060       ResultPtr = AddrMode.BaseReg;
5061       AddrMode.BaseReg = nullptr;
5062     }
5063 
5064     if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
5065       // We can't add more than one pointer together, nor can we scale a
5066       // pointer (both of which seem meaningless).
5067       if (ResultPtr || AddrMode.Scale != 1)
5068         return Modified;
5069 
5070       ResultPtr = AddrMode.ScaledReg;
5071       AddrMode.Scale = 0;
5072     }
5073 
5074     // It is only safe to sign extend the BaseReg if we know that the math
5075     // required to create it did not overflow before we extend it. Since
5076     // the original IR value was tossed in favor of a constant back when
5077     // the AddrMode was created we need to bail out gracefully if widths
5078     // do not match instead of extending it.
5079     //
5080     // (See below for code to add the scale.)
5081     if (AddrMode.Scale) {
5082       Type *ScaledRegTy = AddrMode.ScaledReg->getType();
5083       if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
5084           cast<IntegerType>(ScaledRegTy)->getBitWidth())
5085         return Modified;
5086     }
5087 
5088     if (AddrMode.BaseGV) {
5089       if (ResultPtr)
5090         return Modified;
5091 
5092       ResultPtr = AddrMode.BaseGV;
5093     }
5094 
5095     // If the real base value actually came from an inttoptr, then the matcher
5096     // will look through it and provide only the integer value. In that case,
5097     // use it here.
5098     if (!DL->isNonIntegralPointerType(Addr->getType())) {
5099       if (!ResultPtr && AddrMode.BaseReg) {
5100         ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
5101                                            "sunkaddr");
5102         AddrMode.BaseReg = nullptr;
5103       } else if (!ResultPtr && AddrMode.Scale == 1) {
5104         ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
5105                                            "sunkaddr");
5106         AddrMode.Scale = 0;
5107       }
5108     }
5109 
5110     if (!ResultPtr &&
5111         !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
5112       SunkAddr = Constant::getNullValue(Addr->getType());
5113     } else if (!ResultPtr) {
5114       return Modified;
5115     } else {
5116       Type *I8PtrTy =
5117           Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
5118       Type *I8Ty = Builder.getInt8Ty();
5119 
5120       // Start with the base register. Do this first so that subsequent address
5121       // matching finds it last, which will prevent it from trying to match it
5122       // as the scaled value in case it happens to be a mul. That would be
5123       // problematic if we've sunk a different mul for the scale, because then
5124       // we'd end up sinking both muls.
5125       if (AddrMode.BaseReg) {
5126         Value *V = AddrMode.BaseReg;
5127         if (V->getType() != IntPtrTy)
5128           V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
5129 
5130         ResultIndex = V;
5131       }
5132 
5133       // Add the scale value.
5134       if (AddrMode.Scale) {
5135         Value *V = AddrMode.ScaledReg;
5136         if (V->getType() == IntPtrTy) {
5137           // done.
5138         } else {
5139           assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
5140                  cast<IntegerType>(V->getType())->getBitWidth() &&
5141                  "We can't transform if ScaledReg is too narrow");
5142           V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
5143         }
5144 
5145         if (AddrMode.Scale != 1)
5146           V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
5147                                 "sunkaddr");
5148         if (ResultIndex)
5149           ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
5150         else
5151           ResultIndex = V;
5152       }
5153 
5154       // Add in the Base Offset if present.
5155       if (AddrMode.BaseOffs) {
5156         Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
5157         if (ResultIndex) {
5158           // We need to add this separately from the scale above to help with
5159           // SDAG consecutive load/store merging.
5160           if (ResultPtr->getType() != I8PtrTy)
5161             ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
5162           ResultPtr =
5163               AddrMode.InBounds
5164                   ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
5165                                               "sunkaddr")
5166                   : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
5167         }
5168 
5169         ResultIndex = V;
5170       }
5171 
5172       if (!ResultIndex) {
5173         SunkAddr = ResultPtr;
5174       } else {
5175         if (ResultPtr->getType() != I8PtrTy)
5176           ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
5177         SunkAddr =
5178             AddrMode.InBounds
5179                 ? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
5180                                             "sunkaddr")
5181                 : Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
5182       }
5183 
5184       if (SunkAddr->getType() != Addr->getType())
5185         SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
5186     }
5187   } else {
5188     // We'd require a ptrtoint/inttoptr down the line, which we can't do for
5189     // non-integral pointers, so in that case bail out now.
5190     Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
5191     Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
5192     PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
5193     PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
5194     if (DL->isNonIntegralPointerType(Addr->getType()) ||
5195         (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
5196         (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
5197         (AddrMode.BaseGV &&
5198          DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
5199       return Modified;
5200 
5201     LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
5202                       << " for " << *MemoryInst << "\n");
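    // For illustration (made-up values): with AddrMode = {BaseReg = %p,
    // BaseOffs = 4}, the code below emits roughly
    //     %sunkaddr  = ptrtoint i8* %p to i64
    //     %sunkaddr1 = add i64 %sunkaddr, 4
    //     %sunkaddr2 = inttoptr i64 %sunkaddr1 to i8*
    // immediately before MemoryInst.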
5203     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
5204     Value *Result = nullptr;
5205 
5206     // Start with the base register. Do this first so that subsequent address
5207     // matching finds it last, which will prevent it from trying to match it
5208     // as the scaled value in case it happens to be a mul. That would be
5209     // problematic if we've sunk a different mul for the scale, because then
5210     // we'd end up sinking both muls.
5211     if (AddrMode.BaseReg) {
5212       Value *V = AddrMode.BaseReg;
5213       if (V->getType()->isPointerTy())
5214         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
5215       if (V->getType() != IntPtrTy)
5216         V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
5217       Result = V;
5218     }
5219 
5220     // Add the scale value.
5221     if (AddrMode.Scale) {
5222       Value *V = AddrMode.ScaledReg;
5223       if (V->getType() == IntPtrTy) {
5224         // done.
5225       } else if (V->getType()->isPointerTy()) {
5226         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
5227       } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
5228                  cast<IntegerType>(V->getType())->getBitWidth()) {
5229         V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
5230       } else {
5231         // It is only safe to sign extend the BaseReg if we know that the math
5232         // required to create it did not overflow before we extend it. Since
5233         // the original IR value was tossed in favor of a constant back when
5234         // the AddrMode was created we need to bail out gracefully if widths
5235         // do not match instead of extending it.
5236         Instruction *I = dyn_cast_or_null<Instruction>(Result);
5237         if (I && (Result != AddrMode.BaseReg))
5238           I->eraseFromParent();
5239         return Modified;
5240       }
5241       if (AddrMode.Scale != 1)
5242         V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
5243                               "sunkaddr");
5244       if (Result)
5245         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5246       else
5247         Result = V;
5248     }
5249 
5250     // Add in the BaseGV if present.
5251     if (AddrMode.BaseGV) {
5252       Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
5253       if (Result)
5254         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5255       else
5256         Result = V;
5257     }
5258 
5259     // Add in the Base Offset if present.
5260     if (AddrMode.BaseOffs) {
5261       Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
5262       if (Result)
5263         Result = Builder.CreateAdd(Result, V, "sunkaddr");
5264       else
5265         Result = V;
5266     }
5267 
5268     if (!Result)
5269       SunkAddr = Constant::getNullValue(Addr->getType());
5270     else
5271       SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
5272   }
5273 
5274   MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
5275   // Store the newly computed address into the cache. In the case we reused a
5276   // value, this should be idempotent.
5277   SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);
5278 
5279   // If we have no uses, recursively delete the value and all dead instructions
5280   // using it.
5281   if (Repl->use_empty()) {
5282     resetIteratorIfInvalidatedWhileCalling(CurInstIterator->getParent(), [&]() {
5283       RecursivelyDeleteTriviallyDeadInstructions(
5284           Repl, TLInfo, nullptr,
5285           [&](Value *V) { removeAllAssertingVHReferences(V); });
5286     });
5287   }
5288   ++NumMemoryInsts;
5289   return true;
5290 }
5291 
5292 /// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
5293 /// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
5294 /// only handle a 2 operand GEP in the same basic block or a splat constant
5295 /// vector. The 2 operands to the GEP must have a scalar pointer and a vector
5296 /// index.
5297 ///
5298 /// If the existing GEP has a vector base pointer that is splat, we can look
5299 /// through the splat to find the scalar pointer. If we can't find a scalar
5300 /// pointer there's nothing we can do.
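/// As an illustration of this splat-base case (made-up IR, names are
/// hypothetical):
/// \code
/// %pv   = insertelement <4 x i32*> undef, i32* %ptr, i32 0
/// %base = shufflevector <4 x i32*> %pv, <4 x i32*> undef,
///                       <4 x i32> zeroinitializer
/// %gep  = getelementptr i32, <4 x i32*> %base, <4 x i64> %idx
/// \endcode
/// Here the gather/scatter can instead use a GEP with the scalar pointer %ptr
/// as its base and %idx as its vector index.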
5301 ///
5302 /// If we have a GEP with more than 2 indices where the middle indices are all
5303 /// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
5304 ///
5305 /// If the final index isn't a vector or is a splat, we can emit a scalar GEP
5306 /// followed by a GEP with an all zeroes vector index. This will enable
5307 /// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
5308 /// zero index.
5309 bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
5310                                                Value *Ptr) {
5311   // FIXME: Support scalable vectors.
5312   if (isa<ScalableVectorType>(Ptr->getType()))
5313     return false;
5314 
5315   Value *NewAddr;
5316 
5317   if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
5318     // Don't optimize GEPs that don't have indices.
5319     if (!GEP->hasIndices())
5320       return false;
5321 
5322     // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
5323     // FIXME: We should support this by sinking the GEP.
5324     if (MemoryInst->getParent() != GEP->getParent())
5325       return false;
5326 
5327     SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end());
5328 
5329     bool RewriteGEP = false;
5330 
5331     if (Ops[0]->getType()->isVectorTy()) {
5332       Ops[0] = getSplatValue(Ops[0]);
5333       if (!Ops[0])
5334         return false;
5335       RewriteGEP = true;
5336     }
5337 
5338     unsigned FinalIndex = Ops.size() - 1;
5339 
5340     // Ensure all indices but the last are 0.
5341     // FIXME: This isn't strictly required. All that's required is that they are
5342     // all scalars or splats.
5343     for (unsigned i = 1; i < FinalIndex; ++i) {
5344       auto *C = dyn_cast<Constant>(Ops[i]);
5345       if (!C)
5346         return false;
5347       if (isa<VectorType>(C->getType()))
5348         C = C->getSplatValue();
5349       auto *CI = dyn_cast_or_null<ConstantInt>(C);
5350       if (!CI || !CI->isZero())
5351         return false;
5352       // Scalarize the index if needed.
5353       Ops[i] = CI;
5354     }
5355 
5356     // Try to scalarize the final index.
5357     if (Ops[FinalIndex]->getType()->isVectorTy()) {
5358       if (Value *V = getSplatValue(Ops[FinalIndex])) {
5359         auto *C = dyn_cast<ConstantInt>(V);
5360         // Don't scalarize an all-zeros vector.
5361         if (!C || !C->isZero()) {
5362           Ops[FinalIndex] = V;
5363           RewriteGEP = true;
5364         }
5365       }
5366     }
5367 
5368     // If we made any changes or we have extra operands, we need to generate
5369     // new instructions.
5370     if (!RewriteGEP && Ops.size() == 2)
5371       return false;
5372 
5373     unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements();
5374 
5375     IRBuilder<> Builder(MemoryInst);
5376 
5377     Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
5378 
5379     // If the final index isn't a vector, emit a scalar GEP containing all ops
5380     // and a vector GEP with all zeroes final index.
5381     if (!Ops[FinalIndex]->getType()->isVectorTy()) {
5382       NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front());
5383       auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts);
5384       NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy));
5385     } else {
5386       Value *Base = Ops[0];
5387       Value *Index = Ops[FinalIndex];
5388 
5389       // Create a scalar GEP if there are more than 2 operands.
5390       if (Ops.size() != 2) {
5391         // Replace the last index with 0.
5392         Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy);
5393         Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front());
5394       }
5395 
5396       // Now create the GEP with scalar pointer and vector index.
5397       NewAddr = Builder.CreateGEP(Base, Index);
5398     }
5399   } else if (!isa<Constant>(Ptr)) {
5400     // Not a GEP, maybe it's a splat and we can create a GEP to enable
5401     // SelectionDAGBuilder to use it as a uniform base.
5402     Value *V = getSplatValue(Ptr);
5403     if (!V)
5404       return false;
5405 
5406     unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements();
5407 
5408     IRBuilder<> Builder(MemoryInst);
5409 
5410     // Emit a vector GEP with a scalar pointer and all 0s vector index.
5411     Type *ScalarIndexTy = DL->getIndexType(V->getType()->getScalarType());
5412     auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts);
5413     NewAddr = Builder.CreateGEP(V, Constant::getNullValue(IndexTy));
5414   } else {
5415     // Constant, SelectionDAGBuilder knows to check if it's a splat.
5416     return false;
5417   }
5418 
5419   MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
5420 
5421   // If we have no uses, recursively delete the value and all dead instructions
5422   // using it.
5423   if (Ptr->use_empty())
5424     RecursivelyDeleteTriviallyDeadInstructions(
5425         Ptr, TLInfo, nullptr,
5426         [&](Value *V) { removeAllAssertingVHReferences(V); });
5427 
5428   return true;
5429 }
5430 
5431 /// If there are any memory operands, use OptimizeMemoryInst to sink their
5432 /// address computation into the block when possible / profitable.
5433 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
5434   bool MadeChange = false;
5435 
5436   const TargetRegisterInfo *TRI =
5437       TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
5438   TargetLowering::AsmOperandInfoVector TargetConstraints =
5439       TLI->ParseConstraints(*DL, TRI, *CS);
5440   unsigned ArgNo = 0;
5441   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
5442     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
5443 
5444     // Compute the constraint code and ConstraintType to use.
5445     TLI->ComputeConstraintToUse(OpInfo, SDValue());
5446 
5447     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
5448         OpInfo.isIndirect) {
5449       Value *OpVal = CS->getArgOperand(ArgNo++);
5450       MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
5451     } else if (OpInfo.Type == InlineAsm::isInput)
5452       ArgNo++;
5453   }
5454 
5455   return MadeChange;
5456 }
5457 
5458 /// Check if all the uses of \p Val are equivalent (or free) zero or
5459 /// sign extensions.
5460 static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
5461   assert(!Val->use_empty() && "Input must have at least one use");
5462   const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
5463   bool IsSExt = isa<SExtInst>(FirstUser);
5464   Type *ExtTy = FirstUser->getType();
5465   for (const User *U : Val->users()) {
5466     const Instruction *UI = cast<Instruction>(U);
5467     if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
5468       return false;
5469     Type *CurTy = UI->getType();
5470     // Same input and output types: Same instruction after CSE.
5471     if (CurTy == ExtTy)
5472       continue;
5473 
5474     // If IsSExt is true, we are in this situation:
5475     // a = Val
5476     // b = sext ty1 a to ty2
5477     // c = sext ty1 a to ty3
5478     // Assuming ty2 is shorter than ty3, this could be turned into:
5479     // a = Val
5480     // b = sext ty1 a to ty2
5481     // c = sext ty2 b to ty3
5482     // However, the last sext is not free.
5483     if (IsSExt)
5484       return false;
5485 
5486     // This is a ZExt; maybe it is free to extend from one type to another.
5487     // In that case, we would not account for a different use.
5488     Type *NarrowTy;
5489     Type *LargeTy;
5490     if (ExtTy->getScalarType()->getIntegerBitWidth() >
5491         CurTy->getScalarType()->getIntegerBitWidth()) {
5492       NarrowTy = CurTy;
5493       LargeTy = ExtTy;
5494     } else {
5495       NarrowTy = ExtTy;
5496       LargeTy = CurTy;
5497     }
5498 
5499     if (!TLI.isZExtFree(NarrowTy, LargeTy))
5500       return false;
5501   }
5502   // All uses are the same or can be derived from one another for free.
5503   return true;
5504 }
5505 
5506 /// Try to speculatively promote extensions in \p Exts and continue
5507 /// promoting through newly promoted operands recursively as far as doing so is
5508 /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
5509 /// When some promotion happened, \p TPT contains the proper state to revert
5510 /// them.
5511 ///
5512 /// \return true if some promotion happened, false otherwise.
5513 bool CodeGenPrepare::tryToPromoteExts(
5514     TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
5515     SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
5516     unsigned CreatedInstsCost) {
5517   bool Promoted = false;
5518 
5519   // Iterate over all the extensions to try to promote them.
5520   for (auto *I : Exts) {
5521     // Early check if we directly have ext(load).
5522     if (isa<LoadInst>(I->getOperand(0))) {
5523       ProfitablyMovedExts.push_back(I);
5524       continue;
5525     }
5526 
5527     // Check whether or not we want to do any promotion.  The reason we have
5528     // this check inside the for loop is to catch the case where an extension
5529     // is directly fed by a load, because in that case the extension can be
5530     // moved up without any promotion of its operands.
5531     if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)
5532       return false;
5533 
5534     // Get the action to perform the promotion.
5535     TypePromotionHelper::Action TPH =
5536         TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
5537     // Check if we can promote.
5538     if (!TPH) {
5539       // Save the current extension as we cannot move up through its operand.
5540       ProfitablyMovedExts.push_back(I);
5541       continue;
5542     }
5543 
5544     // Save the current state.
5545     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5546         TPT.getRestorationPoint();
5547     SmallVector<Instruction *, 4> NewExts;
5548     unsigned NewCreatedInstsCost = 0;
5549     unsigned ExtCost = !TLI->isExtFree(I);
5550     // Promote.
5551     Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
5552                              &NewExts, nullptr, *TLI);
5553     assert(PromotedVal &&
5554            "TypePromotionHelper should have filtered out those cases");
5555 
5556     // We can merge only one extension into a load.
5557     // Therefore, if we have more than 1 new extension we heuristically
5558     // cut this search path, because it means we degrade the code quality.
5559     // With exactly 2, the transformation is neutral, because we will merge
5560     // one extension but leave one. However, we optimistically keep going,
5561     // because the new extension may be removed too.
5562     long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
5563     // FIXME: It would be possible to propagate a negative value instead of
5564     // conservatively ceiling it to 0.
5565     TotalCreatedInstsCost =
5566         std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
5567     if (!StressExtLdPromotion &&
5568         (TotalCreatedInstsCost > 1 ||
5569          !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
5570       // This promotion is not profitable, rollback to the previous state, and
5571       // save the current extension in ProfitablyMovedExts as the latest
5572       // speculative promotion turned out to be unprofitable.
5573       TPT.rollback(LastKnownGood);
5574       ProfitablyMovedExts.push_back(I);
5575       continue;
5576     }
5577     // Continue promoting NewExts as far as doing so is profitable.
5578     SmallVector<Instruction *, 2> NewlyMovedExts;
5579     (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
5580     bool NewPromoted = false;
5581     for (auto *ExtInst : NewlyMovedExts) {
5582       Instruction *MovedExt = cast<Instruction>(ExtInst);
5583       Value *ExtOperand = MovedExt->getOperand(0);
5584       // If we have reached a load, we need this extra profitability check
5585       // as it could potentially be merged into an ext(load).
5586       if (isa<LoadInst>(ExtOperand) &&
5587           !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
5588             (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
5589         continue;
5590 
5591       ProfitablyMovedExts.push_back(MovedExt);
5592       NewPromoted = true;
5593     }
5594 
5595     // If none of the speculative promotions for NewExts is profitable, roll
5596     // back and save the current extension (I) as the last profitable one.
5597     if (!NewPromoted) {
5598       TPT.rollback(LastKnownGood);
5599       ProfitablyMovedExts.push_back(I);
5600       continue;
5601     }
5602     // The promotion is profitable.
5603     Promoted = true;
5604   }
5605   return Promoted;
5606 }
5607 
5608 /// Merge redundant sexts when one dominates the other.
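/// For instance (illustrative IR), if ValToSExtendedUses records both of the
/// following extensions of %a and the first dominates the second:
/// \code
/// %s1 = sext i32 %a to i64
/// ...
/// %s2 = sext i32 %a to i64
/// \endcode
/// then the uses of %s2 are rewritten to use %s1 and %s2 is removed.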
5609 bool CodeGenPrepare::mergeSExts(Function &F) {
5610   bool Changed = false;
5611   for (auto &Entry : ValToSExtendedUses) {
5612     SExts &Insts = Entry.second;
5613     SExts CurPts;
5614     for (Instruction *Inst : Insts) {
5615       if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
5616           Inst->getOperand(0) != Entry.first)
5617         continue;
5618       bool inserted = false;
5619       for (auto &Pt : CurPts) {
5620         if (getDT(F).dominates(Inst, Pt)) {
5621           Pt->replaceAllUsesWith(Inst);
5622           RemovedInsts.insert(Pt);
5623           Pt->removeFromParent();
5624           Pt = Inst;
5625           inserted = true;
5626           Changed = true;
5627           break;
5628         }
5629         if (!getDT(F).dominates(Pt, Inst))
5630           // Give up if we need to merge in a common dominator as the
5631           // experiments show it is not profitable.
5632           continue;
5633         Inst->replaceAllUsesWith(Pt);
5634         RemovedInsts.insert(Inst);
5635         Inst->removeFromParent();
5636         inserted = true;
5637         Changed = true;
5638         break;
5639       }
5640       if (!inserted)
5641         CurPts.push_back(Inst);
5642     }
5643   }
5644   return Changed;
5645 }
5646 
5647 // Split large data structures so that the GEPs accessing them can have
5648 // smaller offsets, allowing them to be sunk to the same blocks as their users.
5649 // For example, a large struct starting from %base is split into two parts
5650 // where the second part starts from %new_base.
5651 //
5652 // Before:
5653 // BB0:
5654 //   %base     =
5655 //
5656 // BB1:
5657 //   %gep0     = gep %base, off0
5658 //   %gep1     = gep %base, off1
5659 //   %gep2     = gep %base, off2
5660 //
5661 // BB2:
5662 //   %load1    = load %gep0
5663 //   %load2    = load %gep1
5664 //   %load3    = load %gep2
5665 //
5666 // After:
5667 // BB0:
5668 //   %base     =
5669 //   %new_base = gep %base, off0
5670 //
5671 // BB1:
5672 //   %new_gep0 = %new_base
5673 //   %new_gep1 = gep %new_base, off1 - off0
5674 //   %new_gep2 = gep %new_base, off2 - off0
5675 //
5676 // BB2:
5677 //   %load1    = load i32, i32* %new_gep0
5678 //   %load2    = load i32, i32* %new_gep1
5679 //   %load3    = load i32, i32* %new_gep2
5680 //
5681 // %new_gep1 and %new_gep2 can be sunk to BB2 now, after the splitting, because
5682 // their offsets are small enough to fit into the addressing mode.
5683 bool CodeGenPrepare::splitLargeGEPOffsets() {
5684   bool Changed = false;
5685   for (auto &Entry : LargeOffsetGEPMap) {
5686     Value *OldBase = Entry.first;
5687     SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
5688         &LargeOffsetGEPs = Entry.second;
5689     auto compareGEPOffset =
5690         [&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
5691             const std::pair<GetElementPtrInst *, int64_t> &RHS) {
5692           if (LHS.first == RHS.first)
5693             return false;
5694           if (LHS.second != RHS.second)
5695             return LHS.second < RHS.second;
5696           return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
5697         };
5698     // Sort all the GEPs of the same data structure based on the offsets.
5699     llvm::sort(LargeOffsetGEPs, compareGEPOffset);
5700     LargeOffsetGEPs.erase(
5701         std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
5702         LargeOffsetGEPs.end());
5703     // Skip if all the GEPs have the same offsets.
5704     if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
5705       continue;
5706     GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
5707     int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
5708     Value *NewBaseGEP = nullptr;
5709 
5710     auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
5711     while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
5712       GetElementPtrInst *GEP = LargeOffsetGEP->first;
5713       int64_t Offset = LargeOffsetGEP->second;
5714       if (Offset != BaseOffset) {
5715         TargetLowering::AddrMode AddrMode;
5716         AddrMode.BaseOffs = Offset - BaseOffset;
5717         // The result type of the GEP might not be the type of the memory
5718         // access.
5719         if (!TLI->isLegalAddressingMode(*DL, AddrMode,
5720                                         GEP->getResultElementType(),
5721                                         GEP->getAddressSpace())) {
5722           // We need to create a new base if the offset to the current base is
5723           // too large to fit into the addressing mode. So, a very large struct
5724           // may be split into several parts.
5725           BaseGEP = GEP;
5726           BaseOffset = Offset;
5727           NewBaseGEP = nullptr;
5728         }
5729       }
5730 
5731       // Generate a new GEP to replace the current one.
5732       LLVMContext &Ctx = GEP->getContext();
5733       Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
5734       Type *I8PtrTy =
5735           Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
5736       Type *I8Ty = Type::getInt8Ty(Ctx);
5737 
5738       if (!NewBaseGEP) {
5739         // Create a new base if we don't have one yet.  Find the insertion
5740         // point for the new base first.
5741         BasicBlock::iterator NewBaseInsertPt;
5742         BasicBlock *NewBaseInsertBB;
5743         if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
5744           // If the base of the struct is an instruction, the new base will be
5745           // inserted close to it.
5746           NewBaseInsertBB = BaseI->getParent();
5747           if (isa<PHINode>(BaseI))
5748             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5749           else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
5750             NewBaseInsertBB =
5751                 SplitEdge(NewBaseInsertBB, Invoke->getNormalDest());
5752             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5753           } else
5754             NewBaseInsertPt = std::next(BaseI->getIterator());
5755         } else {
5756           // If the current base is an argument or global value, the new base
5757           // will be inserted into the entry block.
5758           NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
5759           NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
5760         }
5761         IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
5762         // Create a new base.
5763         Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset);
5764         NewBaseGEP = OldBase;
5765         if (NewBaseGEP->getType() != I8PtrTy)
5766           NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
5767         NewBaseGEP =
5768             NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
5769         NewGEPBases.insert(NewBaseGEP);
5770       }
5771 
5772       IRBuilder<> Builder(GEP);
5773       Value *NewGEP = NewBaseGEP;
5774       if (Offset == BaseOffset) {
5775         if (GEP->getType() != I8PtrTy)
5776           NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
5777       } else {
5778         // Calculate the new offset for the new GEP.
5779         Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset);
5780         NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
5781 
5782         if (GEP->getType() != I8PtrTy)
5783           NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
5784       }
5785       GEP->replaceAllUsesWith(NewGEP);
5786       LargeOffsetGEPID.erase(GEP);
5787       LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
5788       GEP->eraseFromParent();
5789       Changed = true;
5790     }
5791   }
5792   return Changed;
5793 }
5794 
5795 bool CodeGenPrepare::optimizePhiType(
5796     PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,
5797     SmallPtrSetImpl<Instruction *> &DeletedInstrs) {
5798   // We are looking for a collection of interconnected phi nodes that together
5799   // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts
5800   // are of the same type. Convert the whole set of nodes to the type of the
5801   // bitcast.
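  //
  // A minimal sketch of the kind of IR this targets (names are made up, and
  // it assumes the target's shouldConvertPhiType(i32, float) hook approves
  // the conversion):
  //   %fi  = bitcast float %farg to i32
  //   %phi = phi i32 [ %fi, %bb ], [ undef, %entry ]
  //   %fo  = bitcast i32 %phi to float
  // Here the phi can be rewritten as a phi of float fed directly by %farg,
  // users of %fo are redirected to the new phi, and both bitcasts become dead.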
5802   Type *PhiTy = I->getType();
5803   Type *ConvertTy = nullptr;
5804   if (Visited.count(I) ||
5805       (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))
5806     return false;
5807 
5808   SmallVector<Instruction *, 4> Worklist;
5809   Worklist.push_back(cast<Instruction>(I));
5810   SmallPtrSet<PHINode *, 4> PhiNodes;
5811   PhiNodes.insert(I);
5812   Visited.insert(I);
5813   SmallPtrSet<Instruction *, 4> Defs;
5814   SmallPtrSet<Instruction *, 4> Uses;
5815   // This works by adding extra bitcasts between loads/stores and removing
5816   // existing bitcasts. If we have a phi(bitcast(load)) or a store(bitcast(phi))
5817   // we can get in the situation where we remove a bitcast in one iteration
5818   // just to add it again in the next. We need to ensure that at least one
5819   // bitcast we remove is anchored to something that will not change back.
5820   bool AnyAnchored = false;
5821 
5822   while (!Worklist.empty()) {
5823     Instruction *II = Worklist.pop_back_val();
5824 
5825     if (auto *Phi = dyn_cast<PHINode>(II)) {
5826       // Handle Defs, which might also be PHIs.
5827       for (Value *V : Phi->incoming_values()) {
5828         if (auto *OpPhi = dyn_cast<PHINode>(V)) {
5829           if (!PhiNodes.count(OpPhi)) {
5830             if (Visited.count(OpPhi))
5831               return false;
5832             PhiNodes.insert(OpPhi);
5833             Visited.insert(OpPhi);
5834             Worklist.push_back(OpPhi);
5835           }
5836         } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) {
5837           if (!OpLoad->isSimple())
5838             return false;
5839           if (!Defs.count(OpLoad)) {
5840             Defs.insert(OpLoad);
5841             Worklist.push_back(OpLoad);
5842           }
5843         } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) {
5844           if (!Defs.count(OpEx)) {
5845             Defs.insert(OpEx);
5846             Worklist.push_back(OpEx);
5847           }
5848         } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
5849           if (!ConvertTy)
5850             ConvertTy = OpBC->getOperand(0)->getType();
5851           if (OpBC->getOperand(0)->getType() != ConvertTy)
5852             return false;
5853           if (!Defs.count(OpBC)) {
5854             Defs.insert(OpBC);
5855             Worklist.push_back(OpBC);
5856             AnyAnchored |= !isa<LoadInst>(OpBC->getOperand(0)) &&
5857                            !isa<ExtractElementInst>(OpBC->getOperand(0));
5858           }
5859         } else if (!isa<UndefValue>(V)) {
5860           return false;
5861         }
5862       }
5863     }
5864 
5865     // Handle uses, which might also be PHIs.
5866     for (User *V : II->users()) {
5867       if (auto *OpPhi = dyn_cast<PHINode>(V)) {
5868         if (!PhiNodes.count(OpPhi)) {
5869           if (Visited.count(OpPhi))
5870             return false;
5871           PhiNodes.insert(OpPhi);
5872           Visited.insert(OpPhi);
5873           Worklist.push_back(OpPhi);
5874         }
5875       } else if (auto *OpStore = dyn_cast<StoreInst>(V)) {
5876         if (!OpStore->isSimple() || OpStore->getOperand(0) != II)
5877           return false;
5878         Uses.insert(OpStore);
5879       } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
5880         if (!ConvertTy)
5881           ConvertTy = OpBC->getType();
5882         if (OpBC->getType() != ConvertTy)
5883           return false;
5884         Uses.insert(OpBC);
5885         AnyAnchored |=
5886             any_of(OpBC->users(), [](User *U) { return !isa<StoreInst>(U); });
5887       } else {
5888         return false;
5889       }
5890     }
5891   }
5892 
5893   if (!ConvertTy || !AnyAnchored || !TLI->shouldConvertPhiType(PhiTy, ConvertTy))
5894     return false;
5895 
5896   LLVM_DEBUG(dbgs() << "Converting " << *I << "\n  and connected nodes to "
5897                     << *ConvertTy << "\n");
5898 
5899   // Create all the new phi nodes of the new type, and bitcast any loads to the
5900   // correct type.
5901   ValueToValueMap ValMap;
5902   ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy);
5903   for (Instruction *D : Defs) {
5904     if (isa<BitCastInst>(D)) {
5905       ValMap[D] = D->getOperand(0);
5906       DeletedInstrs.insert(D);
5907     } else {
5908       ValMap[D] =
5909           new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
5910     }
5911   }
5912   for (PHINode *Phi : PhiNodes)
5913     ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(),
5914                                   Phi->getName() + ".tc", Phi);
5915   // Pipe together all the PhiNodes.
5916   for (PHINode *Phi : PhiNodes) {
5917     PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
5918     for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)
5919       NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)],
5920                           Phi->getIncomingBlock(i));
5921     Visited.insert(NewPhi);
5922   }
5923   // And finally pipe up the stores and bitcasts
5924   for (Instruction *U : Uses) {
5925     if (isa<BitCastInst>(U)) {
5926       DeletedInstrs.insert(U);
5927       U->replaceAllUsesWith(ValMap[U->getOperand(0)]);
5928     } else {
5929       U->setOperand(0,
5930                     new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
5931     }
5932   }
5933 
5934   // Save the removed phis to be deleted later.
5935   for (PHINode *Phi : PhiNodes)
5936     DeletedInstrs.insert(Phi);
5937   return true;
5938 }
5939 
5940 bool CodeGenPrepare::optimizePhiTypes(Function &F) {
5941   if (!OptimizePhiTypes)
5942     return false;
5943 
5944   bool Changed = false;
5945   SmallPtrSet<PHINode *, 4> Visited;
5946   SmallPtrSet<Instruction *, 4> DeletedInstrs;
5947 
5948   // Attempt to optimize all the phis in the function to the correct type.
5949   for (auto &BB : F)
5950     for (auto &Phi : BB.phis())
5951       Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs);
5952 
5953   // Remove any old phi's that have been converted.
5954   for (auto *I : DeletedInstrs) {
5955     I->replaceAllUsesWith(UndefValue::get(I->getType()));
5956     I->eraseFromParent();
5957   }
5958 
5959   return Changed;
5960 }
5961 
5962 /// Return true if an ext(load) can be formed from an extension in
5963 /// \p MovedExts.
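/// E.g. (illustrative IR):
/// \code
/// %ld = load i16, i16* %p
/// %z  = zext i16 %ld to i32
/// \endcode
/// Here \p LI would be %ld and \p Inst would be %z, provided the target
/// reports this pair as a legal extending load.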
5964 bool CodeGenPrepare::canFormExtLd(
5965     const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
5966     Instruction *&Inst, bool HasPromoted) {
5967   for (auto *MovedExtInst : MovedExts) {
5968     if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
5969       LI = cast<LoadInst>(MovedExtInst->getOperand(0));
5970       Inst = MovedExtInst;
5971       break;
5972     }
5973   }
5974   if (!LI)
5975     return false;
5976 
5977   // If they're already in the same block, there's nothing to do.
5978   // Make the cheap checks first if we did not promote.
5979   // If we promoted, we need to check if it is indeed profitable.
5980   if (!HasPromoted && LI->getParent() == Inst->getParent())
5981     return false;
5982 
5983   return TLI->isExtLoad(LI, Inst, *DL);
5984 }
5985 
5986 /// Move a zext or sext fed by a load into the same basic block as the load,
5987 /// unless conditions are unfavorable. This allows SelectionDAG to fold the
5988 /// extend into the load.
5989 ///
5990 /// E.g.,
5991 /// \code
5992 /// %ld = load i32* %addr
5993 /// %add = add nuw i32 %ld, 4
5994 /// %zext = zext i32 %add to i64
5995 /// \endcode
5996 /// =>
5997 /// \code
5998 /// %ld = load i32* %addr
5999 /// %zext = zext i32 %ld to i64
6000 /// %add = add nuw i64 %zext, 4
6001 /// \endcode
6002 /// Note that the promotion of %add to i64 is done in tryToPromoteExts(), which
6003 /// allows us to match zext(load i32*) to i64.
6004 ///
6005 /// Also, try to promote the computations used to obtain a sign-extended
6006 /// value used in memory accesses.
6007 /// E.g.,
6008 /// \code
6009 /// a = add nsw i32 b, 3
6010 /// d = sext i32 a to i64
6011 /// e = getelementptr ..., i64 d
6012 /// \endcode
6013 /// =>
6014 /// \code
6015 /// f = sext i32 b to i64
6016 /// a = add nsw i64 f, 3
6017 /// e = getelementptr ..., i64 a
6018 /// \endcode
6019 ///
6020 /// \p Inst[in/out] the extension may be modified during the process if some
6021 /// promotions apply.
6022 bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
6023   bool AllowPromotionWithoutCommonHeader = false;
6024   /// See if it is an interesting sext operation for the address type
6025   /// promotion before trying to promote it, e.g., the ones with the right
6026   /// type and used in memory accesses.
6027   bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
6028       *Inst, AllowPromotionWithoutCommonHeader);
6029   TypePromotionTransaction TPT(RemovedInsts);
6030   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
6031       TPT.getRestorationPoint();
6032   SmallVector<Instruction *, 1> Exts;
6033   SmallVector<Instruction *, 2> SpeculativelyMovedExts;
6034   Exts.push_back(Inst);
6035 
6036   bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
6037 
6038   // Look for a load being extended.
6039   LoadInst *LI = nullptr;
6040   Instruction *ExtFedByLoad;
6041 
6042   // Try to promote a chain of computation if doing so allows an extended
6043   // load to be formed.
6044   if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
6045     assert(LI && ExtFedByLoad && "Expect a valid load and extension");
6046     TPT.commit();
6047     // Move the extend into the same block as the load.
6048     ExtFedByLoad->moveAfter(LI);
6049     ++NumExtsMoved;
6050     Inst = ExtFedByLoad;
6051     return true;
6052   }
6053 
6054   // Continue promoting SExts if the target considers this worthwhile.
6055   if (ATPConsiderable &&
6056       performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
6057                                   HasPromoted, TPT, SpeculativelyMovedExts))
6058     return true;
6059 
6060   TPT.rollback(LastKnownGood);
6061   return false;
6062 }
6063 
6064 // Perform address type promotion if doing so is profitable.
6065 // If AllowPromotionWithoutCommonHeader == false, we should find other sext
6066 // instructions that sign extended the same initial value. However, if
6067 // AllowPromotionWithoutCommonHeader == true, we assume that promoting the
6068 // extension is profitable by itself.
6069 bool CodeGenPrepare::performAddressTypePromotion(
6070     Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
6071     bool HasPromoted, TypePromotionTransaction &TPT,
6072     SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
6073   bool Promoted = false;
6074   SmallPtrSet<Instruction *, 1> UnhandledExts;
6075   bool AllSeenFirst = true;
6076   for (auto *I : SpeculativelyMovedExts) {
6077     Value *HeadOfChain = I->getOperand(0);
6078     DenseMap<Value *, Instruction *>::iterator AlreadySeen =
6079         SeenChainsForSExt.find(HeadOfChain);
6080     // If there is an unhandled SExt which has the same header, try to promote
6081     // it as well.
6082     if (AlreadySeen != SeenChainsForSExt.end()) {
6083       if (AlreadySeen->second != nullptr)
6084         UnhandledExts.insert(AlreadySeen->second);
6085       AllSeenFirst = false;
6086     }
6087   }
6088 
6089   if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
6090                         SpeculativelyMovedExts.size() == 1)) {
6091     TPT.commit();
6092     if (HasPromoted)
6093       Promoted = true;
6094     for (auto *I : SpeculativelyMovedExts) {
6095       Value *HeadOfChain = I->getOperand(0);
6096       SeenChainsForSExt[HeadOfChain] = nullptr;
6097       ValToSExtendedUses[HeadOfChain].push_back(I);
6098     }
6099     // Update Inst as promotion happened.
6100     Inst = SpeculativelyMovedExts.pop_back_val();
6101   } else {
6102     // This is the first chain visited from the header, keep the current chain
6103     // as unhandled. Defer promoting it until we encounter another SExt
6104     // chain derived from the same header.
6105     for (auto *I : SpeculativelyMovedExts) {
6106       Value *HeadOfChain = I->getOperand(0);
6107       SeenChainsForSExt[HeadOfChain] = Inst;
6108     }
6109     return false;
6110   }
6111 
6112   if (!AllSeenFirst && !UnhandledExts.empty())
6113     for (auto *VisitedSExt : UnhandledExts) {
6114       if (RemovedInsts.count(VisitedSExt))
6115         continue;
6116       TypePromotionTransaction TPT(RemovedInsts);
6117       SmallVector<Instruction *, 1> Exts;
6118       SmallVector<Instruction *, 2> Chains;
6119       Exts.push_back(VisitedSExt);
6120       bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
6121       TPT.commit();
6122       if (HasPromoted)
6123         Promoted = true;
6124       for (auto *I : Chains) {
6125         Value *HeadOfChain = I->getOperand(0);
6126         // Mark this as handled.
6127         SeenChainsForSExt[HeadOfChain] = nullptr;
6128         ValToSExtendedUses[HeadOfChain].push_back(I);
6129       }
6130     }
6131   return Promoted;
6132 }
6133 
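/// If both the result of a zext/sext and its source value are live outside the
/// defining block, rewrite the out-of-block uses of the source to go through a
/// truncate of the extension, so that only the extended value needs to be live
/// across blocks. A made-up example:
/// \code
/// bb0:
///   %x = ...
///   %ext = zext i32 %x to i64
/// bb1:
///   %t = trunc i64 %ext to i32   ; inserted by this transform
///   %u = add i32 %t, 1           ; was: add i32 %x, 1
/// \endcode
/// This is only done when the target reports the truncate as free.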
6134 bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
6135   BasicBlock *DefBB = I->getParent();
6136 
6137   // If the result of a {s|z}ext and its source are both live out, rewrite all
6138   // other uses of the source with a truncate of the extension's result.
6139   Value *Src = I->getOperand(0);
6140   if (Src->hasOneUse())
6141     return false;
6142 
6143   // Only do this xform if truncating is free.
6144   if (!TLI->isTruncateFree(I->getType(), Src->getType()))
6145     return false;
6146 
6147   // Only safe to perform the optimization if the source is also defined in
6148   // this block.
6149   if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
6150     return false;
6151 
6152   bool DefIsLiveOut = false;
6153   for (User *U : I->users()) {
6154     Instruction *UI = cast<Instruction>(U);
6155 
6156     // Figure out which BB this ext is used in.
6157     BasicBlock *UserBB = UI->getParent();
6158     if (UserBB == DefBB) continue;
6159     DefIsLiveOut = true;
6160     break;
6161   }
6162   if (!DefIsLiveOut)
6163     return false;
6164 
6165   // Make sure none of the uses are PHI nodes.
6166   for (User *U : Src->users()) {
6167     Instruction *UI = cast<Instruction>(U);
6168     BasicBlock *UserBB = UI->getParent();
6169     if (UserBB == DefBB) continue;
6170     // Be conservative. We don't want this xform to end up introducing
6171     // reloads just before load / store instructions.
6172     if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
6173       return false;
6174   }
6175 
6176   // InsertedTruncs - Only insert one trunc in each block.
6177   DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
6178 
6179   bool MadeChange = false;
6180   for (Use &U : Src->uses()) {
6181     Instruction *User = cast<Instruction>(U.getUser());
6182 
6183     // Figure out which BB this use of the source is in.
6184     BasicBlock *UserBB = User->getParent();
6185     if (UserBB == DefBB) continue;
6186 
6187     // Both src and def are live in this block. Rewrite the use.
6188     Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
6189 
6190     if (!InsertedTrunc) {
6191       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
6192       assert(InsertPt != UserBB->end());
6193       InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
6194       InsertedInsts.insert(InsertedTrunc);
6195     }
6196 
6197     // Replace a use of the {s|z}ext source with a use of the result.
6198     U = InsertedTrunc;
6199     ++NumExtUses;
6200     MadeChange = true;
6201   }
6202 
6203   return MadeChange;
6204 }
6205 
6206 // Find loads whose uses only use some of the loaded value's bits.  Add an "and"
6207 // just after the load if the target can fold this into one extload instruction,
6208 // with the hope of eliminating some of the other later "and" instructions using
6209 // the loaded value.  "and"s that are made trivially redundant by the insertion
6210 // of the new "and" are removed by this function, while others (e.g. those whose
6211 // path from the load goes through a phi) are left for isel to potentially
6212 // remove.
6213 //
6214 // For example:
6215 //
6216 // b0:
6217 //   x = load i32
6218 //   ...
6219 // b1:
6220 //   y = and x, 0xff
6221 //   z = use y
6222 //
6223 // becomes:
6224 //
6225 // b0:
6226 //   x = load i32
6227 //   x' = and x, 0xff
6228 //   ...
6229 // b1:
6230 //   z = use x'
6231 //
6232 // whereas:
6233 //
6234 // b0:
6235 //   x1 = load i32
6236 //   ...
6237 // b1:
6238 //   x2 = load i32
6239 //   ...
6240 // b2:
6241 //   x = phi x1, x2
6242 //   y = and x, 0xff
6243 //
6244 // becomes (after a call to optimizeLoadExt for each load):
6245 //
6246 // b0:
6247 //   x1 = load i32
6248 //   x1' = and x1, 0xff
6249 //   ...
6250 // b1:
6251 //   x2 = load i32
6252 //   x2' = and x2, 0xff
6253 //   ...
6254 // b2:
6255 //   x = phi x1', x2'
6256 //   y = and x, 0xff
6257 bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
6258   if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
6259     return false;
6260 
6261   // Skip loads we've already transformed.
6262   if (Load->hasOneUse() &&
6263       InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
6264     return false;
6265 
6266   // Look at all uses of Load, looking through phis, to determine how many bits
6267   // of the loaded value are needed.
6268   SmallVector<Instruction *, 8> WorkList;
6269   SmallPtrSet<Instruction *, 16> Visited;
6270   SmallVector<Instruction *, 8> AndsToMaybeRemove;
6271   for (auto *U : Load->users())
6272     WorkList.push_back(cast<Instruction>(U));
6273 
6274   EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
6275   unsigned BitWidth = LoadResultVT.getSizeInBits();
6276   APInt DemandBits(BitWidth, 0);
6277   APInt WidestAndBits(BitWidth, 0);
6278 
6279   while (!WorkList.empty()) {
6280     Instruction *I = WorkList.back();
6281     WorkList.pop_back();
6282 
6283     // Break use-def graph loops.
6284     if (!Visited.insert(I).second)
6285       continue;
6286 
6287     // For a PHI node, push all of its users.
6288     if (auto *Phi = dyn_cast<PHINode>(I)) {
6289       for (auto *U : Phi->users())
6290         WorkList.push_back(cast<Instruction>(U));
6291       continue;
6292     }
6293 
6294     switch (I->getOpcode()) {
6295     case Instruction::And: {
6296       auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
6297       if (!AndC)
6298         return false;
6299       APInt AndBits = AndC->getValue();
6300       DemandBits |= AndBits;
6301       // Keep track of the widest and mask we see.
6302       if (AndBits.ugt(WidestAndBits))
6303         WidestAndBits = AndBits;
6304       if (AndBits == WidestAndBits && I->getOperand(0) == Load)
6305         AndsToMaybeRemove.push_back(I);
6306       break;
6307     }
6308 
6309     case Instruction::Shl: {
6310       auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
6311       if (!ShlC)
6312         return false;
6313       uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
6314       DemandBits.setLowBits(BitWidth - ShiftAmt);
6315       break;
6316     }
6317 
6318     case Instruction::Trunc: {
6319       EVT TruncVT = TLI->getValueType(*DL, I->getType());
6320       unsigned TruncBitWidth = TruncVT.getSizeInBits();
6321       DemandBits.setLowBits(TruncBitWidth);
6322       break;
6323     }
6324 
6325     default:
6326       return false;
6327     }
6328   }
6329 
6330   uint32_t ActiveBits = DemandBits.getActiveBits();
6331   // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
6332   // target even if isLoadExtLegal says an i1 EXTLOAD is valid.  For example,
6333   // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
6334   // (and (load x) 1) is not matched as a single instruction, rather as a LDR
6335   // followed by an AND.
6336   // TODO: Look into removing this restriction by fixing backends to either
6337   // return false for isLoadExtLegal for i1 or have them select this pattern to
6338   // a single instruction.
6339   //
6340   // Also avoid hoisting if we didn't see any ands with the exact DemandBits
6341   // mask, since these are the only ands that will be removed by isel.
6342   if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
6343       WidestAndBits != DemandBits)
6344     return false;
6345 
6346   LLVMContext &Ctx = Load->getType()->getContext();
6347   Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
6348   EVT TruncVT = TLI->getValueType(*DL, TruncTy);
6349 
6350   // Reject cases that won't be matched as extloads.
6351   if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
6352       !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
6353     return false;
6354 
6355   IRBuilder<> Builder(Load->getNextNode());
6356   auto *NewAnd = cast<Instruction>(
6357       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
6358   // Mark this instruction as "inserted by CGP", so that other
6359   // optimizations don't touch it.
6360   InsertedInsts.insert(NewAnd);
6361 
6362   // Replace all uses of load with new and (except for the use of load in the
6363   // new and itself).
6364   Load->replaceAllUsesWith(NewAnd);
6365   NewAnd->setOperand(0, Load);
6366 
6367   // Remove any and instructions that are now redundant.
6368   for (auto *And : AndsToMaybeRemove)
6369     // Check that the and mask is the same as the one we decided to put on the
6370     // new and.
6371     if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
6372       And->replaceAllUsesWith(NewAnd);
6373       if (&*CurInstIterator == And)
6374         CurInstIterator = std::next(And->getIterator());
6375       And->eraseFromParent();
6376       ++NumAndUses;
6377     }
6378 
6379   ++NumAndsAdded;
6380   return true;
6381 }
6382 
6383 /// Check if V (an operand of a select instruction) is an expensive instruction
6384 /// that is only used once.
6385 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
6386   auto *I = dyn_cast<Instruction>(V);
6387   // If it's safe to speculatively execute, then it should not have side
6388   // effects; therefore, it's safe to sink and possibly *not* execute.
6389   return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
6390          TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
6391          TargetTransformInfo::TCC_Expensive;
6392 }
6393 
6394 /// Returns true if a SelectInst should be turned into an explicit branch.
6395 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
6396                                                 const TargetLowering *TLI,
6397                                                 SelectInst *SI) {
6398   // If even a predictable select is cheap, then a branch can't be cheaper.
6399   if (!TLI->isPredictableSelectExpensive())
6400     return false;
6401 
6402   // FIXME: This should use the same heuristics as IfConversion to determine
6403   // whether a select is better represented as a branch.
6404 
6405   // If metadata tells us that the select condition is obviously predictable,
6406   // then we want to replace the select with a branch.
6407   uint64_t TrueWeight, FalseWeight;
6408   if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
6409     uint64_t Max = std::max(TrueWeight, FalseWeight);
6410     uint64_t Sum = TrueWeight + FalseWeight;
6411     if (Sum != 0) {
6412       auto Probability = BranchProbability::getBranchProbability(Max, Sum);
6413       if (Probability > TLI->getPredictableBranchThreshold())
6414         return true;
6415     }
6416   }
6417 
6418   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
6419 
6420   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
6421   // comparison condition. If the compare has more than one use, there's
6422   // probably another cmov or setcc around, so it's not worth emitting a branch.
6423   if (!Cmp || !Cmp->hasOneUse())
6424     return false;
6425 
6426   // If either operand of the select is expensive and only needed on one side
6427   // of the select, we should form a branch.
6428   if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
6429       sinkSelectOperand(TTI, SI->getFalseValue()))
6430     return true;
6431 
6432   return false;
6433 }
6434 
6435 /// If \p isTrue is true, return the true value of \p SI, otherwise return
6436 /// false value of \p SI. If the true/false value of \p SI is defined by any
6437 /// select instructions in \p Selects, look through the defining select
6438 /// instruction until the true/false value is not defined in \p Selects.
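/// For example (illustrative), given
/// \code
/// %sel1 = select i1 %c, i32 %a, i32 %b
/// %sel2 = select i1 %c, i32 %sel1, i32 %d
/// \endcode
/// with \p Selects containing both selects, the true value of %sel2 is %a
/// (looking through %sel1) and its false value is %d.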
6439 static Value *getTrueOrFalseValue(
6440     SelectInst *SI, bool isTrue,
6441     const SmallPtrSet<const Instruction *, 2> &Selects) {
6442   Value *V = nullptr;
6443 
6444   for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
6445        DefSI = dyn_cast<SelectInst>(V)) {
6446     assert(DefSI->getCondition() == SI->getCondition() &&
6447            "The condition of DefSI does not match with SI");
6448     V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
6449   }
6450 
6451   assert(V && "Failed to get select true/false value");
6452   return V;
6453 }
6454 
6455 bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
6456   assert(Shift->isShift() && "Expected a shift");
6457 
6458   // If this is (1) a vector shift, (2) shifts by scalars are cheaper than
6459   // general vector shifts, and (3) the shift amount is a select-of-splatted
6460   // values, hoist the shifts before the select:
6461   //   shift Op0, (select Cond, TVal, FVal) -->
6462   //   select Cond, (shift Op0, TVal), (shift Op0, FVal)
6463   //
6464   // This is inverting a generic IR transform when we know that the cost of a
6465   // general vector shift is more than the cost of 2 shift-by-scalars.
6466   // We can't do this effectively in SDAG because we may not be able to
6467   // determine if the select operands are splats from within a basic block.
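  //
  // A concrete sketch (made-up IR, splat constants abbreviated):
  //   %amt = select i1 %c, <4 x i32> <splat 2>, <4 x i32> <splat 3>
  //   %sh  = shl <4 x i32> %x, %amt
  // becomes
  //   %shT = shl <4 x i32> %x, <splat 2>
  //   %shF = shl <4 x i32> %x, <splat 3>
  //   %sh  = select i1 %c, <4 x i32> %shT, <4 x i32> %shF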
6468   Type *Ty = Shift->getType();
6469   if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
6470     return false;
6471   Value *Cond, *TVal, *FVal;
6472   if (!match(Shift->getOperand(1),
6473              m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
6474     return false;
6475   if (!isSplatValue(TVal) || !isSplatValue(FVal))
6476     return false;
6477 
6478   IRBuilder<> Builder(Shift);
6479   BinaryOperator::BinaryOps Opcode = Shift->getOpcode();
6480   Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);
6481   Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);
6482   Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
6483   Shift->replaceAllUsesWith(NewSel);
6484   Shift->eraseFromParent();
6485   return true;
6486 }
6487 
6488 bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
6489   Intrinsic::ID Opcode = Fsh->getIntrinsicID();
6490   assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
6491          "Expected a funnel shift");
6492 
6493   // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
6494   // than general vector shifts, and (3) the shift amount is select-of-splatted
6495   // values, hoist the funnel shifts before the select:
6496   //   fsh Op0, Op1, (select Cond, TVal, FVal) -->
6497   //   select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
6498   //
6499   // This is inverting a generic IR transform when we know that the cost of a
6500   // general vector shift is more than the cost of 2 shift-by-scalars.
6501   // We can't do this effectively in SDAG because we may not be able to
6502   // determine if the select operands are splats from within a basic block.
6503   Type *Ty = Fsh->getType();
6504   if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
6505     return false;
6506   Value *Cond, *TVal, *FVal;
6507   if (!match(Fsh->getOperand(2),
6508              m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
6509     return false;
6510   if (!isSplatValue(TVal) || !isSplatValue(FVal))
6511     return false;
6512 
6513   IRBuilder<> Builder(Fsh);
6514   Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);
6515   Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal });
6516   Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal });
6517   Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
6518   Fsh->replaceAllUsesWith(NewSel);
6519   Fsh->eraseFromParent();
6520   return true;
6521 }
6522 
6523 /// If we have a SelectInst that will likely profit from branch prediction,
6524 /// turn it into a branch.
6525 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
6526   if (DisableSelectToBranch)
6527     return false;
6528 
6529   // Find all consecutive select instructions that share the same condition.
6530   SmallVector<SelectInst *, 2> ASI;
6531   ASI.push_back(SI);
6532   for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
6533        It != SI->getParent()->end(); ++It) {
6534     SelectInst *I = dyn_cast<SelectInst>(&*It);
6535     if (I && SI->getCondition() == I->getCondition()) {
6536       ASI.push_back(I);
6537     } else {
6538       break;
6539     }
6540   }
6541 
6542   SelectInst *LastSI = ASI.back();
6543   // Increment the current iterator to skip the rest of the select instructions
6544   // because they will either all be lowered to branches or none of them will.
6545   CurInstIterator = std::next(LastSI->getIterator());
6546 
6547   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
6548 
6549   // Can we convert the 'select' to CF?
6550   if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))
6551     return false;
6552 
6553   TargetLowering::SelectSupportKind SelectKind;
6554   if (VectorCond)
6555     SelectKind = TargetLowering::VectorMaskSelect;
6556   else if (SI->getType()->isVectorTy())
6557     SelectKind = TargetLowering::ScalarCondVectorVal;
6558   else
6559     SelectKind = TargetLowering::ScalarValSelect;
6560 
6561   if (TLI->isSelectSupported(SelectKind) &&
6562       (!isFormingBranchFromSelectProfitable(TTI, TLI, SI) || OptSize ||
6563        llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())))
6564     return false;
6565 
6566   // The DominatorTree needs to be rebuilt by any consumers after this
6567   // transformation. We simply reset here rather than setting the ModifiedDT
6568   // flag to avoid restarting the function walk in runOnFunction for each
6569   // select optimized.
6570   DT.reset();
6571 
6572   // Transform a sequence like this:
6573   //    start:
6574   //       %cmp = cmp uge i32 %a, %b
6575   //       %sel = select i1 %cmp, i32 %c, i32 %d
6576   //
6577   // Into:
6578   //    start:
6579   //       %cmp = cmp uge i32 %a, %b
6580   //       %cmp.frozen = freeze %cmp
6581   //       br i1 %cmp.frozen, label %select.true, label %select.false
6582   //    select.true:
6583   //       br label %select.end
6584   //    select.false:
6585   //       br label %select.end
6586   //    select.end:
6587   //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
6588   //
6589   // %cmp should be frozen, otherwise it may introduce undefined behavior.
6590   // In addition, we may sink instructions that produce %c or %d from
6591   // the entry block into the destination(s) of the new branch.
6592   // If the true or false blocks do not contain a sunken instruction, that
6593   // block and its branch may be optimized away. In that case, one side of the
6594   // first branch will point directly to select.end, and the corresponding PHI
6595   // predecessor block will be the start block.
6596 
6597   // First, we split the block containing the select into 2 blocks.
6598   BasicBlock *StartBlock = SI->getParent();
6599   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
6600   BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
6601   BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
6602 
6603   // Delete the unconditional branch that was just created by the split.
6604   StartBlock->getTerminator()->eraseFromParent();
6605 
6606   // These are the new basic blocks for the conditional branch.
6607   // At least one will become an actual new basic block.
6608   BasicBlock *TrueBlock = nullptr;
6609   BasicBlock *FalseBlock = nullptr;
6610   BranchInst *TrueBranch = nullptr;
6611   BranchInst *FalseBranch = nullptr;
6612 
6613   // Sink expensive instructions into the conditional blocks to avoid executing
6614   // them speculatively.
6615   for (SelectInst *SI : ASI) {
6616     if (sinkSelectOperand(TTI, SI->getTrueValue())) {
6617       if (TrueBlock == nullptr) {
6618         TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
6619                                        EndBlock->getParent(), EndBlock);
6620         TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
6621         TrueBranch->setDebugLoc(SI->getDebugLoc());
6622       }
6623       auto *TrueInst = cast<Instruction>(SI->getTrueValue());
6624       TrueInst->moveBefore(TrueBranch);
6625     }
6626     if (sinkSelectOperand(TTI, SI->getFalseValue())) {
6627       if (FalseBlock == nullptr) {
6628         FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
6629                                         EndBlock->getParent(), EndBlock);
6630         FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6631         FalseBranch->setDebugLoc(SI->getDebugLoc());
6632       }
6633       auto *FalseInst = cast<Instruction>(SI->getFalseValue());
6634       FalseInst->moveBefore(FalseBranch);
6635     }
6636   }
6637 
6638   // If there was nothing to sink, then arbitrarily choose the 'false' side
6639   // for a new input value to the PHI.
6640   if (TrueBlock == FalseBlock) {
6641     assert(TrueBlock == nullptr &&
6642            "Unexpected basic block transform while optimizing select");
6643 
6644     FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
6645                                     EndBlock->getParent(), EndBlock);
6646     auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
6647     FalseBranch->setDebugLoc(SI->getDebugLoc());
6648   }
6649 
6650   // Insert the real conditional branch based on the original condition.
6651   // If we did not create a new block for one of the 'true' or 'false' paths
6652   // of the condition, it means that side of the branch goes to the end block
6653   // directly and the path originates from the start block from the point of
6654   // view of the new PHI.
6655   BasicBlock *TT, *FT;
6656   if (TrueBlock == nullptr) {
6657     TT = EndBlock;
6658     FT = FalseBlock;
6659     TrueBlock = StartBlock;
6660   } else if (FalseBlock == nullptr) {
6661     TT = TrueBlock;
6662     FT = EndBlock;
6663     FalseBlock = StartBlock;
6664   } else {
6665     TT = TrueBlock;
6666     FT = FalseBlock;
6667   }
6668   IRBuilder<> IB(SI);
6669   auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
6670   IB.CreateCondBr(CondFr, TT, FT, SI);
6671 
6672   SmallPtrSet<const Instruction *, 2> INS;
6673   INS.insert(ASI.begin(), ASI.end());
6674   // Use a reverse iterator because a later select may use the value of an
6675   // earlier select, and we need to propagate the value through the earlier
6676   // select to get the PHI operand.
6677   for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
6678     SelectInst *SI = *It;
6679     // The select itself is replaced with a PHI Node.
6680     PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
6681     PN->takeName(SI);
6682     PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
6683     PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
6684     PN->setDebugLoc(SI->getDebugLoc());
6685 
6686     SI->replaceAllUsesWith(PN);
6687     SI->eraseFromParent();
6688     INS.erase(SI);
6689     ++NumSelectsExpanded;
6690   }
6691 
6692   // Instruct OptimizeBlock to skip to the next block.
6693   CurInstIterator = StartBlock->end();
6694   return true;
6695 }
6696 
6697 /// Some targets only accept certain types for splat inputs. For example a VDUP
6698 /// in MVE takes a GPR (integer) register, and the instructions that
6699 /// incorporate a VDUP (such as a VADD qd, qm, rm) also require a GPR register.
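/// For example (illustrative types; assuming the target asks to splat as i32):
///   %ins = insertelement <4 x float> undef, float %f, i32 0
///   %spl = shufflevector <4 x float> %ins, <4 x float> undef,
///                        <4 x i32> zeroinitializer
/// is rewritten below as
///   %b    = bitcast float %f to i32
///   %ins2 = insertelement <4 x i32> undef, i32 %b, i64 0
///   %spl2 = shufflevector <4 x i32> %ins2, <4 x i32> undef,
///                         <4 x i32> zeroinitializer
///   %res  = bitcast <4 x i32> %spl2 to <4 x float>
/// so the splatted scalar lives in a GPR rather than a vector lane.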
6700 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
6701   if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
6702                             m_Undef(), m_ZeroMask())))
6703     return false;
6704   Type *NewType = TLI->shouldConvertSplatType(SVI);
6705   if (!NewType)
6706     return false;
6707 
6708   auto *SVIVecType = cast<FixedVectorType>(SVI->getType());
6709   assert(!NewType->isVectorTy() && "Expected a scalar type!");
6710   assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
6711          "Expected a type of the same size!");
6712   auto *NewVecType =
6713       FixedVectorType::get(NewType, SVIVecType->getNumElements());
6714 
6715   // Create a bitcast (shuffle (insert (bitcast(..))))
6716   IRBuilder<> Builder(SVI->getContext());
6717   Builder.SetInsertPoint(SVI);
6718   Value *BC1 = Builder.CreateBitCast(
6719       cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
6720   Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
6721                                               (uint64_t)0);
6722   Value *Shuffle = Builder.CreateShuffleVector(
6723       Insert, UndefValue::get(NewVecType), SVI->getShuffleMask());
6724   Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
6725 
6726   SVI->replaceAllUsesWith(BC2);
6727   RecursivelyDeleteTriviallyDeadInstructions(
6728       SVI, TLInfo, nullptr, [&](Value *V) { removeAllAssertingVHReferences(V); });
6729 
6730   // Also hoist the bitcast up to its operand if they are not in the same
6731   // block.
6732   if (auto *BCI = dyn_cast<Instruction>(BC1))
6733     if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
6734       if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
6735           !Op->isTerminator() && !Op->isEHPad())
6736         BCI->moveAfter(Op);
6737 
6738   return true;
6739 }
6740 
6741 bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
6742   // If the operands of I can be folded into a target instruction together with
6743   // I, duplicate and sink them.
6744   SmallVector<Use *, 4> OpsToSink;
6745   if (!TLI->shouldSinkOperands(I, OpsToSink))
6746     return false;
6747 
6748   // OpsToSink can contain multiple uses in a use chain (e.g.
6749   // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
6750   // uses must come first, so we process the ops in reverse order so as to not
6751   // create invalid IR.
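  // Illustrative example (hypothetical IR): if I is "add %x, %u2" in this
  // block while "%u1 = shufflevector ..." and "%u2 = zext %u1" live in a
  // different block, both %u1 and %u2 are cloned immediately before I, the
  // cloned zext is rewired to use the cloned shufflevector, I is updated to
  // use the cloned zext, and the originals are erased afterwards if they
  // become dead.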
6752   BasicBlock *TargetBB = I->getParent();
6753   bool Changed = false;
6754   SmallVector<Use *, 4> ToReplace;
6755   for (Use *U : reverse(OpsToSink)) {
6756     auto *UI = cast<Instruction>(U->get());
6757     if (UI->getParent() == TargetBB || isa<PHINode>(UI))
6758       continue;
6759     ToReplace.push_back(U);
6760   }
6761 
6762   SetVector<Instruction *> MaybeDead;
6763   DenseMap<Instruction *, Instruction *> NewInstructions;
6764   Instruction *InsertPoint = I;
6765   for (Use *U : ToReplace) {
6766     auto *UI = cast<Instruction>(U->get());
6767     Instruction *NI = UI->clone();
6768     NewInstructions[UI] = NI;
6769     MaybeDead.insert(UI);
6770     LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
6771     NI->insertBefore(InsertPoint);
6772     InsertPoint = NI;
6773     InsertedInsts.insert(NI);
6774 
6775     // Update the use for the new instruction, making sure that we update the
6776     // sunk instruction uses, if it is part of a chain that has already been
6777     // sunk.
6778     Instruction *OldI = cast<Instruction>(U->getUser());
6779     if (NewInstructions.count(OldI))
6780       NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
6781     else
6782       U->set(NI);
6783     Changed = true;
6784   }
6785 
6786   // Remove instructions that are dead after sinking.
6787   for (auto *I : MaybeDead) {
6788     if (!I->hasNUsesOrMore(1)) {
6789       LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
6790       I->eraseFromParent();
6791     }
6792   }
6793 
6794   return Changed;
6795 }
6796 
6797 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
6798   Value *Cond = SI->getCondition();
6799   Type *OldType = Cond->getType();
6800   LLVMContext &Context = Cond->getContext();
6801   MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
6802   unsigned RegWidth = RegType.getSizeInBits();
6803 
6804   if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
6805     return false;
6806 
6807   // If the register width is greater than the type width, expand the condition
6808   // of the switch instruction and each case constant to the width of the
6809   // register. By widening the type of the switch condition, subsequent
6810   // comparisons (for case comparisons) will not need to be extended to the
6811   // preferred register width, so we will potentially eliminate N-1 extends,
6812   // where N is the number of cases in the switch.
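  // For example (illustrative; assuming the target widens the i16 condition to
  // an i64 register):
  //   switch i16 %x, ... [ i16 10, ... ]
  // becomes
  //   %x.ext = zext i16 %x to i64
  //   switch i64 %x.ext, ... [ i64 10, ... ]
  // so each case comparison can use the full register without re-extending %x.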
6813   auto *NewType = Type::getIntNTy(Context, RegWidth);
6814 
6815   // Zero-extend the switch condition and case constants unless the switch
6816   // condition is a function argument that is already being sign-extended.
6817   // In that case, we can avoid an unnecessary mask/extension by sign-extending
6818   // everything instead.
6819   Instruction::CastOps ExtType = Instruction::ZExt;
6820   if (auto *Arg = dyn_cast<Argument>(Cond))
6821     if (Arg->hasSExtAttr())
6822       ExtType = Instruction::SExt;
6823 
6824   auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
6825   ExtInst->insertBefore(SI);
6826   ExtInst->setDebugLoc(SI->getDebugLoc());
6827   SI->setCondition(ExtInst);
6828   for (auto Case : SI->cases()) {
6829     APInt NarrowConst = Case.getCaseValue()->getValue();
6830     APInt WideConst = (ExtType == Instruction::ZExt) ?
6831                       NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
6832     Case.setValue(ConstantInt::get(Context, WideConst));
6833   }
6834 
6835   return true;
6836 }
6837 
6838 
6839 namespace {
6840 
6841 /// Helper class to promote a scalar operation to a vector one.
6842 /// This class is used to move an extractelement transition downward.
6843 /// E.g.,
6844 /// a = vector_op <2 x i32>
6845 /// b = extractelement <2 x i32> a, i32 0
6846 /// c = scalar_op b
6847 /// store c
6848 ///
6849 /// =>
6850 /// a = vector_op <2 x i32>
6851 /// c = vector_op a (equivalent to scalar_op on the related lane)
6852 /// * d = extractelement <2 x i32> c, i32 0
6853 /// * store d
6854 /// Assuming both extractelement and store can be combined, we get rid of the
6855 /// transition.
6856 class VectorPromoteHelper {
6857   /// DataLayout associated with the current module.
6858   const DataLayout &DL;
6859 
6860   /// Used to perform some checks on the legality of vector operations.
6861   const TargetLowering &TLI;
6862 
6863   /// Used to estimate the cost of the promoted chain.
6864   const TargetTransformInfo &TTI;
6865 
6866   /// The transition being moved downwards.
6867   Instruction *Transition;
6868 
6869   /// The sequence of instructions to be promoted.
6870   SmallVector<Instruction *, 4> InstsToBePromoted;
6871 
6872   /// Cost of combining a store and an extract.
6873   unsigned StoreExtractCombineCost;
6874 
6875   /// Instruction that will be combined with the transition.
6876   Instruction *CombineInst = nullptr;
6877 
6878   /// The instruction that represents the current end of the transition.
6879   /// Since we are faking the promotion until we reach the end of the chain
6880   /// of computation, we need a way to get the current end of the transition.
6881   Instruction *getEndOfTransition() const {
6882     if (InstsToBePromoted.empty())
6883       return Transition;
6884     return InstsToBePromoted.back();
6885   }
6886 
6887   /// Return the index of the original value in the transition.
6888   /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
6889   /// c, is at index 0.
6890   unsigned getTransitionOriginalValueIdx() const {
6891     assert(isa<ExtractElementInst>(Transition) &&
6892            "Other kind of transitions are not supported yet");
6893     return 0;
6894   }
6895 
6896   /// Return the index of the index in the transition.
6897   /// E.g., for "extractelement <2 x i32> c, i32 0" the index
6898   /// is at index 1.
6899   unsigned getTransitionIdx() const {
6900     assert(isa<ExtractElementInst>(Transition) &&
6901            "Other kind of transitions are not supported yet");
6902     return 1;
6903   }
6904 
6905   /// Get the type of the transition.
6906   /// This is the type of the original value.
6907   /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
6908   /// transition is <2 x i32>.
6909   Type *getTransitionType() const {
6910     return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
6911   }
6912 
6913   /// Promote \p ToBePromoted by moving \p Def downward through it.
6914   /// I.e., we have the following sequence:
6915   /// Def = Transition <ty1> a to <ty2>
6916   /// b = ToBePromoted <ty2> Def, ...
6917   /// =>
6918   /// b = ToBePromoted <ty1> a, ...
6919   /// Def = Transition <ty1> ToBePromoted to <ty2>
6920   void promoteImpl(Instruction *ToBePromoted);
6921 
6922   /// Check whether or not it is profitable to promote all the
6923   /// instructions enqueued to be promoted.
6924   bool isProfitableToPromote() {
6925     Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
6926     unsigned Index = isa<ConstantInt>(ValIdx)
6927                          ? cast<ConstantInt>(ValIdx)->getZExtValue()
6928                          : -1;
6929     Type *PromotedType = getTransitionType();
6930 
6931     StoreInst *ST = cast<StoreInst>(CombineInst);
6932     unsigned AS = ST->getPointerAddressSpace();
6933     unsigned Align = ST->getAlignment();
6934     // Check if this store is supported.
6935     if (!TLI.allowsMisalignedMemoryAccesses(
6936             TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
6937             Align)) {
6938       // If this is not supported, there is no way we can combine
6939       // the extract with the store.
6940       return false;
6941     }
6942 
6943     // The scalar chain of computation has to pay for the transition
6944     // scalar to vector.
6945     // The vector chain has to account for the combining cost.
6946     uint64_t ScalarCost =
6947         TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
6948     uint64_t VectorCost = StoreExtractCombineCost;
6949     enum TargetTransformInfo::TargetCostKind CostKind =
6950       TargetTransformInfo::TCK_RecipThroughput;
6951     for (const auto &Inst : InstsToBePromoted) {
6952       // Compute the cost.
6953       // By construction, all instructions being promoted are arithmetic ones.
6954       // Moreover, one argument is a constant that can be viewed as a splat
6955       // constant.
6956       Value *Arg0 = Inst->getOperand(0);
6957       bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
6958                             isa<ConstantFP>(Arg0);
6959       TargetTransformInfo::OperandValueKind Arg0OVK =
6960           IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
6961                          : TargetTransformInfo::OK_AnyValue;
6962       TargetTransformInfo::OperandValueKind Arg1OVK =
6963           !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
6964                           : TargetTransformInfo::OK_AnyValue;
6965       ScalarCost += TTI.getArithmeticInstrCost(
6966           Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK);
6967       VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
6968                                                CostKind,
6969                                                Arg0OVK, Arg1OVK);
6970     }
6971     LLVM_DEBUG(
6972         dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
6973                << ScalarCost << "\nVector: " << VectorCost << '\n');
6974     return ScalarCost > VectorCost;
6975   }
6976 
6977   /// Generate a constant vector with \p Val with the same
6978   /// number of elements as the transition.
6979   /// \p UseSplat defines whether or not \p Val should be replicated
6980   /// across the whole vector.
6981   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
6982   /// otherwise we generate a vector with as many undef as possible:
6983   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
6984   /// used at the index of the extract.
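  /// E.g. (illustrative), for a <4 x i32> transition that extracts index 2 and
  /// Val == 7:
  ///   UseSplat == true  -> <i32 7, i32 7, i32 7, i32 7>
  ///   UseSplat == false -> <i32 undef, i32 undef, i32 7, i32 undef>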
6985   Value *getConstantVector(Constant *Val, bool UseSplat) const {
6986     unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
6987     if (!UseSplat) {
6988       // If we cannot determine where the constant must be, we have to
6989       // use a splat constant.
6990       Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
6991       if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
6992         ExtractIdx = CstVal->getSExtValue();
6993       else
6994         UseSplat = true;
6995     }
6996 
6997     ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount();
6998     if (UseSplat)
6999       return ConstantVector::getSplat(EC, Val);
7000 
7001     if (!EC.isScalable()) {
7002       SmallVector<Constant *, 4> ConstVec;
7003       UndefValue *UndefVal = UndefValue::get(Val->getType());
7004       for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) {
7005         if (Idx == ExtractIdx)
7006           ConstVec.push_back(Val);
7007         else
7008           ConstVec.push_back(UndefVal);
7009       }
7010       return ConstantVector::get(ConstVec);
7011     } else
7012       llvm_unreachable(
7013           "Generate scalable vector for non-splat is unimplemented");
7014   }
7015 
7016   /// Check if promoting to a vector type an operand at \p OperandIdx
7017   /// in \p Use can trigger undefined behavior.
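  /// E.g. (illustrative), promoting the divisor of "udiv i32 %a, %x" to a
  /// vector padded with undef lanes could introduce a division by undef, so
  /// the right-hand side (operand index 1) of division-like instructions is
  /// flagged here.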
7018   static bool canCauseUndefinedBehavior(const Instruction *Use,
7019                                         unsigned OperandIdx) {
7020     // It is not safe to introduce undef when the operand is on
7021     // the right hand side of a division-like instruction.
7022     if (OperandIdx != 1)
7023       return false;
7024     switch (Use->getOpcode()) {
7025     default:
7026       return false;
7027     case Instruction::SDiv:
7028     case Instruction::UDiv:
7029     case Instruction::SRem:
7030     case Instruction::URem:
7031       return true;
7032     case Instruction::FDiv:
7033     case Instruction::FRem:
7034       return !Use->hasNoNaNs();
7035     }
7036     llvm_unreachable(nullptr);
7037   }
7038 
7039 public:
7040   VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
7041                       const TargetTransformInfo &TTI, Instruction *Transition,
7042                       unsigned CombineCost)
7043       : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
7044         StoreExtractCombineCost(CombineCost) {
7045     assert(Transition && "Do not know how to promote null");
7046   }
7047 
7048   /// Check if we can promote \p ToBePromoted to \p Type.
7049   bool canPromote(const Instruction *ToBePromoted) const {
7050     // We could support CastInst too.
7051     return isa<BinaryOperator>(ToBePromoted);
7052   }
7053 
7054   /// Check if it is profitable to promote \p ToBePromoted
7055   /// by moving the transition downward through it.
7056   bool shouldPromote(const Instruction *ToBePromoted) const {
7057     // Promote only if all the operands can be statically expanded.
7058     // Indeed, we do not want to introduce any new kind of transitions.
7059     for (const Use &U : ToBePromoted->operands()) {
7060       const Value *Val = U.get();
7061       if (Val == getEndOfTransition()) {
7062         // If the use is a division and the transition is on the rhs,
7063         // we cannot promote the operation, otherwise we may create a
7064         // division by zero.
7065         if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
7066           return false;
7067         continue;
7068       }
7069       if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
7070           !isa<ConstantFP>(Val))
7071         return false;
7072     }
7073     // Check that the resulting operation is legal.
7074     int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
7075     if (!ISDOpcode)
7076       return false;
7077     return StressStoreExtract ||
7078            TLI.isOperationLegalOrCustom(
7079                ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
7080   }
7081 
7082   /// Check whether or not \p Use can be combined
7083   /// with the transition.
7084   /// I.e., is it possible to do Use(Transition) => AnotherUse?
7085   bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
7086 
7087   /// Record \p ToBePromoted as part of the chain to be promoted.
7088   void enqueueForPromotion(Instruction *ToBePromoted) {
7089     InstsToBePromoted.push_back(ToBePromoted);
7090   }
7091 
7092   /// Set the instruction that will be combined with the transition.
7093   void recordCombineInstruction(Instruction *ToBeCombined) {
7094     assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
7095     CombineInst = ToBeCombined;
7096   }
7097 
7098   /// Promote all the instructions enqueued for promotion if it is
7099   /// profitable.
7100   /// \return True if the promotion happened, false otherwise.
7101   bool promote() {
7102     // Check if there is something to promote.
7103     // Right now, if we do not have anything to combine with,
7104     // we assume the promotion is not profitable.
7105     if (InstsToBePromoted.empty() || !CombineInst)
7106       return false;
7107 
7108     // Check cost.
7109     if (!StressStoreExtract && !isProfitableToPromote())
7110       return false;
7111 
7112     // Promote.
7113     for (auto &ToBePromoted : InstsToBePromoted)
7114       promoteImpl(ToBePromoted);
7115     InstsToBePromoted.clear();
7116     return true;
7117   }
7118 };
7119 
7120 } // end anonymous namespace
7121 
7122 void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
7123   // At this point, we know that all the operands of ToBePromoted but Def
7124   // can be statically promoted.
7125   // For Def, we need to use its parameter in ToBePromoted:
7126   // b = ToBePromoted ty1 a
7127   // Def = Transition ty1 b to ty2
7128   // Move the transition down.
7129   // 1. Replace all uses of the promoted operation by the transition.
7130   // = ... b => = ... Def.
7131   assert(ToBePromoted->getType() == Transition->getType() &&
7132          "The type of the result of the transition does not match "
7133          "the final type");
7134   ToBePromoted->replaceAllUsesWith(Transition);
7135   // 2. Update the type of the uses.
7136   // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
7137   Type *TransitionTy = getTransitionType();
7138   ToBePromoted->mutateType(TransitionTy);
7139   // 3. Update all the operands of the promoted operation with promoted
7140   // operands.
7141   // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
7142   for (Use &U : ToBePromoted->operands()) {
7143     Value *Val = U.get();
7144     Value *NewVal = nullptr;
7145     if (Val == Transition)
7146       NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
7147     else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
7148              isa<ConstantFP>(Val)) {
7149       // Use a splat constant if it is not safe to use undef.
7150       NewVal = getConstantVector(
7151           cast<Constant>(Val),
7152           isa<UndefValue>(Val) ||
7153               canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
7154     } else
7155       llvm_unreachable("Did you modify shouldPromote and forget to update "
7156                        "this?");
7157     ToBePromoted->setOperand(U.getOperandNo(), NewVal);
7158   }
7159   Transition->moveAfter(ToBePromoted);
7160   Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
7161 }
7162 
7163 /// Some targets can do store(extractelement) with one instruction.
7164 /// Try to push the extractelement towards the stores when the target
7165 /// has this feature and this is profitable.
7166 bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
7167   unsigned CombineCost = std::numeric_limits<unsigned>::max();
7168   if (DisableStoreExtract ||
7169       (!StressStoreExtract &&
7170        !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
7171                                        Inst->getOperand(1), CombineCost)))
7172     return false;
7173 
7174   // At this point we know that Inst is a vector to scalar transition.
7175   // Try to move it down the def-use chain, until:
7176   // - We can combine the transition with its single use
7177   //   => we got rid of the transition.
7178   // - We escape the current basic block
7179   //   => we would need to check that we are moving it at a cheaper place and
7180   //      we do not do that for now.
7181   BasicBlock *Parent = Inst->getParent();
7182   LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
7183   VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
7184   // If the transition has more than one use, assume this is not going to be
7185   // beneficial.
7186   while (Inst->hasOneUse()) {
7187     Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
7188     LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
7189 
7190     if (ToBePromoted->getParent() != Parent) {
7191       LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
7192                         << ToBePromoted->getParent()->getName()
7193                         << ") than the transition (" << Parent->getName()
7194                         << ").\n");
7195       return false;
7196     }
7197 
7198     if (VPH.canCombine(ToBePromoted)) {
7199       LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
7200                         << "will be combined with: " << *ToBePromoted << '\n');
7201       VPH.recordCombineInstruction(ToBePromoted);
7202       bool Changed = VPH.promote();
7203       NumStoreExtractExposed += Changed;
7204       return Changed;
7205     }
7206 
7207     LLVM_DEBUG(dbgs() << "Try promoting.\n");
7208     if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
7209       return false;
7210 
7211     LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
7212 
7213     VPH.enqueueForPromotion(ToBePromoted);
7214     Inst = ToBePromoted;
7215   }
7216   return false;
7217 }
7218 
7219 /// For the instruction sequence of store below, F and I values
7220 /// are bundled together as an i64 value before being stored into memory.
7221 /// Sometimes it is more efficient to generate separate stores for F and I,
7222 /// which can remove the bitwise instructions or sink them to colder places.
7223 ///
7224 ///   (store (or (zext (bitcast F to i32) to i64),
7225 ///              (shl (zext I to i64), 32)), addr)  -->
7226 ///   (store F, addr) and (store I, addr+4)
7227 ///
7228 /// Similarly, splitting other merged stores can also be beneficial, like:
7229 /// For pair of {i32, i32}, i64 store --> two i32 stores.
7230 /// For pair of {i32, i16}, i64 store --> two i32 stores.
7231 /// For pair of {i16, i16}, i32 store --> two i16 stores.
7232 /// For pair of {i16, i8},  i32 store --> two i16 stores.
7233 /// For pair of {i8, i8},   i16 store --> two i8 stores.
7234 ///
7235 /// We allow each target to determine specifically which kind of splitting is
7236 /// supported.
7237 ///
7238 /// The store patterns are commonly seen from the simple code snippet below
7239 /// if only std::make_pair(...) is SROA-transformed before being inlined into hoo.
7240 ///   void goo(const std::pair<int, float> &);
7241 ///   hoo() {
7242 ///     ...
7243 ///     goo(std::make_pair(tmp, ftmp));
7244 ///     ...
7245 ///   }
7246 ///
7247 /// Although we already have similar splitting in DAG Combine, we duplicate
7248 /// it in CodeGenPrepare to catch the case in which the pattern spans
7249 /// multiple BBs. The logic in DAG Combine is kept to catch cases generated
7250 /// during code expansion.
7251 static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
7252                                 const TargetLowering &TLI) {
7253   // Handle simple but common cases only.
7254   Type *StoreType = SI.getValueOperand()->getType();
7255 
7256   // The code below assumes shifting a value by <number of bits>,
7257   // whereas scalable vectors would have to be shifted by
7258   // <log2(vscale) + number of bits> in order to store the
7259   // low/high parts. Bailing out for now.
7260   if (isa<ScalableVectorType>(StoreType))
7261     return false;
7262 
7263   if (!DL.typeSizeEqualsStoreSize(StoreType) ||
7264       DL.getTypeSizeInBits(StoreType) == 0)
7265     return false;
7266 
7267   unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
7268   Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
7269   if (!DL.typeSizeEqualsStoreSize(SplitStoreType))
7270     return false;
7271 
7272   // Don't split the store if it is volatile.
7273   if (SI.isVolatile())
7274     return false;
7275 
7276   // Match the following patterns:
7277   // (store (or (zext LValue to i64),
7278   //            (shl (zext HValue to i64), HalfValBitSize)), addr)
7279   //  or
7280   // (store (or (shl (zext HValue to i64), HalfValBitSize),
7281   //            (zext LValue to i64)), addr)
7282   // Expect both operands of the OR and the first operand of the SHL to
7283   // have only one use.
7284   Value *LValue, *HValue;
7285   if (!match(SI.getValueOperand(),
7286              m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
7287                     m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
7288                                    m_SpecificInt(HalfValBitSize))))))
7289     return false;
7290 
7291   // Check that LValue and HValue are integers with size at most HalfValBitSize.
7292   if (!LValue->getType()->isIntegerTy() ||
7293       DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
7294       !HValue->getType()->isIntegerTy() ||
7295       DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
7296     return false;
7297 
7298   // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
7299   // as the input of target query.
7300   auto *LBC = dyn_cast<BitCastInst>(LValue);
7301   auto *HBC = dyn_cast<BitCastInst>(HValue);
7302   EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
7303                   : EVT::getEVT(LValue->getType());
7304   EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
7305                    : EVT::getEVT(HValue->getType());
7306   if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
7307     return false;
7308 
7309   // Start to split store.
7310   IRBuilder<> Builder(SI.getContext());
7311   Builder.SetInsertPoint(&SI);
7312 
7313   // If LValue/HValue is a bitcast in another BB, create a new one in the
7314   // current BB so it may be merged with the split stores by the DAG combiner.
7315   if (LBC && LBC->getParent() != SI.getParent())
7316     LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
7317   if (HBC && HBC->getParent() != SI.getParent())
7318     HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
7319 
7320   bool IsLE = SI.getModule()->getDataLayout().isLittleEndian();
7321   auto CreateSplitStore = [&](Value *V, bool Upper) {
7322     V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
7323     Value *Addr = Builder.CreateBitCast(
7324         SI.getOperand(1),
7325         SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
7326     Align Alignment = SI.getAlign();
7327     const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
7328     if (IsOffsetStore) {
7329       Addr = Builder.CreateGEP(
7330           SplitStoreType, Addr,
7331           ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
7332 
7333       // When splitting the store in half, naturally one half will retain the
7334       // alignment of the original wider store, regardless of whether it was
7335       // over-aligned or not, while the other will require adjustment.
7336       Alignment = commonAlignment(Alignment, HalfValBitSize / 8);
7337     }
7338     Builder.CreateAlignedStore(V, Addr, Alignment);
7339   };
7340 
7341   CreateSplitStore(LValue, false);
7342   CreateSplitStore(HValue, true);
7343 
7344   // Delete the old store.
7345   SI.eraseFromParent();
7346   return true;
7347 }
7348 
7349 // Return true if the GEP has two operands, the first operand is of a sequential
7350 // type, and the second operand is a constant.
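// E.g. (illustrative), "getelementptr i32, i32* %p, i64 4" qualifies, whereas a
// GEP that indexes into a struct field or uses a non-constant index does not.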
7351 static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
7352   gep_type_iterator I = gep_type_begin(*GEP);
7353   return GEP->getNumOperands() == 2 &&
7354       I.isSequential() &&
7355       isa<ConstantInt>(GEP->getOperand(1));
7356 }
7357 
7358 // Try unmerging GEPs to reduce liveness interference (register pressure) across
7359 // IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
7360 // reducing liveness interference across those edges benefits global register
7361 // allocation. Currently handles only certain cases.
7362 //
7363 // For example, unmerge %GEPI and %UGEPI as below.
7364 //
7365 // ---------- BEFORE ----------
7366 // SrcBlock:
7367 //   ...
7368 //   %GEPIOp = ...
7369 //   ...
7370 //   %GEPI = gep %GEPIOp, Idx
7371 //   ...
7372 //   indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
7373 //   (* %GEPI is alive on the indirectbr edges due to other uses ahead)
7374 //   (* %GEPIOp is alive on the indirectbr edges only because it's used by
7375 //   %UGEPI)
7376 //
7377 // DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
7378 // DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
7379 // ...
7380 //
7381 // DstBi:
7382 //   ...
7383 //   %UGEPI = gep %GEPIOp, UIdx
7384 // ...
7385 // ---------------------------
7386 //
7387 // ---------- AFTER ----------
7388 // SrcBlock:
7389 //   ... (same as above)
7390 //    (* %GEPI is still alive on the indirectbr edges)
7391 //    (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
7392 //    unmerging)
7393 // ...
7394 //
7395 // DstBi:
7396 //   ...
7397 //   %UGEPI = gep %GEPI, (UIdx-Idx)
7398 //   ...
7399 // ---------------------------
7400 //
7401 // The register pressure on the IndirectBr edges is reduced because %GEPIOp is
7402 // no longer alive on them.
7403 //
7404 // We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
7405 // of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as
7406 // not to disable further simplifications and optimizations as a result of GEP
7407 // merging.
7408 //
7409 // Note this unmerging may increase the length of the data flow critical path
7410 // (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
7411 // between the register pressure and the length of data-flow critical
7412 // path. Restricting this to the uncommon IndirectBr case would minimize the
7413 // impact of potentially longer critical path, if any, and the impact on compile
7414 // time.
7415 static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
7416                                              const TargetTransformInfo *TTI) {
7417   BasicBlock *SrcBlock = GEPI->getParent();
7418   // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
7419   // (non-IndirectBr) cases exit early here.
7420   if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
7421     return false;
7422   // Check that GEPI is a simple gep with a single constant index.
7423   if (!GEPSequentialConstIndexed(GEPI))
7424     return false;
7425   ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
7426   // Check that GEPI is a cheap one.
7427   if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
7428                          TargetTransformInfo::TCK_SizeAndLatency)
7429       > TargetTransformInfo::TCC_Basic)
7430     return false;
7431   Value *GEPIOp = GEPI->getOperand(0);
7432   // Check that GEPIOp is an instruction that's also defined in SrcBlock.
7433   if (!isa<Instruction>(GEPIOp))
7434     return false;
7435   auto *GEPIOpI = cast<Instruction>(GEPIOp);
7436   if (GEPIOpI->getParent() != SrcBlock)
7437     return false;
7438   // Check that GEP is used outside the block, meaning it's alive on the
7439   // IndirectBr edge(s).
7440   if (find_if(GEPI->users(), [&](User *Usr) {
7441         if (auto *I = dyn_cast<Instruction>(Usr)) {
7442           if (I->getParent() != SrcBlock) {
7443             return true;
7444           }
7445         }
7446         return false;
7447       }) == GEPI->users().end())
7448     return false;
7449   // The second elements of the GEP chains to be unmerged.
7450   std::vector<GetElementPtrInst *> UGEPIs;
7451   // Check each user of GEPIOp to see if unmerging would make GEPIOp not alive
7452   // on IndirectBr edges.
7453   for (User *Usr : GEPIOp->users()) {
7454     if (Usr == GEPI) continue;
7455     // Check if Usr is an Instruction. If not, give up.
7456     if (!isa<Instruction>(Usr))
7457       return false;
7458     auto *UI = cast<Instruction>(Usr);
7459     // If Usr is in the same block as GEPIOp, that's fine; skip it.
7460     if (UI->getParent() == SrcBlock)
7461       continue;
7462     // Check if Usr is a GEP. If not, give up.
7463     if (!isa<GetElementPtrInst>(Usr))
7464       return false;
7465     auto *UGEPI = cast<GetElementPtrInst>(Usr);
7466     // Check if UGEPI is a simple gep with a single constant index and GEPIOp is
7467     // the pointer operand to it. If so, record it in the vector. If not, give
7468     // up.
7469     if (!GEPSequentialConstIndexed(UGEPI))
7470       return false;
7471     if (UGEPI->getOperand(0) != GEPIOp)
7472       return false;
7473     if (GEPIIdx->getType() !=
7474         cast<ConstantInt>(UGEPI->getOperand(1))->getType())
7475       return false;
7476     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7477     if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
7478                            TargetTransformInfo::TCK_SizeAndLatency)
7479         > TargetTransformInfo::TCC_Basic)
7480       return false;
7481     UGEPIs.push_back(UGEPI);
7482   }
7483   if (UGEPIs.size() == 0)
7484     return false;
7485   // Check the materializing cost of (Uidx-Idx).
7486   for (GetElementPtrInst *UGEPI : UGEPIs) {
7487     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7488     APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
7489     unsigned ImmCost =
7490       TTI->getIntImmCost(NewIdx, GEPIIdx->getType(),
7491                          TargetTransformInfo::TCK_SizeAndLatency);
7492     if (ImmCost > TargetTransformInfo::TCC_Basic)
7493       return false;
7494   }
7495   // Now unmerge between GEPI and UGEPIs.
7496   for (GetElementPtrInst *UGEPI : UGEPIs) {
7497     UGEPI->setOperand(0, GEPI);
7498     ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
7499     Constant *NewUGEPIIdx =
7500         ConstantInt::get(GEPIIdx->getType(),
7501                          UGEPIIdx->getValue() - GEPIIdx->getValue());
7502     UGEPI->setOperand(1, NewUGEPIIdx);
7503     // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
7504     // inbounds to avoid UB.
7505     if (!GEPI->isInBounds()) {
7506       UGEPI->setIsInBounds(false);
7507     }
7508   }
7509   // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
7510   // alive on IndirectBr edges).
7511   assert(find_if(GEPIOp->users(), [&](User *Usr) {
7512         return cast<Instruction>(Usr)->getParent() != SrcBlock;
7513       }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
7514   return true;
7515 }
7516 
7517 bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
7518   // Bail out if we inserted the instruction to prevent optimizations from
7519   // stepping on each other's toes.
7520   if (InsertedInsts.count(I))
7521     return false;
7522 
7523   // TODO: Move into the switch on opcode below here.
7524   if (PHINode *P = dyn_cast<PHINode>(I)) {
7525     // It is possible for very late stage optimizations (such as SimplifyCFG)
7526     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
7527     // trivial PHI, go ahead and zap it here.
7528     if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
7529       LargeOffsetGEPMap.erase(P);
7530       P->replaceAllUsesWith(V);
7531       P->eraseFromParent();
7532       ++NumPHIsElim;
7533       return true;
7534     }
7535     return false;
7536   }
7537 
7538   if (CastInst *CI = dyn_cast<CastInst>(I)) {
7539     // If the source of the cast is a constant, then this should have
7540     // already been constant folded.  The only reason NOT to constant fold
7541     // it is if something (e.g. LSR) was careful to place the constant
7542     // evaluation in a block other than the one that uses it (e.g. to hoist
7543     // the address of globals out of a loop).  If this is the case, we don't
7544     // want to forward-subst the cast.
7545     if (isa<Constant>(CI->getOperand(0)))
7546       return false;
7547 
7548     if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
7549       return true;
7550 
7551     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
7552       /// Sink a zext or sext into its user blocks if the target type doesn't
7553       /// fit in one register
7554       if (TLI->getTypeAction(CI->getContext(),
7555                              TLI->getValueType(*DL, CI->getType())) ==
7556           TargetLowering::TypeExpandInteger) {
7557         return SinkCast(CI);
7558       } else {
7559         bool MadeChange = optimizeExt(I);
7560         return MadeChange | optimizeExtUses(I);
7561       }
7562     }
7563     return false;
7564   }
7565 
7566   if (auto *Cmp = dyn_cast<CmpInst>(I))
7567     if (optimizeCmp(Cmp, ModifiedDT))
7568       return true;
7569 
7570   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
7571     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
7572     bool Modified = optimizeLoadExt(LI);
7573     unsigned AS = LI->getPointerAddressSpace();
7574     Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
7575     return Modified;
7576   }
7577 
7578   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
7579     if (splitMergedValStore(*SI, *DL, *TLI))
7580       return true;
7581     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
7582     unsigned AS = SI->getPointerAddressSpace();
7583     return optimizeMemoryInst(I, SI->getOperand(1),
7584                               SI->getOperand(0)->getType(), AS);
7585   }
7586 
7587   if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
7588       unsigned AS = RMW->getPointerAddressSpace();
7589       return optimizeMemoryInst(I, RMW->getPointerOperand(),
7590                                 RMW->getType(), AS);
7591   }
7592 
7593   if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
7594       unsigned AS = CmpX->getPointerAddressSpace();
7595       return optimizeMemoryInst(I, CmpX->getPointerOperand(),
7596                                 CmpX->getCompareOperand()->getType(), AS);
7597   }
7598 
7599   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
7600 
7601   if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking)
7602     return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
7603 
7604   // TODO: Move this into the switch on opcode - it handles shifts already.
7605   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
7606                 BinOp->getOpcode() == Instruction::LShr)) {
7607     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
7608     if (CI && TLI->hasExtractBitsInsn())
7609       if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
7610         return true;
7611   }
7612 
7613   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
7614     if (GEPI->hasAllZeroIndices()) {
7615       /// The GEP operand must be a pointer, so must its result -> BitCast
7616       Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
7617                                         GEPI->getName(), GEPI);
7618       NC->setDebugLoc(GEPI->getDebugLoc());
7619       GEPI->replaceAllUsesWith(NC);
7620       GEPI->eraseFromParent();
7621       ++NumGEPsElim;
7622       optimizeInst(NC, ModifiedDT);
7623       return true;
7624     }
7625     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
7626       return true;
7627     }
7628     return false;
7629   }
7630 
7631   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
7632     // freeze(icmp a, const) -> icmp (freeze a), const
7633     // This helps generate efficient conditional jumps.
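    // For example (illustrative names):
    //   %c = icmp eq i32 %x, 0         %x.fr = freeze i32 %x
    //   %f = freeze i1 %c        =>    %c    = icmp eq i32 %x.fr, 0
    // after which all uses of %f are replaced by %c (the new freeze takes the
    // old instruction's name).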
7634     Instruction *CmpI = nullptr;
7635     if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
7636       CmpI = II;
7637     else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
7638       CmpI = F->getFastMathFlags().none() ? F : nullptr;
7639 
7640     if (CmpI && CmpI->hasOneUse()) {
7641       auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
7642       bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
7643                     isa<ConstantPointerNull>(Op0);
7644       bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
7645                     isa<ConstantPointerNull>(Op1);
7646       if (Const0 || Const1) {
7647         if (!Const0 || !Const1) {
7648           auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
7649           F->takeName(FI);
7650           CmpI->setOperand(Const0 ? 1 : 0, F);
7651         }
7652         FI->replaceAllUsesWith(CmpI);
7653         FI->eraseFromParent();
7654         return true;
7655       }
7656     }
7657     return false;
7658   }
7659 
7660   if (tryToSinkFreeOperands(I))
7661     return true;
7662 
7663   switch (I->getOpcode()) {
7664   case Instruction::Shl:
7665   case Instruction::LShr:
7666   case Instruction::AShr:
7667     return optimizeShiftInst(cast<BinaryOperator>(I));
7668   case Instruction::Call:
7669     return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
7670   case Instruction::Select:
7671     return optimizeSelectInst(cast<SelectInst>(I));
7672   case Instruction::ShuffleVector:
7673     return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
7674   case Instruction::Switch:
7675     return optimizeSwitchInst(cast<SwitchInst>(I));
7676   case Instruction::ExtractElement:
7677     return optimizeExtractElementInst(cast<ExtractElementInst>(I));
7678   }
7679 
7680   return false;
7681 }
7682 
7683 /// Check whether this instruction is the root of a bitreverse
7684 /// idiom. If so, insert the new intrinsic and return true.
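/// E.g. (illustrative), an OR-rooted tree of shifts and masks that reverses the
/// bits of an i32 is replaced by a single call to @llvm.bitreverse.i32, provided
/// the target supports ISD::BITREVERSE for that type.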
7685 bool CodeGenPrepare::makeBitReverse(Instruction &I) {
7686   if (!I.getType()->isIntegerTy() ||
7687       !TLI->isOperationLegalOrCustom(ISD::BITREVERSE,
7688                                      TLI->getValueType(*DL, I.getType(), true)))
7689     return false;
7690 
7691   SmallVector<Instruction*, 4> Insts;
7692   if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
7693     return false;
7694   Instruction *LastInst = Insts.back();
7695   I.replaceAllUsesWith(LastInst);
7696   RecursivelyDeleteTriviallyDeadInstructions(
7697       &I, TLInfo, nullptr, [&](Value *V) { removeAllAssertingVHReferences(V); });
7698   return true;
7699 }
7700 
7701 // In this pass we look for GEP and cast instructions that are used
7702 // across basic blocks and rewrite them to improve basic-block-at-a-time
7703 // selection.
7704 bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
7705   SunkAddrs.clear();
7706   bool MadeChange = false;
7707 
7708   CurInstIterator = BB.begin();
7709   while (CurInstIterator != BB.end()) {
7710     MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
7711     if (ModifiedDT)
7712       return true;
7713   }
7714 
7715   bool MadeBitReverse = true;
7716   while (MadeBitReverse) {
7717     MadeBitReverse = false;
7718     for (auto &I : reverse(BB)) {
7719       if (makeBitReverse(I)) {
7720         MadeBitReverse = MadeChange = true;
7721         break;
7722       }
7723     }
7724   }
7725   MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);
7726 
7727   return MadeChange;
7728 }
7729 
7730 // Some CGP optimizations may move or alter what's computed in a block. Check
7731 // whether a dbg.value intrinsic could be pointed at a more appropriate operand.
7732 bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
7733   assert(isa<DbgValueInst>(I));
7734   DbgValueInst &DVI = *cast<DbgValueInst>(I);
7735 
7736   // Does this dbg.value refer to a sunk address calculation?
7737   Value *Location = DVI.getVariableLocation();
7738   WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
7739   Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
7740   if (SunkAddr) {
7741     // Point dbg.value at locally computed address, which should give the best
7742     // opportunity to be accurately lowered. This update may change the type of
7743     // pointer being referred to; however this makes no difference to debugging
7744     // information, and we can't generate bitcasts that may affect codegen.
7745     DVI.setOperand(0, MetadataAsValue::get(DVI.getContext(),
7746                                            ValueAsMetadata::get(SunkAddr)));
7747     return true;
7748   }
7749   return false;
7750 }
7751 
7752 // An llvm.dbg.value may be using a value before its definition, due to
7753 // optimizations in this pass and others. Scan for such dbg.values, and rescue
7754 // them by moving the dbg.value to immediately after the value definition.
7755 // FIXME: Ideally this should never be necessary, and this has the potential
7756 // to re-order dbg.value intrinsics.
7757 bool CodeGenPrepare::placeDbgValues(Function &F) {
7758   bool MadeChange = false;
7759   DominatorTree DT(F);
7760 
7761   for (BasicBlock &BB : F) {
7762     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
7763       Instruction *Insn = &*BI++;
7764       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
7765       if (!DVI)
7766         continue;
7767 
7768       Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
7769 
7770       if (!VI || VI->isTerminator())
7771         continue;
7772 
7773       // If VI is a phi in a block with an EHPad terminator, we can't insert
7774       // after it.
7775       if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
7776         continue;
7777 
7778       // If the defining instruction dominates the dbg.value, we do not need
7779       // to move the dbg.value.
7780       if (DT.dominates(VI, DVI))
7781         continue;
7782 
7783       LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
7784                         << *DVI << ' ' << *VI);
7785       DVI->removeFromParent();
7786       if (isa<PHINode>(VI))
7787         DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
7788       else
7789         DVI->insertAfter(VI);
7790       MadeChange = true;
7791       ++NumDbgValueMoved;
7792     }
7793   }
7794   return MadeChange;
7795 }
7796 
7797 /// Scale down both weights to fit into uint32_t.
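/// E.g. (illustrative), NewTrue = 2^33 and NewFalse = 2^32 give Scale = 3,
/// yielding 2863311530 and 1431655765, both of which fit into uint32_t.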
7798 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
7799   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
7800   uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
7801   NewTrue = NewTrue / Scale;
7802   NewFalse = NewFalse / Scale;
7803 }
7804 
7805 /// Some targets prefer to split a conditional branch like:
7806 /// \code
7807 ///   %0 = icmp ne i32 %a, 0
7808 ///   %1 = icmp ne i32 %b, 0
7809 ///   %or.cond = or i1 %0, %1
7810 ///   br i1 %or.cond, label %TrueBB, label %FalseBB
7811 /// \endcode
7812 /// into multiple branch instructions like:
7813 /// \code
7814 ///   bb1:
7815 ///     %0 = icmp ne i32 %a, 0
7816 ///     br i1 %0, label %TrueBB, label %bb2
7817 ///   bb2:
7818 ///     %1 = icmp ne i32 %b, 0
7819 ///     br i1 %1, label %TrueBB, label %FalseBB
7820 /// \endcode
7821 /// This usually allows instruction selection to do even further optimizations
7822 /// and combine the compare with the branch instruction. Currently this is
7823 /// applied for targets which have "cheap" jump instructions.
7824 ///
7825 /// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
7826 ///
7827 bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
7828   if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
7829     return false;
7830 
7831   bool MadeChange = false;
7832   for (auto &BB : F) {
7833     // Does this BB end with the following?
7834     //   %cond1 = icmp|fcmp|binary instruction ...
7835     //   %cond2 = icmp|fcmp|binary instruction ...
7836     //   %cond.or = or|and i1 %cond1, %cond2
7837     //   br i1 %cond.or, label %dest1, label %dest2
7838     BinaryOperator *LogicOp;
7839     BasicBlock *TBB, *FBB;
7840     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
7841       continue;
7842 
7843     auto *Br1 = cast<BranchInst>(BB.getTerminator());
7844     if (Br1->getMetadata(LLVMContext::MD_unpredictable))
7845       continue;
7846 
7847     // The merging of mostly empty BB can cause a degenerate branch.
7848     if (TBB == FBB)
7849       continue;
7850 
7851     unsigned Opc;
7852     Value *Cond1, *Cond2;
7853     if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
7854                              m_OneUse(m_Value(Cond2)))))
7855       Opc = Instruction::And;
7856     else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
7857                                  m_OneUse(m_Value(Cond2)))))
7858       Opc = Instruction::Or;
7859     else
7860       continue;
7861 
7862     if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
7863         !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp()))   )
7864       continue;
7865 
7866     LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
7867 
7868     // Create a new BB.
7869     auto *TmpBB =
7870         BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
7871                            BB.getParent(), BB.getNextNode());
7872 
7873     // Update original basic block by using the first condition directly by the
7874     // branch instruction and removing the no longer needed and/or instruction.
7875     Br1->setCondition(Cond1);
7876     LogicOp->eraseFromParent();
7877 
7878     // Depending on the condition we have to either replace the true or the
7879     // false successor of the original branch instruction.
7880     if (Opc == Instruction::And)
7881       Br1->setSuccessor(0, TmpBB);
7882     else
7883       Br1->setSuccessor(1, TmpBB);
7884 
7885     // Fill in the new basic block.
7886     auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
7887     if (auto *I = dyn_cast<Instruction>(Cond2)) {
7888       I->removeFromParent();
7889       I->insertBefore(Br2);
7890     }
7891 
7892     // Update PHI nodes in both successors. The original BB needs to be
7893     // replaced in one successor's PHI nodes, because the branch now comes from
7894     // the newly generated BB (TmpBB). In the other successor we need to add one
7895     // incoming edge to the PHI nodes, because both branch instructions target
7896     // now the same successor. Depending on the original branch condition
7897     // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
7898     // we perform the correct update for the PHI nodes.
7899     // This doesn't change the successor order of the just created branch
7900     // instruction (or any other instruction).
7901     if (Opc == Instruction::Or)
7902       std::swap(TBB, FBB);
7903 
7904     // Replace the old BB with the new BB.
7905     TBB->replacePhiUsesWith(&BB, TmpBB);
7906 
7907     // Add another incoming edge from the new BB.
7908     for (PHINode &PN : FBB->phis()) {
7909       auto *Val = PN.getIncomingValueForBlock(&BB);
7910       PN.addIncoming(Val, TmpBB);
7911     }
7912 
7913     // Update the branch weights (from SelectionDAGBuilder::
7914     // FindMergedConditions).
7915     if (Opc == Instruction::Or) {
7916       // Codegen X | Y as:
7917       // BB1:
7918       //   jmp_if_X TBB
7919       //   jmp TmpBB
7920       // TmpBB:
7921       //   jmp_if_Y TBB
7922       //   jmp FBB
7923       //
7924 
7925       // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
7926       // The requirement is that
7927       //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
7928       //     = TrueProb for original BB.
7929       // Assuming the original weights are A and B, one choice is to set BB1's
7930       // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
7931       // assumes that
7932       //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
7933       // Another choice is to assume TrueProb for BB1 equals to TrueProb for
7934       // TmpBB, but the math is more complicated.
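      // For example (illustrative), with original weights A = 3 (true) and
      // B = 1 (false): BB1 gets {3, 5} and TmpBB gets {3, 2}, so
      //   TrueProb(BB1) + FalseProb(BB1) * TrueProb(TmpBB)
      //     = 3/8 + (5/8)*(3/5) = 3/4,
      // which matches the original TrueProb of 3/4.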
7935       uint64_t TrueWeight, FalseWeight;
7936       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
7937         uint64_t NewTrueWeight = TrueWeight;
7938         uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
7939         scaleWeights(NewTrueWeight, NewFalseWeight);
7940         Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
7941                          .createBranchWeights(NewTrueWeight, NewFalseWeight));
7942 
7943         NewTrueWeight = TrueWeight;
7944         NewFalseWeight = 2 * FalseWeight;
7945         scaleWeights(NewTrueWeight, NewFalseWeight);
7946         Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
7947                          .createBranchWeights(NewTrueWeight, NewFalseWeight));
7948       }
7949     } else {
7950       // Codegen X & Y as:
7951       // BB1:
7952       //   jmp_if_X TmpBB
7953       //   jmp FBB
7954       // TmpBB:
7955       //   jmp_if_Y TBB
7956       //   jmp FBB
7957       //
7958       //  This requires creation of TmpBB after CurBB.
7959 
7960       // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
7961       // The requirement is that
7962       //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
7963       //     = FalseProb for original BB.
7964       // Assuming the original weights are A and B, one choice is to set BB1's
7965       // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
7966       // assumes that
7967       //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
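      // For example (illustrative), with A = 3 and B = 1: BB1 gets {7, 1} and
      // TmpBB gets {6, 1}, so
      //   FalseProb(BB1) + TrueProb(BB1) * FalseProb(TmpBB)
      //     = 1/8 + (7/8)*(1/7) = 1/4,
      // which matches the original FalseProb of 1/4.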
7968       uint64_t TrueWeight, FalseWeight;
7969       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
7970         uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
7971         uint64_t NewFalseWeight = FalseWeight;
7972         scaleWeights(NewTrueWeight, NewFalseWeight);
7973         Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
7974                          .createBranchWeights(NewTrueWeight, NewFalseWeight));
7975 
7976         NewTrueWeight = 2 * TrueWeight;
7977         NewFalseWeight = FalseWeight;
7978         scaleWeights(NewTrueWeight, NewFalseWeight);
7979         Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
7980                          .createBranchWeights(NewTrueWeight, NewFalseWeight));
7981       }
7982     }
7983 
7984     ModifiedDT = true;
7985     MadeChange = true;
7986 
7987     LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
7988                TmpBB->dump());
7989   }
7990   return MadeChange;
7991 }
7992